In [5]:
# import packages

import pandas as pd
from nltk import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Clean the data

In [2]:
# read in training data

train = pd.read_csv("data/share_repurchase_paragraphs.csv")

In [6]:
def relevant_sentence_from_text(df):
    """Use previously identified information pulled from paragraph text to identify 
    relevant sentences for each entry in training dataset"""
    
    # identify and isolate relevant and irrelevant sentences from paragraph text
    relevant_sentence = list()
    irrelevant_sentences = list()
    
    # iterate over each row in df
    for ind, para in enumerate(df.paragraph_text):
        
        # assign components to search for to variables
        row_text = df.text[ind] # pull entire value from text column
        
        # reported data value is variable. 
        # value in column is always float but in text can be int, float or absent (mainly for date)
        int_rdv = int(df.reported_data_value[ind])
        if (df.reported_data_value[ind] == int_rdv) & (len(str(int_rdv)) <= 3): # if text value is int
            row_data_val = str(int_rdv)
        elif (df.reported_data_value[ind] != int_rdv) & (len(str(int_rdv)) <= 3): # if text value is float
            row_data_val = str(df.reported_data_value[ind])[:-1]
        else: # if reported data value is spurrious
            row_data_val = ""
        
        # get and clean reported units value
        if df.reported_units[ind] != "ones": 
            row_unit = df.reported_units[ind][:-1] # values are plural in column but singular in text so remove "s"
        else: # ones doesn't exist so ignore that value
            row_unit = ""
        
        # tokenize each sentence in paragraph
        para_sent_tokens = sent_tokenize(para)
        
        rel_sents = list()
        irrel_sents = list()
        
        # separate sentences into relevant and irrelevant for each row
        for sent in para_sent_tokens:
            
            # all the components are present in sentence, it is the relevant sentence
            if (row_text in sent) & (("$"+row_data_val in sent)|(" "+row_data_val+" " in sent)) & (row_unit in sent):
                rel_sents.append(sent)
            
            # if not all present then it is irrelevant.
            #This may miss some info but consequence is getting too many duplicates
            else:
                irrel_sents.append(sent)

        relevant_sentence.append(rel_sents)
        irrelevant_sentences.append(irrel_sents) # irrelevant sentences will be list of lists
    
    df["irrelevant_sentences"] = irrelevant_sentences # add irrelevant sentences
    
    # create new df so that each relevant sentence gets its own row, incase there are duplicates.
    # likely will not be an issue but just in case
    new_df = list()
    sents = list()
    for ind, sent_list in enumerate(relevant_sentence):
        for sent in sent_list:
            new_df.append(df.iloc[ind])
            sents.append(sent)
    
    df_sents = pd.DataFrame(new_df)
    df_sents["relevant_sentence"] = sents
    
    return(df_sents)

In [7]:
# run function on training data
train_split = relevant_sentence_from_text(train)

In [8]:
# Splic training data into a dataframe for each data_key_friendly_name
categories = train_split.data_key_friendly_name.unique()

sub_df_list = list(range(len(categories)))
for ind, category in enumerate(categories):
    sub_df_list[ind] = train_split[train_split["data_key_friendly_name"] == category]
    
[date, auth, intent, count, amount, util, unk] = sub_df_list

In [9]:
# reset the indexes, may not be necessary but just in case
for df in [date, auth, intent, count, amount, util, unk]:
    df.reset_index(drop = True, inplace = True)

# Logistic Regression

In [10]:
def category_df_for_lr(df):
    """Take in relevant and irrelevant sentences and create new dataframe to be used in logistic regression.
    Return dataframe"""
    relevant_sentences = df.relevant_sentence.tolist()
    
    irrelevant_sentences = list()
    for l in df.irrelevant_sentences:
        for s in l:
            irrelevant_sentences.append(s)
    
    relevant_df = pd.DataFrame(relevant_sentences, columns = ["sentences"])
    
    irrelevant_df = pd.DataFrame(irrelevant_sentences, columns = ["sentences"])
    
    # new column assigning if sentence is about category or not. 1 = yes, 0 = no
    
    relevant_df["in_category"] = 1
    
    irrelevant_df["in_category"] = 0
    
    df_lr = pd.concat([relevant_df, irrelevant_df])
    
    return(df_lr)

In [14]:
def train_lr_model(df, ngram_range = (1, 4), cm = True):
    """Prepare text with TfidfVectorizer, 
    Select top features based on chi2, 
    and fit model using LogisticRegressionCV.
    Return score and predictions"""
    
    # initialize TfidfVectorizer, specifying certain hyperparameters
    tfidf = TfidfVectorizer(sublinear_tf = True, # use log to scale tf
                           norm = "l2", # norm method
                           ngram_range = ngram_range, # n-gram range to test (default is unigram to fourgram)
                           stop_words = "english") # language to use when removing stopwords
    
    # select input variables from dataframe 
    X = df.sentences 
    y = df.in_category
    
    # fit and transform text column
    X_tfidf = tfidf.fit_transform(X)
    
    # split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.33, random_state=42)

    # select features to use for model
    ch2 = SelectKBest(chi2, k=100) # select top 100 features based on chi2
    
    # fit and transform training and test data using previously selected features
    X_train_chi2_selected = ch2.fit_transform(X_train, y_train)
    X_test_chi2_selected = ch2.transform(X_test)
    
    # initialize logistic regression
    lr = LogisticRegressionCV(cv = 5, penalty = "l2")
    
    # fit lr model to training data
    lr_model = lr.fit(X_train_chi2_selected, y_train)
    
    # use model to predict values for test data
    y_pred = lr_model.predict(X_test_chi2_selected)
    
    # score the model
    score = lr.score(X_test_chi2_selected, y_test)
    
    print("Score: ", score, "\n")
    
    if cm == True:
        # Count values for confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_test.tolist(), y_pred).ravel()
        
        data = {'no': [tn, fp], 'yes': [fn, tp]}
        cm = pd.DataFrame.from_dict(data, orient='index',
                                    columns=["no", "yes"])
        print(cm)
    
    #return(y_pred)

In [16]:
# test functions on one df

date_lr = category_df_for_lr(date)

train_lr_model(date_lr)

Score:  0.8215158924205379 

      no  yes
no   267   16
yes   57   69


In [18]:
# do for all

df_names = ["date", "auth", "intent", "count", "amount", "util", "unk"]

for name, df in zip(df_names, sub_df_list):
    df_lr = category_df_for_lr(df)
    print(name, "\n")
    train_lr_model(df_lr)
    print("\n", "\n")

date 

Score:  0.8215158924205379 

      no  yes
no   267   16
yes   57   69

 

auth 

Score:  0.7909090909090909 

      no  yes
no   142    7
yes   39   32

 

intent 

Score:  0.825 

     no  yes
no   48    7
yes   7   18

 

count 

Score:  0.8713692946058091 

      no  yes
no   142   13
yes   18   68

 

amount 



  chisq /= f_exp


Score:  0.7777777777777778 

     no  yes
no   20    5
yes   7   22

 

util 

Score:  0.7837837837837838 

     no  yes
no   25    0
yes   8    4

 

unk 

Score:  0.6153846153846154 

     no  yes
no   15    2
yes   8    1

 

