# Main Notebook

In [1]:
import re
import os
import spacy
nlp = spacy.load("en_core_web_sm")

## Parse Data

### coca-samples-text

In [2]:
def parse_articles_text(file_text): 
    article_texts = file_text.split("\n")

    articles = []
    for article_text in article_texts: 
        if len(article_text) == 0: 
            continue
        
        pattern = r'@@\d+ '
        rv = re.findall(pattern, article_text[:20])
        if len(rv) == 0: 
            continue
        article_number = int(rv[0][2:-1])
        print(f"article_number: {article_number}")
        article_doc = nlp(article_text)
        article = {"number" : article_number, "doc" : article_doc}
        
        articles.append(article)
    
    if len(articles) == 0: 
        return None
    return articles

In [3]:
def parse_filename_text(directory, filename): 
    print(f"filename: {filename}")
    file = open(directory + filename, "r", encoding="ISO-8859-1")
    file_text = file.read()
    file.close()
    
    articles = parse_articles_text(file_text)
    if articles == None: 
        return None
    file = {"filename" : filename, "articles" : articles}
    return file

In [4]:
def get_files_text(directory): 
    files = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        
        file = parse_filename_text(directory, filename)
        if file == None: 
            continue
        files.append(file)
    return files

## Build Data

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
%%time
text_directory = "data/coca-samples-text/"
# text_files = get_files_text(text_directory)

# THIS WAS TAKING TOO LONG SO I AM ONLY DOING SPOK AND TVM
# ['text_news.txt', 'text_fic.txt', 'text_web.txt', 'text_spok.txt', 'text_tvm.txt', 
# 'text_blog.txt', 'text_acad.txt', 'text_mag.txt']
text_files = [parse_filename_text(text_directory, 'text_spok.txt'), parse_filename_text(text_directory, 'text_tvm.txt')]

filename: text_spok.txt
article_number: 17141
article_number: 21741
article_number: 207541
article_number: 207641
article_number: 220641
article_number: 220741
article_number: 221141
article_number: 221241
article_number: 221341
article_number: 221441
article_number: 221541
article_number: 221641
article_number: 221741
article_number: 222141
article_number: 222241
article_number: 222341
article_number: 222441
article_number: 222541
article_number: 222741
article_number: 222841
article_number: 222941
article_number: 223041
article_number: 223141
article_number: 223241
article_number: 223341
article_number: 223441
article_number: 223541
article_number: 223941
article_number: 224241
article_number: 224341
article_number: 224541
article_number: 224641
article_number: 224741
article_number: 224841
article_number: 224941
article_number: 225041
article_number: 225141
article_number: 225241
article_number: 225341
article_number: 225441
article_number: 225541
article_number: 225641
article_numb

article_number: 5225341
article_number: 5225441
article_number: 5225841
article_number: 5226041
article_number: 5226141
article_number: 5226241
article_number: 5226341
article_number: 5226441
article_number: 5226541
article_number: 5226641
article_number: 5226741
article_number: 5227241
article_number: 5227341
article_number: 5227441
article_number: 5227541
article_number: 5227941
article_number: 5228041
article_number: 5228141
article_number: 5228241
article_number: 5229041
article_number: 5229141
article_number: 5229241
article_number: 5229441
article_number: 5229641
article_number: 5230041
article_number: 5230141
article_number: 5230341
article_number: 5230441
article_number: 5231141
article_number: 5231241
article_number: 5231341
article_number: 5231541
article_number: 5231641
article_number: 5231741
article_number: 5232141
article_number: 5232241
article_number: 5232341
article_number: 5232541
article_number: 5232641
article_number: 5233041
article_number: 5233141
article_number: 

In [7]:
def build_data(label1, label2): 
    text_file1 = [file for file in text_files if label1 in file["filename"]][0]
    text_file2 = [file for file in text_files if label2 in file["filename"]][0]
    
    X, Y = [], []
    for article in text_file1["articles"]: 
        X.append(article)
        Y.append(label1)
    
    for article in text_file2["articles"]: 
        X.append(article)
        Y.append(label2)
    return (X, Y)

In [8]:
X, Y = build_data("spok", "tvm")
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=0)

## Build Classifier

In [9]:
import json
import nltk
from sklearn import linear_model
from scipy import sparse
from collections import Counter
import operator

In [10]:
def print_weights(clf, vocab, n=10):
    weights=clf.coef_[0]
    reverse_vocab=[None]*len(weights)
    for k in vocab:
        reverse_vocab[vocab[k]]=k

    print(f"Class 1: {clf.classes_[0]}")
    for feature, weight in sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1))[:n]:
        print("%.3f\t%s" % (weight, feature))

    print()

    print(f"Class 2: {clf.classes_[1]}")
    for feature, weight in list(reversed(sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1))))[:n]:
        print("%.3f\t%s" % (weight, feature))

In [11]:
def build_features(dataX, feature_functions):
    
    """ This function featurizes the data according to the list of parameter feature_functions """
    
    data=[]
    for tokens in dataX:
        feats={}
        
        for function in feature_functions:
            feats.update(function(tokens))

        data.append(feats)
    return data

In [12]:
def features_to_ids(data, feature_vocab):
    
    """ 
    
    This helper function converts a dictionary of feature names to a sparse representation
 that we can fit in a scikit-learn model.  This is important because almost all feature 
 values will be 0 for most documents (note: why?), and we don't want to save them all in 
 memory.

    """
    new_data=sparse.lil_matrix((len(data), len(feature_vocab)))
    for idx,doc in enumerate(data):
        for f in doc:
            if f in feature_vocab:
                new_data[idx,feature_vocab[f]]=doc[f]
    return new_data

In [13]:
def create_vocab(data, top_n=None):
    
    """ 
    
    This helper function converts a dictionary of feature names to unique numerical ids. 
    top_n limits the features to only the n most frequent features observed in the training data 
    (in terms of the number of documents that contains it).
    
    """
    
    counts=Counter()
    for doc in data:
        for feat in doc:
            counts[feat]+=1

    feature_vocab={}

    for idx, (k, v) in enumerate(counts.most_common(top_n)):
        feature_vocab[k]=idx
                
    return feature_vocab

In [14]:
def pipeline(trainX, devX, trainY, devY, feature_functions):

    """ This function evaluates a list of feature functions on the training/dev data arguments """
    
    trainX_feat=build_features(trainX, feature_functions)
    devX_feat=build_features(devX, feature_functions)

    # just create vocabulary from features in *training* data.
    feature_vocab=create_vocab(trainX_feat, top_n=100000)

    trainX_ids=features_to_ids(trainX_feat, feature_vocab)
    devX_ids=features_to_ids(devX_feat, feature_vocab)
    
    clf = linear_model.LogisticRegression(C=100, solver='lbfgs', penalty='l2', max_iter=10000)
    clf.fit(trainX_ids, trainY)
    print("Accuracy: %.3f" % clf.score(devX_ids, devY))
    
    return clf, feature_vocab

## Build Features

In [15]:
def bag_of_words(article):
    doc = article["doc"]
    
    feats = {}
    for token in doc: 
        feat_name = f"token_{token.text}"
        feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

In [16]:
def sentence_length(article): 
    doc = article["doc"]
    
    feats = {}
    for sentence in list(doc.sents): 
        num_tokens = len(list(doc.sents))
        feat_name = f"sentence_length_{num_tokens}"
        feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

In [17]:
def parts_of_speech(article): 
    doc = article["doc"]
    
    feats = {}
    for token in doc: 
        feat_name = f"POS_{token.pos_}"
        feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

In [18]:
hedging_file = open("data/hedging_data.txt")
hedging_text = hedging_file.read()
hedging_file.close()
hedging_lst = [h for h in hedging_text.split("\n") if ("%" not in h and len(h) > 2)]
print(hedging_lst)

['largely', 'generally', 'often', 'rarely', 'sometimes', 'frequently', 'occasionally', 'seldom', 'usually', 'most', 'several', 'some', 'almost', 'practically', 'apparently', 'virtually', 'basically', 'approximately', 'roughly', 'somewhat', 'somehow', 'partially', 'actually', 'like', 'something', 'someone', 'somebody', 'somewhere', 'think', 'thinks', 'thought', 'believe', 'believed', 'believes', 'consider', 'considers', 'considered', 'assume', 'assumes', 'assumed', 'understand', 'understands', 'understood', 'find', 'found', 'finds', 'appear', 'appears', 'appeared', 'seem', 'seems', 'seemed', 'suppose', 'supposes', 'supposed', 'guess', 'guesses', 'guessed', 'estimate', 'estimates', 'estimated', 'speculate', 'speculates', 'speculated', 'suggest', 'suggests', 'suggested', 'may', 'could', 'should', 'might', 'surely', 'probably', 'likely', 'maybe', 'perhaps', 'unsure', 'probable', 'unlikely', 'possibly', 'possible', 'read', 'say', 'says', 'looks like', 'look like', "don't know", 'necessarily

In [19]:
def hedging_feature(article): 
    doc = article["doc"]
    
    feats = {}
    for sentence in list(doc.sents): 
        for hedge in hedging_lst: 
            if hedge in sentence.string: 
                feat_name = f"hedge_{hedge}"
                feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

## Put It All Together

In [22]:
features = [bag_of_words, sentence_length, parts_of_speech, hedging_feature]

In [23]:
%%time
clf, vocab = pipeline(trainX, testX, trainY, testY, features)

Accuracy: 0.950
CPU times: user 14min 1s, sys: 9.51 s, total: 14min 10s
Wall time: 14min 7s


In [24]:
print_weights(clf, vocab, n=20)

Class 1: spok
-0.127	token_:
-0.059	sentence_length_241
-0.049	POS_ADP
-0.049	token_--
-0.044	POS_AUX
-0.043	sentence_length_185
-0.040	sentence_length_105
-0.039	token_to
-0.038	sentence_length_181
-0.038	token_,
-0.037	token_#
-0.036	sentence_length_126
-0.035	sentence_length_64
-0.034	POS_CCONJ
-0.034	sentence_length_149
-0.033	POS_PROPN
-0.027	sentence_length_114
-0.026	token_-
-0.025	POS_X
-0.025	token_and

Class 2: tvm
0.105	sentence_length_935
0.099	sentence_length_66
0.079	sentence_length_98
0.067	token_?
0.055	token_...
0.053	token_!
0.047	POS_VERB
0.045	token_.
0.042	POS_PRON
0.039	sentence_length_718
0.038	sentence_length_273
0.031	token_(
0.031	sentence_length_272
0.028	sentence_length_239
0.028	token_I
0.026	sentence_length_340
0.024	sentence_length_701
0.023	sentence_length_559
0.023	token_me
0.022	sentence_length_521


## SAVE & LOAD MODEL

In [41]:
import pickle
from datetime import datetime

# save models
dt_string = datetime.now().strftime("%d_%m_%Y %H_%M_%S")
clf_name = f"models/clf {dt_string}.sav"
vocab_name = f"models/vocab {dt_string}.sav"
pickle.dump(clf, open(clf_name, 'wb'))
pickle.dump(vocab, open(vocab_name, 'wb'))

# load models
loaded_clf = pickle.load(open(clf_name, 'rb'))
loaded_vocab = pickle.load(open(vocab_name, 'rb'))

## Parse Litbank Data

In [25]:
litbank_dir = "../litbank/original/"

In [26]:
def parse_text_litbank(text, number=None, filename=None): 
    doc = nlp(text)
    article = {"doc" : doc}
    if number != None: 
        article["number"] = number
    if filename != None: 
        article["filename"] = filename
    return article

In [45]:
def build_litbank_data(): 
    X = []
    number = 0
    for filename in os.listdir(litbank_dir):
        if not filename.endswith(".txt"): 
            continue
        print(f"filename: {filename}")
        full_filename = os.path.join(litbank_dir, filename)
        file = open(full_filename)
        text = file.read()
        file.close()
        # X.append(parse_text_litbank(number, text))
        
        # chunks = [text[i:i + nlp.max_length] for i in range(0, len(text), nlp.max_length)]
        # for chunk in chunks: 
            # X.append(parse_text_litbank(chunk, number=number, filename=filename))
            # number += 1
        
        text = text[:nlp.max_length]
        X.append(parse_text_litbank(text, number=number, filename=filename))
        number += 1
    
    return X

In [None]:
%%time
X_litbank = build_litbank_data()

filename: 730_oliver_twist.txt
filename: 76_adventures_of_huckleberry_finn.txt
filename: 74_the_adventures_of_tom_sawyer.txt
filename: 766_david_copperfield.txt
filename: 345_dracula.txt
filename: 105_persuasion.txt
filename: 18581_adrift_in_new_york_tom_and_florence_braving_the_world.txt
filename: 45_anne_of_green_gables.txt
filename: 3268_the_mysteries_of_udolpho.txt
filename: 6593_history_of_tom_jones_a_foundling.txt
filename: 1206_the_flying_u_ranch.txt
filename: 969_the_tenant_of_wildfell_hall.txt
filename: 5348_ragged_dick_or_street_life_in_new_york_with_the_bootblacks.txt
filename: 84_frankenstein_or_the_modern_prometheus.txt
filename: 711_allan_quatermain.txt
filename: 351_of_human_bondage.txt
filename: 215_the_call_of_the_wild.txt
filename: 1327_elizabeth_and_her_german_garden.txt
filename: 78_tarzan_of_the_apes.txt
filename: 60_the_scarlet_pimpernel.txt
filename: 36_the_war_of_the_worlds.txt
filename: 599_vanity_fair.txt
filename: 2852_the_hound_of_the_baskervilles.txt
filena

## Predict on Litbank

In [None]:
def predict_data(X): 
    X_feat = build_features(X, features)
    X_ids=features_to_ids(X_feat, vocab)
    return (clf.predict(X_ids), clf.predict_proba(X_ids))

In [None]:
(classes, probas) = predict_data(X_litbank)
print(f"classes: {classes}")
print(f"probas: {probas}")

## SAVE & LOAD DATA

In [None]:
import pickle
from datetime import datetime

dt_string = datetime.now().strftime("%d_%m_%Y %H_%M_%S")

# save COCA data
trainX_name = f"models/trainX {dt_string}.sav"
pickle.dump(trainX, open(trainX_name, 'wb'))

testX_name = f"models/testX {dt_string}.sav"
pickle.dump(testX, open(testX_name, 'wb'))

trainY_name = f"models/trainY {dt_string}.sav"
pickle.dump(trainY, open(trainY_name, 'wb'))

testY_name = f"models/testY {dt_string}.sav"
pickle.dump(testY, open(testY_name, 'wb'))

# save litbank data
X_litbank_name = f"models/X_litbank {dt_string}.sav"
pickle.dump(X_litbank, open(X_litbank_name, 'wb'))

# load COCA data
loaded_trainX = pickle.load(open(trainX_name, 'rb'))
loaded_testX = pickle.load(open(testX_name, 'rb'))
loaded_trainY = pickle.load(open(trainY_name, 'rb'))
loaded_testY = pickle.load(open(testY_name, 'rb'))

#load litbank data
loaded_X_litbank = pickle.load(open(X_litbank_name, 'rb'))