# Main Notebook

In [1]:
import re
import os

## Parse Data

### coca-samples-wlp

In [2]:
class Article_WLP: 
    def __init__(self, number, w, l, p): 
        self.number = number
        self.w = w
        self.l = l
        self.p = p
    
    def __str__(self): 
        return str({self.number : {"w" : self.w[:10], "l" : self.l[:10], "p" : self.p[:10]}}) + "/n"

class File_WLP: 
    def __init__(self, filename, articles):
        self.filename = filename
        self.articles = articles
    
    def __str__(self): 
        return str({self.filename : [str(a) for a in self.articles[:3]]})

In [3]:
def get_wlp(line): 
    temp = line.split('\t')
    if len(temp) != 4:
        return None
    return tuple(temp[1:])

In [4]:
def parse_article_wlp(text): 
    lines = text.split("\n")
    number = None
    for line in lines: 
        numbers = re.findall(r'\d+', line)
        if len(numbers) > 0: 
            number = int(numbers[0])
            break
    
    if number == None: 
        return None
    
    w = []
    l = []
    p = []
    for line in lines: 
        args = get_wlp(line)
        if args == None: 
            continue
        w.append(args[0])
        l.append(args[1])
        p.append(args[2])
    
    return Article_WLP(number, w, l, p)        

In [5]:
def parse_filename_wlp(directory, filename): 
    file = open(directory + filename, "r", encoding="ISO-8859-1")
    file_text = file.read()
    file.close()

    pattern = r'\d+\t@@\d+\t\t'
    article_texts = re.split(pattern, file_text)
    
    articles = []
    for text in article_texts: 
        article = parse_article_wlp(text)
        if article == None: 
            continue
        articles.append(article)
    
    if len(articles) == 0: 
        return None    
    return File_WLP(filename, articles)

In [6]:
def get_files_wlp(directory): 
    files = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        file = parse_filename_wlp(directory, filename)
        if file == None: 
            continue
        files.append(file)
    return files

### coca-samples-text

In [7]:
class Article_Text: 
    def __init__(self, number, text, sentence_texts): 
        self.number = number
        self.text = text
        self.sentence_texts = sentence_texts
    
    def __str__(self):
        return str({self.number : {"text" : self.text[:50], "sentences" : [s[:10] for s in self.sentence_texts[:3]]}})

class File_Text: 
    def __init__(self, filename, articles): 
        self.filename = filename
        self.articles = articles
    
    def __str__(self): 
        return str({self.filename : [str(a) for a in self.articles[:3]]})

In [8]:
def parse_articles_text(file_text): 
    article_texts = file_text.split("\n")

    articles = []
    for article_text in article_texts: 
        if len(article_text) == 0: 
            continue
        
        pattern = r'@@\d+ '
        rv = re.findall(pattern, article_text[:20])
        if len(rv) == 0: 
            continue
        article_number = int(rv[0][2:-1])

        pattern = r" [\.|\?|\!] "
        sentence_texts = re.split(pattern, article_text)[1:]
        if len(sentence_texts) == 0: 
            continue
        
        article = Article_Text(article_number, article_text, sentence_texts)
        articles.append(article)
    
    if len(articles) == 0: 
        return None
    return articles

In [9]:
def parse_filename_text(directory, filename): 
    file = open(directory + filename, "r", encoding="ISO-8859-1")
    file_text = file.read()
    file.close()
    
    articles = parse_articles_text(file_text)
    if articles == None: 
        return None
    file = File_Text(filename, articles)
    return file

In [10]:
def get_files_text(directory): 
    files = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        
        file = parse_filename_text(directory, filename)
        if file == None: 
            continue
        files.append(file)
    return files

## Build Data

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
wlp_directory = "data/coca-samples-wlp (1)/"
wlp_files = get_files_wlp(wlp_directory)

text_directory = "data/coca-samples-text/"
text_files = get_files_text(text_directory)

In [13]:
def build_data(label1, label2): 
    wlp_file1 = [file for file in wlp_files if label1 in file.filename][0]
    wlp_file2 = [file for file in wlp_files if label2 in file.filename][0]
    
    text_file1 = [file for file in text_files if label1 in file.filename][0]
    text_file2 = [file for file in text_files if label2 in file.filename][0]
    
    X, Y = [], []
    for wlp_a in wlp_file1.articles: 
        text_a = [a for a in text_file1.articles if a.number == wlp_a.number]
        if len(text_a) == 0: 
            continue
        else: 
            text_a = text_a[0]
        X.append((wlp_a, text_a))
        Y.append(label1)
    
    for wlp_a in wlp_file2.articles: 
        text_a = [a for a in text_file2.articles if a.number == wlp_a.number]
        if len(text_a) == 0: 
            continue
        else: 
            text_a = text_a[0]
        X.append((wlp_a, text_a))
        Y.append(label2)
    return (X, Y)

In [14]:
X, Y = build_data("spok", "tvm")
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=0)

## Build Classifier

In [15]:
import json
import nltk
from sklearn import linear_model
from scipy import sparse
from collections import Counter
import operator

In [16]:
def print_weights(clf, vocab, n=10):
    weights=clf.coef_[0]
    reverse_vocab=[None]*len(weights)
    for k in vocab:
        reverse_vocab[vocab[k]]=k

    print(f"Class 1: {clf.classes_[0]}")
    for feature, weight in sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1))[:n]:
        print("%.3f\t%s" % (weight, feature))

    print()

    print(f"Class 2: {clf.classes_[1]}")
    for feature, weight in list(reversed(sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1))))[:n]:
        print("%.3f\t%s" % (weight, feature))

In [17]:
def build_features(dataX, feature_functions):
    
    """ This function featurizes the data according to the list of parameter feature_functions """
    
    data=[]
    for tokens in dataX:
        feats={}
        
        for function in feature_functions:
            feats.update(function(tokens))

        data.append(feats)
    return data

In [18]:
def features_to_ids(data, feature_vocab):
    
    """ 
    
    This helper function converts a dictionary of feature names to a sparse representation
 that we can fit in a scikit-learn model.  This is important because almost all feature 
 values will be 0 for most documents (note: why?), and we don't want to save them all in 
 memory.

    """
    new_data=sparse.lil_matrix((len(data), len(feature_vocab)))
    for idx,doc in enumerate(data):
        for f in doc:
            if f in feature_vocab:
                new_data[idx,feature_vocab[f]]=doc[f]
    return new_data

In [19]:
def create_vocab(data, top_n=None):
    
    """ 
    
    This helper function converts a dictionary of feature names to unique numerical ids. 
    top_n limits the features to only the n most frequent features observed in the training data 
    (in terms of the number of documents that contains it).
    
    """
    
    counts=Counter()
    for doc in data:
        for feat in doc:
            counts[feat]+=1

    feature_vocab={}

    for idx, (k, v) in enumerate(counts.most_common(top_n)):
        feature_vocab[k]=idx
                
    return feature_vocab

In [20]:
def pipeline(trainX, devX, trainY, devY, feature_functions):

    """ This function evaluates a list of feature functions on the training/dev data arguments """
    
    trainX_feat=build_features(trainX, feature_functions)
    devX_feat=build_features(devX, feature_functions)

    # just create vocabulary from features in *training* data.
    feature_vocab=create_vocab(trainX_feat, top_n=100000)

    trainX_ids=features_to_ids(trainX_feat, feature_vocab)
    devX_ids=features_to_ids(devX_feat, feature_vocab)
    
    clf = linear_model.LogisticRegression(C=100, solver='lbfgs', penalty='l2', max_iter=10000)
    clf.fit(trainX_ids, trainY)
    print("Accuracy: %.3f" % clf.score(devX_ids, devY))
    
    return clf, feature_vocab

## Build Features

In [21]:
def bag_of_words(article_pair):
    article_wlp = article_pair[0]
    article_text = article_pair[1]
    
    feats = {}
    for token in article_wlp.l: 
        feat_name = f"token_{token}"
        feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

In [22]:
def sentence_length(article_pair): 
    article_wlp = article_pair[0]
    article_text = article_pair[1]
    
    feats = {}
    for sentence in article_text.sentence_texts: 
        num_tokens = len(sentence.split(" "))
        feat_name = f"sentence_length_{num_tokens}"
        feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

In [23]:
def parts_of_speech(article_pair): 
    article_wlp = article_pair[0]
    article_text = article_pair[1]
    
    feats = {}
    for pos in article_wlp.p: 
        feat_name = f"POS_{pos}"
        feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

In [24]:
hedging_file = open("data/hedging_data.txt")
hedging_text = hedging_file.read()
hedging_file.close()
hedging_lst = [h for h in hedging_text.split("\n") if ("%" not in h and len(h) > 2)]
print(hedging_lst)

['largely', 'generally', 'often', 'rarely', 'sometimes', 'frequently', 'occasionally', 'seldom', 'usually', 'most', 'several', 'some', 'almost', 'practically', 'apparently', 'virtually', 'basically', 'approximately', 'roughly', 'somewhat', 'somehow', 'partially', 'actually', 'like', 'something', 'someone', 'somebody', 'somewhere', 'think', 'thinks', 'thought', 'believe', 'believed', 'believes', 'consider', 'considers', 'considered', 'assume', 'assumes', 'assumed', 'understand', 'understands', 'understood', 'find', 'found', 'finds', 'appear', 'appears', 'appeared', 'seem', 'seems', 'seemed', 'suppose', 'supposes', 'supposed', 'guess', 'guesses', 'guessed', 'estimate', 'estimates', 'estimated', 'speculate', 'speculates', 'speculated', 'suggest', 'suggests', 'suggested', 'may', 'could', 'should', 'might', 'surely', 'probably', 'likely', 'maybe', 'perhaps', 'unsure', 'probable', 'unlikely', 'possibly', 'possible', 'read', 'say', 'says', 'looks like', 'look like', "don't know", 'necessarily

In [25]:
def hedging_feature(article_pair): 
    article_wlp = article_pair[0]
    article_text = article_pair[1]
    
    feats = {}
    for sentence in article_text.sentence_texts: 
        for hedge in hedging_lst: 
            if hedge in sentence: 
                feat_name = f"hedge_{hedge}"
                feats[feat_name] = feats.get(feat_name, 0) + 1
    return feats

## Put It All Together

In [26]:
features = [bag_of_words, sentence_length, parts_of_speech, hedging_feature]

In [27]:
clf, vocab = pipeline(trainX, testX, trainY, testY, features)

Accuracy: 0.980


In [28]:
print_weights(clf, vocab, n=20)

Class 1: spok
-0.133	POS_FO
-0.099	token_:
-0.098	POS_fo
-0.091	POS_:
-0.070	POS_y
-0.061	token_,
-0.049	POS_vbz
-0.042	token_a
-0.038	token_to
-0.035	POS_,
-0.033	POS_ii
-0.032	POS_at1
-0.030	POS_jj
-0.029	POS_vv0
-0.027	POS_nn1
-0.027	token_so
-0.026	token_on
-0.026	token_yale
-0.025	token_this
-0.024	token_his

Class 2: tvm
0.104	POS_@1
0.070	POS_vvi
0.070	POS_ppis1
0.069	sentence_length_8
0.068	sentence_length_3
0.067	sentence_length_1
0.061	sentence_length_2
0.057	POS_!
0.057	token_!
0.057	sentence_length_9
0.057	token_i
0.053	POS_?
0.051	sentence_length_6
0.050	POS_vvd
0.048	token_?
0.048	POS_"
0.046	token_"
0.040	token_...
0.040	POS_...
0.040	POS_ppio1


## Test to Gutenberg

In [30]:
guten_filename = "../comphumF20/data/fiction.6M.txt"
file = open(guten_filename)
guten_text = file.read()
file.close()

In [31]:
def build_guten_data(text): 
    X = []
    lines = text.split("\n")
    for i in range(len(lines)): 
        line = lines[i]

        # Article_Text
        sentences = nltk.tokenize.sent_tokenize(line)
        a_text = Article_Text(i, line, sentences)

        # Article_WLP
        lexicon = nltk.word_tokenize(line)
        pairs = nltk.pos_tag(lexicon)
        l = [pair[0] for pair in pairs]
        p = [pair[1] for pair in pairs]
        a_wlp = Article_WLP(i, [], l, p)
        
        X.append((a_wlp, a_text))
    return X

In [32]:
%time
guten_X = build_guten_data(guten_text)

CPU times: user 9 µs, sys: 1e+03 ns, total: 10 µs
Wall time: 17.6 µs


In [38]:
clf.predict(trainX)

TypeError: float() argument must be a string or a number, not 'Article_WLP'