# Data Parsing COCA

## coca-samples-wlp

In [1]:
import re
import os

### Parsing Functions

In [2]:
class Article:
    """One parsed article: a numeric id plus three parallel token columns.

    Attributes:
        number: integer id of the article.
        w: list of strings from the first kept column of each token line
            (the word forms, judging by the sample output in this notebook).
        l: list of strings from the second kept column (lemmas, per the
            sample output).
        p: list of strings from the third kept column (part-of-speech tags,
            per the sample output).
    """

    def __init__(self, number, w, l, p):
        self.number = number
        self.w = w
        self.l = l
        self.p = p

    def __str__(self):
        """Return a one-line preview (first 10 entries of each column) ending in a newline."""
        preview = {"w": self.w[:10], "l": self.l[:10], "p": self.p[:10]}
        # The terminator is a real newline; the previous "/n" was a typo that
        # appended a literal slash and the letter n to every preview.
        return str({self.number: preview}) + "\n"

class File:
    """A parsed input file: its name and the list of articles found in it."""

    def __init__(self, filename, articles):
        self.articles = articles
        self.filename = filename

    def __str__(self):
        """Return a preview mapping the file name to the string form of its first 3 articles."""
        preview = list(map(str, self.articles[:3]))
        return str({self.filename: preview})

In [3]:
def get_wlp(line):
    """Split one tab-separated line into its (w, l, p) columns.

    Returns a 3-tuple made of the second, third and fourth fields when the
    line has exactly four tab-separated fields; returns None for any other
    field count.
    """
    fields = line.split('\t')
    # The first field is dropped; the remaining three are w, l, p.
    return tuple(fields[1:]) if len(fields) == 4 else None

In [4]:
def parse_article(text):
    """Build an Article from the raw text of one article.

    The article number is the first run of digits on the first line that
    contains any digit. Every line with exactly four tab-separated fields
    contributes one entry to each of the w, l and p columns (see get_wlp);
    all other lines are ignored.

    Returns None when no line of the text contains a digit.
    """
    lines = text.split("\n")

    # First digit run anywhere in the text, scanning line by line.
    matches = (re.search(r'\d+', line) for line in lines)
    first = next((m for m in matches if m is not None), None)
    if first is None:
        return None
    number = int(first.group())

    # Keep only the lines that get_wlp recognises as token rows.
    rows = [row for row in map(get_wlp, lines) if row is not None]
    w = [row[0] for row in rows]
    l = [row[1] for row in rows]
    p = [row[2] for row in rows]

    return Article(number, w, l, p)

In [5]:
def parse_filename(directory, filename):
    """Read one file and parse it into a File of Article objects.

    Args:
        directory: directory containing the file; a trailing path separator
            is optional.
        filename: name of the file inside ``directory``.

    Returns:
        A File holding every article that parse_article could build, or
        None when the file yields no article at all.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # os.path.join works with or without a trailing separator on `directory`.
    path = os.path.join(directory, filename)

    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding="ISO-8859-1") as handle:
        file_text = handle.read()

    # Articles are delimited by header text of the form
    # "<digits>\t@@<digits>\t\t"; the delimiter itself is discarded by split.
    pattern = r'\d+\t@@\d+\t\t'
    article_texts = re.split(pattern, file_text)

    articles = []
    for text in article_texts:
        article = parse_article(text)
        if article is None:
            # Chunk without any digits (e.g. empty text before the first header).
            continue
        articles.append(article)

    if len(articles) == 0:
        return None
    return File(filename, articles)

In [6]:
def get_files(directory):
    """Parse every ``.txt`` file in ``directory``.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        A list of File objects, ordered by file name. Files for which
        parse_filename returns None are left out.
    """
    files = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        file = parse_filename(directory, filename)
        if file is None:
            continue
        files.append(file)
    return files

### Format Classifier

In [7]:
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy import sparse
from collections import Counter
import operator

In [8]:
# Directory holding the wlp sample files. Set the COCA_WLP_DIR environment
# variable to point at your own copy instead of editing the notebook; the
# fallback is the original author's local path. Joining with "" guarantees a
# trailing separator, which code that concatenates directory + filename needs.
wlp_directory = os.path.join(
    os.environ.get("COCA_WLP_DIR", "/home/divya/Desktop/coca-samples-wlp (1)/"), ""
)

In [9]:
# Parse every .txt file under wlp_directory; files that yield no article are
# skipped by get_files, so the count below only covers successfully parsed files.
files = get_files(wlp_directory)
print(f"Number of files: {len(files)}")

Number of files: 8


In [10]:
# Pick the first parsed file whose name contains each marker. The [0] raises
# IndexError if no loaded file matches.
wlp_spok_file = [file for file in files if "wlp_spok" in file.filename][0]
wlp_tvm_file = [file for file in files if "wlp_tvm" in file.filename][0]
# Preview only: File.__str__ shows the first 3 articles, 10 tokens per column.
print(wlp_spok_file)

{'wlp_spok.txt': ["{17141: {'w': ['ERIC', '@!BURNS', ',', 'FOX', 'NEWS', 'HOST', ':', 'On', 'this', 'week'], 'l': ['eric', '', ',', 'fox', 'news', 'host', ':', 'on', 'this', 'week'], 'p': ['np1', 'zzq', 'y', 'np1_nn1', 'nn1', 'nn1_vv0', 'y', 'ii', 'dd1', 'nnt1']}}/n", "{21741: {'w': ['qwq', '@', '!', 'DOUGLAS-FORD-ARSO', ':', 'I', 'set', 'the', 'fire', 'at'], 'l': ['', '', '!', '', ':', 'i', 'set', 'the', 'fire', 'at'], 'p': ['xx', 'ii', 'y', 'np1', 'y', 'ppis1', 'vv0_vvd', 'at', 'nn1', 'ii']}}/n", "{207541: {'w': ['(', 'BEGIN', 'VIDEOTAPE', ')', 'HOWARD', 'KURTZ', ',', 'HOST', '(', 'voice-over'], 'l': ['(', 'begin', 'videotape', '', 'howard', 'kurtz', ',', 'host', '(', 'voice-over'], 'p': ['y', 'vv0', 'np1_nn1', 'np1', 'np1', 'np1', 'y', 'nn1_vv0', 'y', 'np']}}/n"]}


In [11]:
def build_data(file1, file2):
    """Turn two parsed files into a labelled dataset.

    Each article contributes its ``l`` column (the same list object, not a
    copy) as one example. Articles from ``file1`` get label 0 and articles
    from ``file2`` get label 1; file1's examples come first.

    Returns:
        A tuple ``(X, Y)`` of equal-length lists.
    """
    first = [article.l for article in file1.articles]
    second = [article.l for article in file2.articles]
    examples = first + second
    labels = [0] * len(first) + [1] * len(second)
    return (examples, labels)

In [12]:
# Label 0 = articles from wlp_spok_file, label 1 = articles from wlp_tvm_file
# (build_data assigns 0 to its first argument and 1 to its second).
X, Y = build_data(wlp_spok_file, wlp_tvm_file)
# Hold out 20% of the articles for evaluation; the fixed random_state keeps
# the split the same on every run.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=0)

### Build Classifier

In [13]:
def print_weights(clf, vocab, n=10):
    """Print the n lowest-weighted and n highest-weighted features.

    Reads the first row of ``clf.coef_`` and uses ``vocab`` (feature name ->
    column index) to label each weight. Output is the n smallest weights in
    ascending order, a blank line, then the n largest in descending order,
    one "weight<TAB>feature" pair per line.
    """
    weights = clf.coef_[0]

    # Invert the name -> column mapping into a positional list of names.
    names = [None] * len(weights)
    for name, column in vocab.items():
        names[column] = name

    # Rank once, ascending by weight; the descending view is the same ranking
    # read backwards, so ties come out in the same order as before.
    ranked = sorted(zip(names, weights), key=lambda pair: pair[1])

    for name, weight in ranked[:n]:
        print("%.3f\t%s" % (weight, name))

    print()

    for name, weight in ranked[::-1][:n]:
        print("%.3f\t%s" % (weight, name))

In [14]:
def build_features(dataX, feature_functions):
    """Featurize every example in dataX with the given feature functions.

    Each feature function takes one example and returns a dict of
    feature name -> value. The dicts are merged in list order, so a later
    function overwrites an earlier one on a name clash.

    Returns a list with one merged feature dict per example.
    """

    def featurize(tokens):
        merged = {}
        for extract in feature_functions:
            merged.update(extract(tokens))
        return merged

    return [featurize(tokens) for tokens in dataX]

In [15]:
def features_to_ids(data, feature_vocab):
    """Convert feature dicts into a sparse document-by-feature matrix.

    ``data`` is a list of feature dicts (name -> value) and ``feature_vocab``
    maps feature names to column indices. Features missing from the
    vocabulary are dropped. A sparse matrix is used because most features
    are absent from most documents, so storing the zeros would waste memory.

    Returns a scipy ``lil_matrix`` of shape (len(data), len(feature_vocab)).
    """
    matrix = sparse.lil_matrix((len(data), len(feature_vocab)))
    for row, doc in enumerate(data):
        for name, value in doc.items():
            if name not in feature_vocab:
                continue
            matrix[row, feature_vocab[name]] = value
    return matrix

In [16]:
def create_vocab(data, top_n=None):
    """Assign a unique integer id to each feature name seen in ``data``.

    ``data`` is a list of feature dicts. Features are ranked by the number of
    dicts that contain them, and ids are handed out in that order starting
    at 0. ``top_n`` keeps only the n most frequent features; None keeps all.

    Returns a dict mapping feature name -> id.
    """
    # Iterating a dict yields each key once, so this counts documents per feature.
    doc_freq = Counter(feat for doc in data for feat in doc)
    return {feat: idx for idx, (feat, _) in enumerate(doc_freq.most_common(top_n))}

In [17]:
def pipeline(trainX, devX, trainY, devY, feature_functions,
             top_n=100000, C=100, max_iter=10000):
    """Train and evaluate a logistic-regression classifier on the given features.

    Args:
        trainX, devX: lists of examples passed to each feature function.
        trainY, devY: labels aligned with trainX and devX.
        feature_functions: list of callables, each mapping one example to a
            dict of feature name -> value.
        top_n: maximum vocabulary size; only the top_n features occurring in
            the most training examples are kept (None keeps all).
        C: inverse regularization strength passed to LogisticRegression.
        max_iter: iteration cap passed to LogisticRegression.

    Returns:
        A tuple ``(clf, feature_vocab)``: the fitted classifier and the
        feature name -> column index mapping it was trained with.

    Side effects:
        Prints the accuracy on the dev data.
    """
    trainX_feat = build_features(trainX, feature_functions)
    devX_feat = build_features(devX, feature_functions)

    # Build the vocabulary from *training* features only, so nothing about the
    # dev set leaks into the model; unseen dev features are simply dropped.
    feature_vocab = create_vocab(trainX_feat, top_n=top_n)

    trainX_ids = features_to_ids(trainX_feat, feature_vocab)
    devX_ids = features_to_ids(devX_feat, feature_vocab)

    clf = linear_model.LogisticRegression(C=C, solver='lbfgs', penalty='l2', max_iter=max_iter)
    clf.fit(trainX_ids, trainY)
    print("Accuracy: %.3f" % clf.score(devX_ids, devY))

    return clf, feature_vocab

In [18]:
def count_feature(tokens):
    """Return a dict mapping each distinct token to its number of occurrences."""
    counts = {}
    for token in tokens:
        if token not in counts:
            counts[token] = 0
        counts[token] += 1
    return counts

In [19]:
# Bag-of-words baseline: the only feature function is per-token counts
# (the tokens are each article's `l` column, as assembled by build_data).
features = [count_feature]
# pipeline prints the held-out accuracy and returns the fitted model and vocabulary.
clf, vocab = pipeline(trainX, testX, trainY, testY, features)

Accuracy: 0.949


In [20]:
# Show the 10 most negative weights first, then the 10 most positive.
# With labels 0/1 from build_data, negative weights push toward label 0
# (wlp_spok_file) and positive weights toward label 1 (wlp_tvm_file).
print_weights(clf, vocab, n=10)

-0.207	:
-0.139	know
-0.137	but
-0.134	you
-0.133	a
-0.129	so
-0.115	woman
-0.113	in
-0.110	this
-0.089	and

0.228	!
0.208	?
0.194	me
0.190	i
0.174	"
0.158	get
0.137	sex
0.121	...
0.114	n't
0.094	trafficking
