# Data Parsing COCA

## coca-samples-text

In [1]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

import re
import os

[nltk_data] Downloading package stopwords to /home/divya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Parsing Functions

In [2]:
class Article:
    """One article parsed from a COCA text file.

    Attributes:
        number: Integer id of the article.
        text: The full raw article text.
        sentence_texts: List of sentence strings split out of ``text``.
        sentence_tokens: Per-sentence token lists (empty list unless supplied).
        sentence_tokens_wo_sw: Per-sentence token lists, presumably with stop
            words removed (empty list unless supplied) -- TODO confirm naming.
    """

    def __init__(self, number, text, sentence_texts, sentence_tokens=None, sentence_tokens_wo_sw=None):
        self.number = number
        self.text = text
        self.sentence_texts = sentence_texts
        # Build a fresh list per instance. A literal `[]` default would be
        # created once and shared by every Article constructed without the
        # argument, so appending to one article would leak into all others.
        self.sentence_tokens = [] if sentence_tokens is None else sentence_tokens
        self.sentence_tokens_wo_sw = [] if sentence_tokens_wo_sw is None else sentence_tokens_wo_sw

    def __str__(self):
        """Return a short preview: first 50 chars of text, first 10 chars of up to 3 sentences."""
        preview = {
            "text": self.text[:50],
            "sentences": [s[:10] for s in self.sentence_texts[:3]],
        }
        return str({self.number: preview})

class File:
    """A parsed corpus file: its filename plus the list of articles found in it."""

    def __init__(self, filename, articles):
        self.filename, self.articles = filename, articles

    def __str__(self):
        """Return a short preview keyed by filename, showing at most three articles."""
        article_previews = list(map(str, self.articles[:3]))
        return str({self.filename: article_previews})

In [3]:
def parse_articles(file_text):
    """Parse the raw text of one corpus file into a list of Article objects.

    Each line of ``file_text`` is treated as one article. A line is kept only
    if an ``@@<digits> `` header occurs within its first 20 characters and the
    line contains at least one sentence boundary.

    Args:
        file_text: Full contents of a corpus file as a single string.

    Returns:
        A list of Article objects, or None when no line yields an article.
    """
    header_pattern = re.compile(r'@@\d+ ')
    # Sentence boundary: a space-delimited '.', '?' or '!'. Inside a character
    # class '|' is a literal pipe rather than alternation, so it must not be
    # listed -- otherwise " | " would be treated as a sentence boundary too.
    sentence_boundary = re.compile(r" [.?!] ")

    articles = []
    # Split on "\n" only: str.splitlines() would also break on characters such
    # as "\x85", which can occur inside ISO-8859-1 decoded text.
    for article_text in file_text.split("\n"):
        if not article_text:
            continue

        header = header_pattern.search(article_text[:20])
        if header is None:
            continue
        # Strip the leading "@@" and the trailing space to get the id digits.
        article_number = int(header.group()[2:-1])

        # The first segment (it carries the "@@<id>" header and everything up
        # to the first boundary) is dropped; only later segments are kept.
        sentence_texts = sentence_boundary.split(article_text)[1:]
        if not sentence_texts:
            continue

        articles.append(Article(article_number, article_text, sentence_texts))

    # Callers rely on None (not an empty list) to signal "nothing parsed".
    return articles or None

In [4]:
def parse_filename(directory, filename):
    """Read one corpus file and parse it into a File object.

    Args:
        directory: Directory that contains the file (with or without a
            trailing path separator).
        filename: Name of the file inside ``directory``.

    Returns:
        A File holding the parsed articles, or None when parse_articles finds
        no article in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # os.path.join copes with a directory that lacks a trailing separator,
    # which plain string concatenation would silently turn into a wrong path.
    path = os.path.join(directory, filename)

    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding="ISO-8859-1") as handle:
        file_text = handle.read()

    articles = parse_articles(file_text)
    if articles is None:
        return None
    return File(filename, articles)

In [5]:
def get_files(directory):
    """Parse every ``.txt`` file in ``directory`` into File objects.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        A list of File objects in filename order; files that yield no
        articles are skipped. Empty when nothing qualifies.
    """
    files = []
    # os.listdir returns names in arbitrary order; sorting makes the result
    # (and anything built from it) reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        parsed = parse_filename(directory, filename)
        if parsed is not None:
            files.append(parsed)
    return files

In [7]:
# Location of the COCA sample text files. Set the COCA_TEXT_DIR environment
# variable to point somewhere else instead of editing this machine-specific
# default. Joining with "" guarantees a trailing path separator.
text_directory = os.path.join(
    os.environ.get("COCA_TEXT_DIR", "/home/divya/Desktop/coca-samples-text/"), ""
)
text_files = get_files(text_directory)


def _first_file_matching(files, marker):
    """Return the first parsed file whose filename contains ``marker``.

    Raises IndexError (the same exception type as indexing an empty list)
    with a message that names what was being looked for.
    """
    for parsed in files:
        if marker in parsed.filename:
            return parsed
    raise IndexError(f"no parsed file with {marker!r} in its name under {text_directory!r}")


text_spok_file = _first_file_matching(text_files, "text_spok")
text_tvm_file = _first_file_matching(text_files, "text_tvm")

In [9]:
# Sanity check: how many files were parsed, plus a truncated preview of one.
print(f"Number of Files: {len(text_files)}", text_spok_file, sep="\n")

Number of Files: 8
{'text_spok.txt': ["{17141: {'text': '@@17141 ERIC @!BURNS , FOX NEWS HOST : On this wee', 'sentences': ['The media ', 'How did th', 'How did au']}}", "{21741: {'text': '@@21741 qwq @ ! DOUGLAS-FORD-ARSO : I set the fire', 'sentences': ['DOUGLAS-FO', 'I burned i', 'I cant say']}}", "{207541: {'text': '@@207541 ( BEGIN VIDEOTAPE ) HOWARD KURTZ , HOST (', 'sentences': ['Were ABC a', 'Should Geo', 'With Iraq ']}}"]}


### Format Classifier

In [10]:
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy import sparse
from collections import Counter
import operator

In [11]:
def build_data(file1, file2):
    """Build a two-class dataset from the articles of two parsed files.

    Each example is one article's list of sentence strings. Articles from
    ``file1`` get label 0 and articles from ``file2`` get label 1, in that
    order.

    Returns:
        A tuple ``(X, Y)`` of parallel lists: examples and integer labels.
    """
    X, Y = [], []
    for label, source in enumerate((file1, file2)):
        for article in source.articles:
            X.append(article.sentence_texts)
            Y.append(label)
    return (X, Y)

In [13]:
# Label 0 = articles from text_spok_file, label 1 = articles from text_tvm_file.
X, Y = build_data(text_spok_file, text_tvm_file)

# Hold out 20% of the articles; the fixed random_state keeps the split repeatable.
trainX, testX, trainY, testY = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

### Build Classifier

In [14]:
def print_weights(clf, vocab, n=10):
    """Print the n lowest-weighted and n highest-weighted features of a model.

    Args:
        clf: Fitted model exposing ``coef_``; only ``coef_[0]`` is read.
        vocab: Mapping from feature name to its column index in ``coef_[0]``.
        n: How many features to print from each end of the ranking.

    Output is one ``weight<TAB>feature`` line per feature: the n lowest
    weights in ascending order, a blank line, then the n highest weights in
    descending order.
    """
    weights = clf.coef_[0]

    # Invert the name -> column mapping so each weight can be labelled.
    # Columns that no feature maps to keep the placeholder None.
    reverse_vocab = [None] * len(weights)
    for feature, column in vocab.items():
        reverse_vocab[column] = feature

    ranked = sorted(zip(reverse_vocab, weights), key=lambda pair: pair[1])

    def _show(pairs):
        for feature, weight in pairs:
            print("%.3f\t%s" % (weight, feature))

    _show(ranked[:n])
    print()
    # Reverse the ascending ranking (rather than re-sorting descending) so
    # that features with equal weights come out in reversed order too.
    _show(ranked[::-1][:n])

In [15]:
def build_features(dataX, feature_functions):
    """Featurize every example in ``dataX`` with the given feature functions.

    Each function is called with one example and must return a dict of
    feature name -> value. The dicts for one example are merged in list
    order, so a later function overwrites a feature name produced by an
    earlier one.

    Returns:
        A list with one merged feature dict per example, in input order.
    """

    def _featurize(example):
        combined = {}
        for extract in feature_functions:
            combined.update(extract(example))
        return combined

    return [_featurize(example) for example in dataX]

In [16]:
def features_to_ids(data, feature_vocab):
    """Convert feature dicts into a sparse matrix that scikit-learn can fit.

    A sparse representation matters because almost all feature values are 0
    for most documents, and we don't want to store them all in memory.

    Args:
        data: List of dicts mapping feature name -> value, one per document.
        feature_vocab: Mapping from feature name to column index.

    Returns:
        A ``scipy.sparse.lil_matrix`` of shape
        ``(len(data), len(feature_vocab))``. Features that are absent from
        ``feature_vocab`` are ignored.
    """
    matrix = sparse.lil_matrix((len(data), len(feature_vocab)))
    for row, doc in enumerate(data):
        known = (
            (feature_vocab[name], value)
            for name, value in doc.items()
            if name in feature_vocab
        )
        for column, value in known:
            matrix[row, column] = value
    return matrix

In [17]:
def create_vocab(data, top_n=None):
    """Assign a unique integer id to each feature name seen in ``data``.

    Features are ranked by the number of documents (dicts) that contain them;
    the most frequent feature gets id 0.

    Args:
        data: List of feature dicts, one per document.
        top_n: Keep only the ``top_n`` most frequent features; None keeps all.

    Returns:
        A dict mapping feature name -> integer id.
    """
    # Iterating a dict yields each key once, so this counts documents per
    # feature rather than summing feature values.
    doc_frequency = Counter(feat for doc in data for feat in doc)
    return {
        feat: idx
        for idx, (feat, _) in enumerate(doc_frequency.most_common(top_n))
    }

In [18]:
def pipeline(trainX, devX, trainY, devY, feature_functions):
    """Featurize the data, fit a logistic regression, and report dev accuracy.

    Args:
        trainX, trainY: Training examples and their labels.
        devX, devY: Held-out examples and labels used only for scoring.
        feature_functions: Functions applied to each example by build_features.

    Returns:
        A tuple ``(clf, feature_vocab)``: the fitted classifier and the
        feature name -> column id mapping it was trained with.

    Prints the accuracy on the dev data.
    """
    train_feats = build_features(trainX, feature_functions)
    dev_feats = build_features(devX, feature_functions)

    # The vocabulary comes from the *training* features only, so dev data
    # never influences which features the model can see.
    feature_vocab = create_vocab(train_feats, top_n=100000)

    train_matrix = features_to_ids(train_feats, feature_vocab)
    dev_matrix = features_to_ids(dev_feats, feature_vocab)

    clf = linear_model.LogisticRegression(
        penalty='l2',
        C=100,
        solver='lbfgs',
        max_iter=10000,
    )
    clf.fit(train_matrix, trainY)

    accuracy = clf.score(dev_matrix, devY)
    print("Accuracy: %.3f" % accuracy)

    return clf, feature_vocab

In [22]:
def count_feature(sentence_texts):
    """Histogram of sentence lengths for one article.

    Sentence length is the number of pieces produced by splitting on a single
    space, so an empty sentence counts as length 1.

    Args:
        sentence_texts: List of sentence strings.

    Returns:
        A dict mapping ``"sentence_length_<k>"`` to the number of sentences
        of that length.
    """
    histogram = {}
    for num_tokens in (len(text.split(" ")) for text in sentence_texts):
        key = f"sentence_length_{num_tokens}"
        if key in histogram:
            histogram[key] += 1
        else:
            histogram[key] = 1
    return histogram

In [23]:
# Train on sentence-length counts only; the held-out test split is passed in
# as the dev set that pipeline scores against.
features = [count_feature]
clf, vocab = pipeline(
    trainX, testX, trainY, testY,
    feature_functions=features,
)

Accuracy: 0.930


In [24]:
# Show the 10 lowest and the 10 highest feature weights of the fitted model.
# Labels were assigned by build_data (0 = text_spok_file, 1 = text_tvm_file),
# so negative weights presumably lean toward text_spok and positive weights
# toward text_tvm -- TODO confirm against clf.classes_.
print_weights(clf, vocab, n=10)

-1.545	sentence_length_24
-1.262	sentence_length_21
-1.025	sentence_length_19
-0.948	sentence_length_20
-0.906	sentence_length_10
-0.902	sentence_length_47
-0.823	sentence_length_37
-0.768	sentence_length_35
-0.752	sentence_length_42
-0.688	sentence_length_26

2.333	sentence_length_125
2.333	sentence_length_93
2.269	sentence_length_49
1.212	sentence_length_9
0.953	sentence_length_1
0.836	sentence_length_77
0.767	sentence_length_56
0.751	sentence_length_23
0.734	sentence_length_111
0.728	sentence_length_75
