# Data Parsing COCA

## coca-samples-wlp

In [1]:
import re
import os

### Parsing Functions

In [2]:
class Article:
    """One article from a wlp file: its number plus three parallel token columns.

    Attributes:
        number: integer article number.
        w, l, p: lists filled by parse_article from the 2nd, 3rd and 4th
            tab-separated columns of each token row (presumably word, lemma
            and part-of-speech tag -- confirm against the corpus docs).
    """

    def __init__(self, number, w, l, p, s=None):
        # `s` is accepted for backward compatibility but is never stored or
        # read; its default is None rather than a shared mutable list.
        self.number = number
        self.w = w
        self.l = l
        self.p = p

    def __str__(self):
        """Return a one-line preview: the first 10 items of each column, newline-terminated."""
        preview = {"w": self.w[:10], "l": self.l[:10], "p": self.p[:10]}
        return str({self.number: preview}) + "\n"

class File:
    """A parsed corpus file: its filename and the list of articles found in it."""

    def __init__(self, filename, articles):
        self.filename = filename
        self.articles = articles

    def __str__(self):
        """Return a preview dict string: filename -> str() of the first three articles."""
        # `self` is required here; without it str(file) raises TypeError.
        return str({self.filename: [str(a) for a in self.articles[:3]]})

In [3]:
def get_wlp(line):
    """Split one tab-separated row into its (w, l, p) columns.

    Returns a 3-tuple built from the 2nd, 3rd and 4th columns when the row has
    exactly four tab-separated fields; the first column is discarded.
    Returns None for any other row.
    """
    fields = line.split('\t')
    if len(fields) == 4:
        _, w, l, p = fields
        return (w, l, p)
    return None

In [4]:
def parse_article(text):
    """Build an Article from the raw text of one article chunk.

    The article number is the first run of digits found on any line of `text`.
    Every line that get_wlp accepts contributes one entry to each of the
    w / l / p lists. Returns None when no line contains a digit.
    """
    rows = text.split("\n")

    # First digit run across all lines, scanning top to bottom.
    digit_hits = (re.search(r'\d+', row) for row in rows)
    first_hit = next((hit for hit in digit_hits if hit), None)
    if first_hit is None:
        return None
    number = int(first_hit.group())

    # Keep only the rows that parse into a (w, l, p) triple.
    triples = [triple for triple in map(get_wlp, rows) if triple is not None]
    w = [triple[0] for triple in triples]
    l = [triple[1] for triple in triples]
    p = [triple[2] for triple in triples]

    return Article(number, w, l, p)

In [5]:
def parse_filename(directory, filename):
    """Read one wlp file and parse it into a File of Articles.

    Args:
        directory: directory containing the file (with or without a trailing
            path separator).
        filename: name of the file inside `directory`.

    Returns:
        A File holding every article that parse_article could build, or None
        when the file yields no articles.
    """
    path = os.path.join(directory, filename)
    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding="ISO-8859-1") as handle:
        file_text = handle.read()

    # Each article is introduced by a "<digits>\t@@<digits>\t\t" marker;
    # splitting on it leaves one text chunk per article.
    pattern = r'\d+\t@@\d+\t\t'
    article_texts = re.split(pattern, file_text)

    articles = []
    for text in article_texts:
        article = parse_article(text)
        if article is None:
            continue
        articles.append(article)

    if len(articles) == 0:
        return None
    return File(filename, articles)

In [6]:
def get_files(directory):
    """Parse every ".txt" entry of `directory` and return the resulting File objects.

    Entries for which parse_filename returns None are left out. The order
    follows os.listdir.
    """
    txt_names = (name for name in os.listdir(directory) if name.endswith(".txt"))
    parsed_files = (parse_filename(directory, name) for name in txt_names)
    return [parsed for parsed in parsed_files if parsed is not None]

### Example of Files and Articles

In [7]:
# Directory containing the wlp sample .txt files. This is an absolute local
# path; point it at your own copy of the data before running.
wlp_directory = "/home/divya/Desktop/coca-samples-wlp (1)/"

In [8]:
# Parse every .txt file in the wlp directory; files that yield no articles are skipped.
files = get_files(wlp_directory)
print(f"Number of files: {len(files)}")

Number of files: 8


In [9]:
# Inspect the first parsed file (list order comes from os.listdir in get_files).
file = files[0]
print(f"Filename: {file.filename}. Number of articles: {len(file.articles)}.")

Filename: wlp_mag.txt. Number of articles: 948.


In [10]:
# Inspect the first article of that file: its number, its token count, and
# the first 10 entries of each of the three parallel columns.
article = file.articles[0]
print(f"Article #: {article.number}. Number of words: {len(article.w)}.")
print(f"Sample of words: {article.w[:10]}.")
print(f"Sample of l: {article.l[:10]}.")
print(f"Sample of p: {article.p[:10]}.")

Article #: 2000341. Number of words: 507.
Sample of words: ['Al', 'Sobotka', ',', 'has', 'been', 'driving', 'a', 'Zamboni', 'for', 'the'].
Sample of l: ['al', 'sobotka', ',', 'have', 'be', 'drive', 'a', 'zamboni', 'for', 'the'].
Sample of p: ['np1', 'np1', 'y', 'vhz', 'vbn', 'vvg', 'at1', 'nn1_jj', 'if', 'at'].


### Statistics for Midterm Report

In [11]:
# Per-file summary: article count and mean number of `w` entries per article.
# get_files only returns files with at least one article, so the division is safe.
for file in files: 
    print(f"Filename: {file.filename}")
    print(f"Number of articles: {len(file.articles)}.")
    avg_article_len = sum([len(a.w) for a in file.articles]) / len(file.articles)
    print(f"Average Number of Words per Article: {avg_article_len}")
    print("----------")    

Filename: wlp_mag.txt
Number of articles: 948.
Average Number of Words per Article: 1652.0611814345991
----------
Filename: wlp_spok.txt
Number of articles: 262.
Average Number of Words per Article: 4428.412213740458
----------
Filename: wlp_tvm.txt
Number of articles: 226.
Average Number of Words per Article: 6935.115044247787
----------
Filename: wlp_fic.txt
Number of articles: 274.
Average Number of Words per Article: 5130.029197080292
----------
Filename: wlp_acad.txt
Number of articles: 265.
Average Number of Words per Article: 5355.603773584906
----------
Filename: wlp_news.txt
Number of articles: 871.
Average Number of Words per Article: 1594.5832376578646
----------
Filename: wlp_blog.txt
Number of articles: 990.
Average Number of Words per Article: 1601.1151515151514
----------
Filename: wlp_web.txt
Number of articles: 892.
Average Number of Words per Article: 1594.9159192825111
----------


## coca-samples-text

WARNING: This section is incomplete and may take a long time to run. I suggest not running these cells :)

In [12]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

import re
import os

[nltk_data] Downloading package stopwords to /home/divya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
class Article:
    """One article from a text file: raw text, its sentences, and optional token lists.

    Attributes:
        number: integer article number.
        text: the full article line as read from the file.
        sentence_texts: list of sentence strings.
        sentence_tokens: list of per-sentence token lists (empty if not given).
        sentence_tokens_wo_sw: same as sentence_tokens with stopwords removed
            (empty if not given).
    """

    def __init__(self, number, text, sentence_texts, sentence_tokens=None, sentence_tokens_wo_sw=None):
        self.number = number
        self.text = text
        self.sentence_texts = sentence_texts
        # Give each instance its own list: a `[]` default would be one shared
        # object, so appending to one article's tokens would change them all.
        self.sentence_tokens = [] if sentence_tokens is None else sentence_tokens
        self.sentence_tokens_wo_sw = [] if sentence_tokens_wo_sw is None else sentence_tokens_wo_sw

    def __str__(self):
        """Return a preview: first 50 chars of text and first 10 chars of the first 3 sentences."""
        return str({self.number : {"text" : self.text[:50], "sentences" : [s[:10] for s in self.sentence_texts[:3]]}})

class File:
    """A parsed corpus file, pairing its filename with its list of articles."""

    def __init__(self, filename, articles):
        self.filename, self.articles = filename, articles

    def __str__(self):
        """Return a dict-style preview mapping the filename to str() of the first three articles."""
        preview = list(map(str, self.articles[:3]))
        return str({self.filename: preview})

In [14]:
# Stopword lookup set, built lazily on first use and then reused by get_tokens.
_STOPWORD_SET = None


def _stopword_set():
    """Return the cached stopword set, loading it from NLTK on the first call."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        # Called without a language argument, exactly as before.
        # NOTE(review): that appears to load the lists of every language in
        # the corpus -- confirm whether only "english" was intended.
        _STOPWORD_SET = frozenset(stopwords.words())
    return _STOPWORD_SET


def get_tokens(sentence_text):
    """Tokenize one sentence and also return the tokens with stopwords removed.

    Args:
        sentence_text: the sentence as a string.

    Returns:
        A tuple (tokens, tokens_without_stopwords), or None when word_tokenize
        produces no tokens.
    """
    text_tokens = word_tokenize(sentence_text)
    if len(text_tokens) == 0:
        return None
    # Load the stopword list once and test membership against a set, instead
    # of re-reading the whole list for every single token.
    stop_set = _stopword_set()
    tokens_without_sw = [word for word in text_tokens if word not in stop_set]
    return (text_tokens, tokens_without_sw)

In [15]:
def parse_articles(file_text):
    """Split the text of one corpus file into Article objects, one per line.

    A line becomes an article only if an "@@<digits> " marker occurs within
    its first 20 characters; the digits are the article number. The line is
    then split into sentences on " . ", " ? " or " ! ".

    Args:
        file_text: full contents of a text_*.txt file.

    Returns:
        A list of Article objects, or None when no line produced an article.
    """
    header_re = re.compile(r'@@(\d+) ')
    # Plain character class: putting "|" inside the brackets would make a
    # literal pipe count as a sentence terminator too.
    boundary_re = re.compile(r" [.?!] ")

    articles = []
    for article_text in file_text.split("\n"):
        if len(article_text) == 0:
            continue

        header = header_re.search(article_text[:20])
        if header is None:
            continue
        article_number = int(header.group(1))

        # The leading segment (the start of the line up to the first sentence
        # boundary) is dropped, so a line with no boundary yields no sentences.
        sentence_texts = boundary_re.split(article_text)[1:]
        if len(sentence_texts) == 0:
            continue

        # TODO: per-sentence tokenization via get_tokens is currently disabled,
        # so the Article is built without token lists.
        articles.append(Article(article_number, article_text, sentence_texts))

    if len(articles) == 0:
        return None
    return articles

In [16]:
def parse_filename(directory, filename):
    """Read one text file and parse it into a File of Articles.

    Args:
        directory: directory containing the file (with or without a trailing
            path separator).
        filename: name of the file inside `directory`.

    Returns:
        A File wrapping the parsed articles, or None when parse_articles
        finds none.
    """
    path = os.path.join(directory, filename)
    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding="ISO-8859-1") as handle:
        file_text = handle.read()

    articles = parse_articles(file_text)
    if articles is None:
        return None
    return File(filename, articles)

In [17]:
def get_files(directory):
    """Return a File for each ".txt" entry of `directory` that parses successfully.

    Entries whose parse_filename result is None are skipped; the order is
    whatever os.listdir yields.
    """
    collected = []
    for entry in os.listdir(directory):
        if entry.endswith(".txt"):
            parsed = parse_filename(directory, entry)
            if parsed is not None:
                collected.append(parsed)
    return collected

### Example of File and Articles

In [18]:
# Directory containing the text sample .txt files. This is an absolute local
# path; point it at your own copy of the data before running.
text_directory = "/home/divya/Desktop/coca-samples-text/"
# NOTE(review): text_spok is not referenced by any cell visible in this notebook.
text_spok = "text_spok.txt"

In [19]:
%%time
# Rebinds `files`: from here on it holds the text-corpus File objects, not the wlp ones.
files = get_files(text_directory)

CPU times: user 1.5 s, sys: 156 ms, total: 1.65 s
Wall time: 1.65 s


In [20]:
# Show the first parsed file; File.__str__ previews at most its first three articles.
file = files[0]
print(file)

{'text_news.txt': ["{3000041: {'text': '@@3000041 <p> Basketball United States Wins Bronze', 'sentences': ['Kenny Ande', 'Yugoslavia', 'This was t']}}", "{3000341: {'text': '@@3000341 <p> In the airy sitting room of the ranc', 'sentences': ['In the fra', '<p> Nelson', 'He is the ']}}", "{3000641: {'text': '@@3000641 <p> It was just a normal summer day when', 'sentences': ['<p> Just o', 'Not the hi', 'Completely']}}"]}


In [21]:
# Per-file summary: article count and mean number of sentences per article.
# get_files only returns files with at least one article, so the division is safe.
for file in files: 
    print(f"Filename: {file.filename}")
    print(f"Number of articles: {len(file.articles)}")
    avg_article_sent = sum([len(a.sentence_texts) for a in file.articles]) / len(file.articles)
    print(f"Average number of sentences per article: {avg_article_sent}")
    print("----------")

Filename: text_news.txt
Number of articles: 871
Average number of sentences per article: 69.59931113662456
----------
Filename: text_fic.txt
Number of articles: 274
Average number of sentences per article: 325.3430656934307
----------
Filename: text_web.txt
Number of articles: 856
Average number of sentences per article: 71.50934579439253
----------
Filename: text_spok.txt
Number of articles: 263
Average number of sentences per article: 245.08365019011407
----------
Filename: text_tvm.txt
Number of articles: 233
Average number of sentences per article: 783.2145922746781
----------
Filename: text_blog.txt
Number of articles: 982
Average number of sentences per article: 69.70875763747455
----------
Filename: text_acad.txt
Number of articles: 265
Average number of sentences per article: 190.25283018867924
----------
Filename: text_mag.txt
Number of articles: 947
Average number of sentences per article: 69.23125659978881
----------
