# Data Parsing COCA

## coca-samples-wlp

In [1]:
import re
import os

### Parsing Functions

In [2]:
class Article:
    """One article from a WLP file, held as three lists.

    Attributes:
        number: Identifier of the article.
        w: List of word forms.
        l: List of lemmas.
        p: List of part-of-speech tags.
    """

    def __init__(self, number, w, l, p, s=None):
        """Store the article identifier and its w/l/p lists.

        ``s`` is accepted so existing five-argument calls keep working, but
        it is not used. Its default is ``None`` rather than a list literal so
        that no mutable object is shared between calls.
        """
        self.number = number
        self.w = w
        self.l = l
        self.p = p

    def __str__(self):
        """Return a dict-style preview with the first ten items of each list."""
        preview = {"w": self.w[:10], "l": self.l[:10], "p": self.p[:10]}
        return str({self.number: preview})

class File:
    """A parsed corpus file: its filename plus the articles found in it."""

    def __init__(self, filename, articles):
        """Store the filename and the list of parsed articles."""
        self.filename = filename
        self.articles = articles

    def __str__(self):
        """Return a short preview: the filename and the first five articles."""
        previews = [str(a) for a in self.articles[:5]]
        # Literal braces in an f-string are written "{{" and "}}".
        return f"{{ {self.filename} : {previews} ... }}"

In [3]:
def get_wlp(line):
    """Extract the (w, l, p) fields from one tab-separated line.

    Returns a 3-tuple made of the last three columns when the line has
    exactly four tab-separated fields; otherwise returns ``None``.
    """
    fields = line.split('\t')
    # The first column is discarded; only w, l and p are kept.
    return tuple(fields[1:]) if len(fields) == 4 else None

In [4]:
def parse_article(text):
    """Build an Article from the raw text of one article.

    The article number is the first run of digits on the first line that
    contains any digit. When no line contains a digit, ``None`` is returned.
    Every line accepted by get_wlp adds one entry to each of w, l and p.
    """
    rows = text.split("\n")

    # Lazily scan the lines and stop at the first one that yields digits.
    digit_runs = (re.findall(r'\d+', row) for row in rows)
    first_hit = next((runs for runs in digit_runs if runs), None)
    if first_hit is None:
        return None
    number = int(first_hit[0])

    words, lemmas, tags = [], [], []
    for row in rows:
        fields = get_wlp(row)
        if fields is None:
            # Not a four-column line; nothing to record.
            continue
        word, lemma, tag = fields
        words.append(word)
        lemmas.append(lemma)
        tags.append(tag)

    return Article(number, words, lemmas, tags)

In [5]:
def parse_filename(directory, filename):
    """Read one WLP file and parse it into a File of Articles.

    Args:
        directory: Folder that contains the file.
        filename: Name of the file inside ``directory``.

    Returns:
        A File holding every article parse_article could build, or ``None``
        when no article was found.
    """
    # os.path.join works whether or not ``directory`` ends with a separator,
    # and the context manager closes the handle even if read() raises.
    path = os.path.join(directory, filename)
    with open(path, "r", encoding="ISO-8859-1") as handle:
        file_text = handle.read()

    # The text is cut wherever a "<digits>\t@@<digits>\t\t" marker occurs.
    pattern = r'\d+\t@@\d+\t\t'
    parsed = (parse_article(chunk) for chunk in re.split(pattern, file_text))
    articles = [article for article in parsed if article is not None]

    if not articles:
        return None
    return File(filename, articles)

In [6]:
def get_files(directory):
    """Parse every ``.txt`` entry of ``directory`` and collect the results.

    Entries are visited in the order os.listdir yields them. Files for which
    parse_filename returns ``None`` are left out of the returned list.
    """
    text_names = (name for name in os.listdir(directory) if name.endswith(".txt"))
    candidates = (parse_filename(directory, name) for name in text_names)
    return [parsed for parsed in candidates if parsed is not None]

### Example of Files and Articles

In [7]:
# Folder holding the WLP sample files. This is a machine-specific absolute
# path; change it to match your own copy of the data before running.
wlp_directory = "/home/divya/Desktop/coca-samples-wlp (1)/"

In [8]:
# Parse every .txt file in the WLP directory and report how many were parsed.
files = get_files(wlp_directory)
print(f"Number of files: {len(files)}")

Number of files: 8


In [9]:
# Look at the first parsed file: its name and how many articles it holds.
file = files[0]
print(f"Filename: {file.filename}. Number of articles: {len(file.articles)}.")

Filename: wlp_mag.txt. Number of articles: 948.


In [10]:
# Look at the first article of that file: its number, its length, and the
# first ten entries of each of the w, l and p lists.
article = file.articles[0]
print(f"Article #: {article.number}. Number of words: {len(article.w)}.")
print(f"Sample of words: {article.w[:10]}.")
print(f"Sample of l: {article.l[:10]}.")
print(f"Sample of p: {article.p[:10]}.")

Article #: 2000341. Number of words: 507.
Sample of words: ['Al', 'Sobotka', ',', 'has', 'been', 'driving', 'a', 'Zamboni', 'for', 'the'].
Sample of l: ['al', 'sobotka', ',', 'have', 'be', 'drive', 'a', 'zamboni', 'for', 'the'].
Sample of p: ['np1', 'np1', 'y', 'vhz', 'vbn', 'vvg', 'at1', 'nn1_jj', 'if', 'at'].


## coca-samples-text

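WARNING: This section is incomplete and may take a long time to run, so I suggest not running these cells. Note that it redefines `Article`, `File`, `parse_filename`, and `get_files` from the section above; re-run the earlier definition cells before using the WLP parser again.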

In [11]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

import re
import os

[nltk_data] Downloading package stopwords to /home/divya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
class Article:
    """One article from the plain-text samples, stored as tokenized sentences.

    Attributes:
        number: Identifier of the article (may be ``None``).
        sentence_tokens: One list of tokens per sentence.
        sentence_tokens_wo_sw: Per-sentence token lists with stopwords removed.
    """

    def __init__(self, number, sentence_tokens, sentence_tokens_wo_sw=None):
        """Store the identifier and both per-sentence token lists.

        When ``sentence_tokens_wo_sw`` is omitted, the instance gets its own
        empty list. A list literal as the default would be one object shared
        by every Article created without that argument.
        """
        self.number = number
        self.sentence_tokens = sentence_tokens
        if sentence_tokens_wo_sw is None:
            sentence_tokens_wo_sw = []
        self.sentence_tokens_wo_sw = sentence_tokens_wo_sw

class File:
    """Pairs a source filename with the articles parsed from it."""

    def __init__(self, filename, articles):
        # Both constructor arguments are kept unchanged on the instance.
        self.filename, self.articles = filename, articles

In [13]:
def _all_stopwords():
    """Return the stopword collection as a frozenset, loading it only once."""
    cached = getattr(_all_stopwords, "_cache", None)
    if cached is None:
        # NOTE(review): stopwords.words() is called without a language, which
        # appears to cover every language in the corpus. Confirm this is
        # intended rather than e.g. stopwords.words("english").
        cached = frozenset(stopwords.words())
        _all_stopwords._cache = cached
    return cached


def get_tokens(sentence_text):
    """Tokenize one sentence and also return it with stopwords removed.

    Returns:
        ``(tokens, tokens_without_stopwords)``, or ``None`` when tokenizing
        the sentence yields no tokens.
    """
    text_tokens = word_tokenize(sentence_text)
    if not text_tokens:
        return None
    # The stopword list used to be rebuilt for every token and searched
    # linearly; it is now loaded once and looked up in a set.
    known_stopwords = _all_stopwords()
    tokens_without_sw = [word for word in text_tokens if word not in known_stopwords]
    return (text_tokens, tokens_without_sw)

def get_sentences(article_text):
    """Split article text into sentences at a space-delimited '.', '?' or '!'.

    The delimiter (the punctuation mark plus the space on each side) is
    removed from the returned pieces. Text without any delimiter comes back
    as a single-element list.
    """
    # Inside a character class "|" is a literal character, so the earlier
    # "[\.|\?|\!]" also split on " | ". Only the three marks are listed here.
    pattern = r" [.?!] "
    return re.split(pattern, article_text)

def get_article_texts(file_text):
    """Split the text of a file into one string per article.

    Articles are introduced by an "@@<digits> " marker. Markers that follow
    a newline are split on, and a marker at the very start of the text is
    stripped so it does not stay inside the first article.
    """
    marker = r'@@\d+ '
    # The split pattern needs a preceding newline, which a marker on the very
    # first line does not have; remove that one explicitly.
    leading = re.match(marker, file_text)
    if leading is not None:
        file_text = file_text[leading.end():]
    return re.split(r'\n' + marker, file_text)

In [14]:
def parse_articles(file_text):
    """Turn the text of a file into a list of tokenized Article objects.

    Each article text is split into sentences and each sentence is tokenized.
    Sentences for which get_tokens returns ``None`` are skipped, and so are
    articles left with no sentences. Every Article is created with ``None``
    as its number. Returns ``None`` when no article remains.
    """
    parsed = []
    for chunk in get_article_texts(file_text):
        token_pairs = [
            pair
            for pair in map(get_tokens, get_sentences(chunk))
            if pair is not None
        ]
        if not token_pairs:
            continue
        # Separate the (tokens, tokens-without-stopwords) pairs into two lists.
        with_stopwords = [pair[0] for pair in token_pairs]
        without_stopwords = [pair[1] for pair in token_pairs]
        parsed.append(Article(None, with_stopwords, without_stopwords))

    return parsed if parsed else None

In [15]:
def parse_filename(directory, filename):
    """Read one plain-text file and parse it into a File of Articles.

    Args:
        directory: Folder that contains the file.
        filename: Name of the file inside ``directory``.

    Returns:
        A File wrapping the articles from parse_articles, or ``None`` when
        parse_articles found none.
    """
    # os.path.join works whether or not ``directory`` ends with a separator,
    # and the context manager closes the handle even if read() raises.
    path = os.path.join(directory, filename)
    with open(path, "r", encoding="ISO-8859-1") as handle:
        file_text = handle.read()

    articles = parse_articles(file_text)
    if articles is None:
        return None
    return File(filename, articles)

In [16]:
def get_files(directory):
    """Return the parsed File for each ``.txt`` entry of ``directory``.

    Names are taken in os.listdir order. An entry is dropped when it does not
    end in ".txt" or when parse_filename returns ``None`` for it.
    """
    collected = []
    for entry in os.listdir(directory):
        if entry.endswith(".txt"):
            parsed = parse_filename(directory, entry)
            if parsed is not None:
                collected.append(parsed)
    return collected

### Example of File and Articles

In [17]:
# Folder holding the plain-text sample files (machine-specific absolute path;
# change it to match your own copy), and the name of one file in it that is
# parsed on its own below.
text_directory = "/home/divya/Desktop/coca-samples-text/"
text_spok = "text_spok.txt"

In [None]:
%%time
# Parse every .txt file in the text directory, timing the whole cell.
files = get_files(text_directory)

In [None]:
%%time
# Parse only the single file named by text_spok, timing the whole cell.
file = parse_filename(text_directory, text_spok)