In [None]:
# Mount Google Drive into the Colab runtime so the shared project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; the relative corpus/model paths used below resolve against it.
%cd '/content/drive/Shareddrives/Computational Semantics A3/Code'

In [None]:
import os
import pandas as pd
import numpy as np
import spacy
import gensim.downloader as api
from gensim.models import FastText
import string

In [None]:
class TextProcessor():
    """Small wrapper around a spaCy pipeline for common text-processing steps.

    Each method parses its input with the loaded pipeline (``self.nlp``) and
    returns plain Python strings or lists of strings.
    """

    def __init__(self, spacy_pipeline:str="en_core_web_lg"):
        """Load ``spacy_pipeline``, downloading it first if it is not installed.

        Args:
            spacy_pipeline: Name of the spaCy pipeline package to load.

        Raises:
            OSError: If the pipeline still cannot be loaded after the
                download attempt.
        """
        try:
            self.nlp = spacy.load(spacy_pipeline)
        except OSError:
            # spacy.load signals a missing pipeline package with OSError; only
            # that case should trigger a download (a bare ``except`` would also
            # swallow KeyboardInterrupt and unrelated failures).
            # The download goes through spaCy's Python API rather than a
            # ``!python3`` shell magic, so it targets the running interpreter
            # and the class also works outside IPython.
            from spacy.cli import download
            download(spacy_pipeline)
            self.nlp = spacy.load(spacy_pipeline)

    def lemmatize_text(self, text):
        """Return ``text`` with every token replaced by its lemma.

        Args:
            text: Raw text to parse.

        Returns:
            The token lemmas joined by single spaces.
        """
        doc = self.nlp(text)
        return " ".join(token.lemma_ for token in doc)

    def tokenize_text(self, text):
        """Return the surface form of every token in ``text`` as a list."""
        doc = self.nlp(text)
        return [token.text for token in doc]

    def pos_tagging(self, text):
        """Return the fine-grained tag (``token.tag_``) of every token in ``text``."""
        doc = self.nlp(text)
        return [token.tag_ for token in doc]

    def sentence_segmentation(self, text):
        """Split ``text`` into sentences.

        Args:
            text: Raw text to parse.

        Returns:
            A list with the text of each sentence found by the pipeline.

        Raises:
            AssertionError: If the parsed document carries no sentence-start
                annotation (the pipeline has no component setting boundaries).
        """
        doc = self.nlp(text)
        assert doc.has_annotation("SENT_START")
        return [sentence.text for sentence in doc.sents]

In [None]:
def list_coha_files(folder_path, max_year=1910):
    """List the COHA sample files in ``folder_path`` dated ``max_year`` or earlier.

    The year is read from the file name: it is the second underscore-separated
    field of the part before the first dot. Names that do not follow this
    pattern are skipped instead of aborting the whole listing.

    Only files directly inside ``folder_path`` are considered. Subdirectories
    (for example a notebook checkpoint folder) are ignored, so they can
    neither replace nor pollute the list.

    Args:
        folder_path: Directory holding the COHA sample text files.
        max_year: Latest year (inclusive) to keep.

    Returns:
        Paths of the selected files, sorted by file name so the corpus order
        is reproducible. Empty if the folder does not exist.
    """
    # Take only the first os.walk entry, i.e. the top-level directory listing.
    _, _, filenames = next(os.walk(folder_path), (folder_path, [], []))
    selected = []
    for name in sorted(filenames):
        try:
            year = int(name.split(".")[0].split("_")[1])
        except (IndexError, ValueError):
            # Not a COHA sample name (no numeric year field): ignore it.
            continue
        if year <= max_year:
            selected.append(os.path.join(folder_path, name))
    return selected


# only read the files dated 1910 or earlier
files = list_coha_files('coha_samples_text')

coha_corpus = []

for path in files:
    with open(path, 'r', encoding='utf-8') as file:
        coha_corpus.append(file.read())

In [None]:
def load_corpus(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path`` into memory.

    Args:
        folder_path: Directory containing the corpus text files.

    Returns:
        A list holding the full text of each ``.txt`` file, ordered by file
        name so the corpus order is reproducible across runs.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    corpus = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                corpus.append(text)
    return corpus


# Load the COCA samples (the COHA texts are already held in coha_corpus)
coca_corpus = load_corpus('coca_samples_text')

# Sanity check: show the first 100 characters of the first document of each corpus
coca_preview = coca_corpus[0][:100]
print("COCA Corpus:", coca_preview)
coha_preview = coha_corpus[0][:100]
print("\nCOHA Corpus:", coha_preview)

In [None]:
# Shared spaCy-backed processor (loads the default "en_core_web_lg" pipeline); process_corpus and process_corpus_v2 read this module-level name.
text_processor = TextProcessor()

In [None]:
# process the corpus so that each line in the text file is one sentence
def process_corpus(corpus:list, output_path:str, max_chars:int=1000000):
    """Write ``corpus`` to ``output_path`` with exactly one sentence per line.

    Each text is segmented with the module-level ``text_processor`` and its
    sentences are written as soon as they are produced. Every text therefore
    appears once in the output, and the last sentence of one text never shares
    a line with the first sentence of the next.

    Args:
        corpus: List of raw document strings.
        output_path: File to create (overwritten if it exists), UTF-8 encoded.
        max_chars: Largest slice handed to the segmenter in one call. The
            default is the 1 000 000-character limit of spaCy's parser. A
            longer text is cut into consecutive slices of this size, so a
            sentence that straddles a cut is split in two.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    with open(output_path, 'w', encoding='utf-8') as f:
        for text in corpus:
            for start in range(0, len(text), max_chars):
                chunk = text[start:start + max_chars]
                for sentence in text_processor.sentence_segmentation(chunk):
                    # A sentence may span several source lines; collapsing all
                    # whitespace keeps it on a single output line.
                    line = " ".join(sentence.split())
                    if line:
                        f.write(line + "\n")

In [None]:
# Write the COHA texts to coha_corpus.txt; that file is later passed as corpus_file= when training on COHA.
process_corpus(coha_corpus, 'coha_corpus.txt')

In [None]:
# Write the COCA texts to coca_corpus.txt; that file is later passed as corpus_file= when training on COCA.
process_corpus(coca_corpus, 'coca_corpus.txt')

In [None]:
# Train a FastText model on the COCA corpus file.
coca_corpus_file = 'coca_corpus.txt'
modern_model = FastText(window=5)
# Vocabulary pass first: the word count it records is what train() is given below.
modern_model.build_vocab(corpus_file=coca_corpus_file)
total_words = modern_model.corpus_total_words
modern_model.train(
    corpus_file=coca_corpus_file,
    total_words=total_words,
    epochs=5,
)

In [None]:
# Persist the FastText model trained on coca_corpus.txt (path is relative to the working directory).
modern_model.save('coca_ft.model')

In [None]:
# Train a FastText model on the COHA corpus file.
coha_corpus_file = 'coha_corpus.txt'
old_model = FastText(window=5)
# Vocabulary pass first: the word count it records is what train() is given below.
old_model.build_vocab(corpus_file=coha_corpus_file)
total_words = old_model.corpus_total_words
old_model.train(
    corpus_file=coha_corpus_file,
    total_words=total_words,
    epochs=5,
)

In [None]:
# Persist the FastText model trained on coha_corpus.txt (path is relative to the working directory).
old_model.save('coha_ft.model')

# Other preprocessing methods that we tried but ultimately did not use:

## FastText w/ lemmatized, lowercased text with punctuation removed

In [None]:
# process the corpus so that each line in the text file is one sentence
def process_corpus_v2(corpus:list, output_path:str, max_chars:int=1000000, lowercase:bool=True):
    """Write a lemmatized, punctuation-free copy of ``corpus``, one sentence per line.

    Each text is parsed once with the pipeline of the module-level
    ``text_processor``. Every sentence is written as its token lemmas joined
    by spaces, optionally lowercased, with all ``string.punctuation``
    characters removed. Sentences are written as soon as they are produced, so
    every text appears exactly once in the output.

    Lemmatization is applied to every text, whatever its length. Sentence
    boundaries come from the original text in the same parse, so no
    lemmatized (and therefore longer) string is ever parsed a second time.

    Args:
        corpus: List of raw document strings.
        output_path: File to create (overwritten if it exists), UTF-8 encoded.
        max_chars: Largest slice handed to the pipeline in one call. The
            default is the 1 000 000-character limit of spaCy's parser. A
            longer text is cut into consecutive slices of this size, so a
            sentence that straddles a cut is split in two.
        lowercase: Lowercase each output line (default ``True``).

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    strip_punctuation = str.maketrans('', '', string.punctuation)
    with open(output_path, 'w', encoding='utf-8') as f:
        for text in corpus:
            for start in range(0, len(text), max_chars):
                doc = text_processor.nlp(text[start:start + max_chars])
                for sentence in doc.sents:
                    line = " ".join(token.lemma_ for token in sentence)
                    if lowercase:
                        line = line.lower()
                    # Removing punctuation leaves gaps and stray whitespace
                    # tokens; collapse them so the line is space-separated words.
                    line = " ".join(line.translate(strip_punctuation).split())
                    if line:
                        f.write(line + "\n")

In [None]:
# Write the process_corpus_v2 variant of the COHA texts to coha_corpus_v2.txt.
process_corpus_v2(coha_corpus, 'coha_corpus_v2.txt')

In [None]:
# Write the process_corpus_v2 variant of the COCA texts to coca_corpus_v2.txt.
process_corpus_v2(coca_corpus, 'coca_corpus_v2.txt')

In [None]:
# Train a FastText model on the v2 COCA corpus file.
# Note: this rebinds modern_model, replacing the model trained on coca_corpus.txt in this session.
coca_v2_corpus_file = 'coca_corpus_v2.txt'
modern_model = FastText(window=5)
# Vocabulary pass first: the word count it records is what train() is given below.
modern_model.build_vocab(corpus_file=coca_v2_corpus_file)
total_words = modern_model.corpus_total_words
modern_model.train(
    corpus_file=coca_v2_corpus_file,
    total_words=total_words,
    epochs=5,
)

In [None]:
# Persist the FastText model trained on coca_corpus_v2.txt (path is relative to the working directory).
modern_model.save('coca_ft_v2.model')

In [None]:
# Train a FastText model on the v2 COHA corpus file.
# Note: this rebinds old_model, replacing the model trained on coha_corpus.txt in this session.
coha_v2_corpus_file = 'coha_corpus_v2.txt'
old_model = FastText(window=5)
# Vocabulary pass first: the word count it records is what train() is given below.
old_model.build_vocab(corpus_file=coha_v2_corpus_file)
total_words = old_model.corpus_total_words
old_model.train(
    corpus_file=coha_v2_corpus_file,
    total_words=total_words,
    epochs=5,
)

In [None]:
# Persist the FastText model trained on coha_corpus_v2.txt (path is relative to the working directory).
old_model.save('coha_ft_v2.model')

## Word2Vec, following the original Hamilton et al. implementation

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the COHA corpus file.
# Note: this rebinds old_model, replacing the FastText model held under that name in this session.
# NOTE(review): in gensim, max_vocab_size bounds memory while the vocabulary is
# being built by pruning infrequent words; a hard cap on the final vocabulary
# is max_final_vocab. Confirm which of the two was intended.
coha_corpus_file = 'coha_corpus.txt'
old_model = Word2Vec(min_count=100, max_vocab_size=50000)
# Vocabulary pass first: the word count it records is what train() is given below.
old_model.build_vocab(corpus_file=coha_corpus_file)
total_words = old_model.corpus_total_words
old_model.train(
    corpus_file=coha_corpus_file,
    total_words=total_words,
    epochs=5,
)

In [None]:
# Persist the Word2Vec model trained on coha_corpus.txt (path is relative to the working directory).
old_model.save("coha_w2v.model")

In [None]:
# Train a Word2Vec model on the COCA corpus file.
# Note: this rebinds modern_model, replacing the FastText model held under that name in this session.
# NOTE(review): min_count is 50 here but 100 for the COHA Word2Vec model; confirm the difference is deliberate.
# NOTE(review): in gensim, max_vocab_size bounds memory while the vocabulary is
# being built by pruning infrequent words; a hard cap on the final vocabulary
# is max_final_vocab. Confirm which of the two was intended.
coca_corpus_file = 'coca_corpus.txt'
modern_model = Word2Vec(min_count=50, max_vocab_size=50000)
# Vocabulary pass first: the word count it records is what train() is given below.
modern_model.build_vocab(corpus_file=coca_corpus_file)
total_words = modern_model.corpus_total_words
modern_model.train(
    corpus_file=coca_corpus_file,
    total_words=total_words,
    epochs=5,
)

In [None]:
modern_model.save("coca_w2v.model")