## Set the path to the CSV data file and run the initial import statements

In [None]:
# Notebook configuration: adjust these values before running the cells below.
csv_data_path = "/Users/quinton/Documents/Projects/COMET_Prestudy/scripts/collection/formal_scripts/output_data_mlab_01_2017.csv" # **PLACE CSV FILEPATH HERE** (file read by load_data)
sep = "\t" # column delimiter used when reading and writing the csv files
small_model = False # True -> load spaCy "en_core_web_sm" instead of "en_core_web_trf" (faster, less accurate)

In [None]:
from pathlib import Path
from pprint import pprint
from typing import Iterable, Iterator

import numpy as np
import pandas as pd

# Gensim imports
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# visualization
import pyLDAvis
import pyLDAvis.gensim_models

## Functions to handle data loading and data writing

In [None]:
def load_data(filename) -> pd.DataFrame:
    """Read a delimited text file into a DataFrame.

    The column delimiter is taken from the module-level ``sep`` setting.

    Args:
        filename: Location of the file to read (``str`` or ``Path``).

    Returns:
        The parsed contents of the file.

    Raises:
        FileNotFoundError: If ``filename`` is not an existing regular file.
    """
    filename = Path(filename)
    if not filename.is_file():
        # Name the offending path so a misconfigured location is easy to spot.
        raise FileNotFoundError(f"CSV file not found: {filename}")

    with open(filename.resolve(), 'r', encoding='utf-8') as file:
        return pd.read_csv(file, sep=sep)

def load_data_by_year(directory: Path) -> Iterator[pd.DataFrame]:
    """Yield one DataFrame per ``*.csv`` file located directly in ``directory``.

    Args:
        directory: Directory to scan; sub-directories are not searched.

    Yields:
        The result of ``load_data`` for each matching file, in sorted path
        order so repeated runs process the files in the same sequence.
    """
    # ``glob`` has to be called on the directory with the pattern as its
    # argument; joining the pattern onto the path and calling ``Path.glob``
    # on the result leaves the required ``pattern`` parameter unfilled.
    for data_file in sorted(Path(directory).glob('*.csv')):
        yield load_data(data_file)

def write_data(filename: Path, data: pd.DataFrame):
    """Write ``data`` to ``filename`` as UTF-8 delimited text.

    The column delimiter is taken from the module-level ``sep`` setting.

    Args:
        filename: Destination file; overwritten if it already exists.
        data: Table to serialise.
    """
    # Hand pandas the filesystem path itself. ``Path.as_uri()`` returns a
    # ``file://...`` URI string, which ``open`` would treat as a literal path
    # name, and it raises ``ValueError`` for relative paths.
    data.to_csv(filename, sep=sep, encoding='utf-8')


In [None]:
# obtain the stop words
# Import the corpus reader under an alias: binding the resulting list to the
# name ``stopwords`` would otherwise replace the reader itself, and running
# this cell a second time would fail on ``stopwords.words``.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words("english")
# Extra filler words to drop in addition to the NLTK English list.
stopwords = stopwords + ['thank', 'well', 'then', 'go', 'way', 'also']
stopwords

In [None]:
# Load the configured CSV file and keep only its 'content' column.
data = load_data(csv_data_path)['content']
# Preview the first rows of the column.
data.head()

# Content Preprocessing

In [None]:
def lemmatize(posts: Iterable[str], allowed_post_tags=("NOUN", "ADJ", "VERB", "ADV")):
    """Reduce each post to the lemmas of its tokens with an allowed POS tag.

    The spaCy pipeline is chosen by the module-level ``small_model`` flag
    (``en_core_web_sm`` when true, ``en_core_web_trf`` otherwise) and is
    loaded with the parser and NER components disabled.

    Args:
        posts: Iterable of post texts (for example a pandas Series of str).
        allowed_post_tags: Coarse part-of-speech tags (``token.pos_`` values)
            whose tokens are kept.

    Returns:
        A list with one string per post: the kept lemmas joined by single
        spaces, in their original order.
    """
    nlp = spacy.load("en_core_web_sm" if small_model else "en_core_web_trf", disable=["parser", 'ner'])

    # ``nlp.pipe`` streams the texts through the pipeline in batches instead
    # of invoking the full pipeline once per post; results keep input order.
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_post_tags)
        for doc in nlp.pipe(posts)
    ]

# Lemmatize every post; astype(str) converts each value to a string first so
# that every entry handed to the spaCy pipeline is text.
lemmatized_data = lemmatize(data.astype(str))
lemmatized_data

In [None]:
def tokenize_lemma(posts: list):
    """Tokenize every post with gensim's ``simple_preprocess``.

    Args:
        posts: Post texts to tokenize.

    Returns:
        A list holding, for each post, the token list produced by
        ``simple_preprocess`` with ``deacc=True``.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in posts]

tokenized_data = tokenize_lemma(lemmatized_data)

# Build the lookup set once: set membership is O(1), whereas testing each
# token against the stop-word list rescans the whole list every time.
stopword_set = set(stopwords)

# NOTE: this rebinds ``data`` from the raw post column to the cleaned token
# lists (one list of tokens per post) used by the following cells.
data = [[word for word in phrase if word not in stopword_set] for phrase in tokenized_data]
data

# Generate bigrams/trigrams 

In [None]:
# Train a phrase model on the token lists to detect frequently co-occurring
# adjacent tokens (bigrams).
bigrams_phrases = gensim.models.Phrases(data, min_count=5, threshold=100)
# Train a second phrase model on the bigram-transformed documents so that a
# detected bigram can combine with a neighbouring token (trigrams).
# No min_count is passed here, so the library default applies.
trigram_phrases = gensim.models.Phrases(bigrams_phrases[data], threshold=100)

# Wrap the trained models in Phraser objects; these are what the helper
# functions below index with a document to transform it.
bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Lazily run every document in ``texts`` through the ``bigram`` phraser.

    Returns a generator; nothing is transformed until it is iterated.
    """
    phrased_docs = (bigram[tokens] for tokens in texts)
    return phrased_docs

def make_trigrams(texts):
    """Lazily run every document through ``bigram`` and then ``trigram``.

    Returns a generator; nothing is transformed until it is iterated.
    """
    phrased_docs = (trigram[bigram[tokens]] for tokens in texts)
    return phrased_docs

# Lazy bigram view of the documents (nothing is computed until it is iterated).
data_bigrams = make_bigrams(data)
# make_trigrams applies the bigram phraser itself before the trigram phraser,
# so it is given the plain token lists; passing it data_bigrams would run the
# bigram pass a second time on every document.
data_bigrams_trigrams = make_trigrams(data)
# Materialise the generator: the documents are iterated more than once below.
bgtg = list(data_bigrams_trigrams)
bgtg

## Remove common, low-information words using TF-IDF

In [None]:
from gensim.models import TfidfModel

# Map every distinct token to an integer id and convert each document into a
# bag-of-words: a list of (token_id, count) pairs.
id2word = corpora.Dictionary(bgtg)
corpus = [id2word.doc2bow(text) for text in bgtg]

tfidf = TfidfModel(corpus=corpus, id2word=id2word)

low_value = 0.03  # tokens whose TF-IDF weight is below this are dropped
words = []  # every dropped token, as text, accumulated over all documents
words_missing_tfidf = []
# ``doc_index`` rather than ``iter``/``id``: a module-level loop variable named
# after a builtin would shadow that builtin for the rest of the notebook.
for doc_index, bow in enumerate(corpus):
    weights = tfidf[bow]  # (token_id, tfidf_weight) pairs, computed once
    scored_ids = {token_id for token_id, _ in weights}

    low_value_words = [token_id for token_id, weight in weights if weight < low_value]
    # Ids present in the bag-of-words but absent from the TF-IDF output.
    # Computed before it is used so that it describes the current document,
    # not the previous one.
    words_missing_tfidf = [token_id for token_id, _ in bow if token_id not in scored_ids]

    words.extend(id2word[token_id] for token_id in low_value_words + words_missing_tfidf)

    # Replace the document with a copy that omits every dropped id.
    dropped_ids = set(low_value_words) | set(words_missing_tfidf)
    corpus[doc_index] = [entry for entry in bow if entry[0] not in dropped_ids]

# Generate id2word dictionary

In [None]:
# deprecated since tf-idf implementation
# id2word = corpora.Dictionary(bgtg)

# corpus = []

# for text in bgtg:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# corpus

# LDA Visualization

In [None]:
# Train a 7-topic LDA model on the filtered bag-of-words corpus, using
# id2word to map token ids back to words. random_state is pinned to a fixed
# seed value.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=7,
random_state=100, update_every=0, chunksize=100, passes=10, alpha="auto")

In [None]:
# Switch pyLDAvis to notebook mode so the visualisation is displayed inline.
pyLDAvis.enable_notebook()
# Build the visualisation data from the trained model, the corpus and the
# dictionary. NOTE(review): mds selects the layout method and R presumably
# caps the number of terms listed per topic - confirm against pyLDAvis docs.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
vis