## Set the path to the CSV file, then run the initial imports

In [9]:
# Notebook configuration. NOTE(review): csv_data_path is an absolute, machine-specific path — point it at your own copy of the data before running.
csv_data_path = "/Users/quinton/Documents/Projects/COMET_Prestudy/scripts/collection/formal_scripts/output_data_ocellID_01_2017.csv" # **PLACE CSV FILEPATH HERE**
sep = "\t" # csv delimiter used (read as a global by load_data and write_data)
small_model = False # use smaller ML model for better performance, less accuracy (selects en_core_web_sm instead of en_core_web_trf in lemmatize)

In [10]:
from pathlib import Path
from typing import Iterator

import pandas as pd
import numpy as np

# Gensim imports
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# visualization
import pyLDAvis
import pyLDAvis.gensim_models

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/quinton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Functions to handle data loading and data writing

In [11]:
def load_data(filename, delimiter=None) -> pd.DataFrame:
    """Read a delimited text file into a DataFrame.

    Args:
        filename: Location of the file to read (``str`` or ``Path``).
        delimiter: Column separator. When ``None`` (the default) the
            notebook-level ``sep`` setting is used, so existing
            ``load_data(path)`` calls behave as before.

    Returns:
        The parsed file as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``filename`` is not an existing regular file.
    """
    filename = Path(filename)
    if not filename.is_file():
        # Name the offending path; a bare FileNotFoundError is hard to debug.
        raise FileNotFoundError(f"CSV file not found: {filename}")

    if delimiter is None:
        delimiter = sep  # fall back to the notebook configuration cell

    # pandas opens and closes the file itself; no manual file handle needed.
    return pd.read_csv(filename, sep=delimiter, encoding='utf-8')

def load_data_by_year(directory: Path) -> Iterator[pd.DataFrame]:
    """Lazily load every ``*.csv`` file located directly inside ``directory``.

    Args:
        directory: Folder to scan (``str`` or ``Path``). Sub-folders are not
            searched.

    Yields:
        One DataFrame per matching file, produced by ``load_data``. Files are
        visited in sorted path order so repeated runs see the same sequence.
    """
    # Path.glob is an instance method that takes the pattern as its argument;
    # the pattern must not be joined onto the directory path.
    for data_file in sorted(Path(directory).glob('*.csv')):
        yield load_data(data_file.resolve())

def write_data(filename: Path, data: pd.DataFrame, delimiter=None):
    """Write ``data`` to ``filename`` as UTF-8 delimited text.

    Args:
        filename: Destination file (``str`` or ``Path``). An existing file is
            overwritten.
        data: The DataFrame to serialise. Its index is written as the first
            column (pandas' default), as before.
        delimiter: Column separator. When ``None`` (the default) the
            notebook-level ``sep`` setting is used, so existing
            ``write_data(path, df)`` calls behave as before.
    """
    if delimiter is None:
        delimiter = sep  # fall back to the notebook configuration cell

    # Hand pandas the filesystem path. The previous ``filename.as_uri()``
    # produced a "file:///..." string, which open() cannot use as a path.
    data.to_csv(Path(filename), sep=delimiter, encoding='utf-8')


In [12]:
# obtain the stop words
stopwords = stopwords.words("english")
stopwords.append(['so', 'just', 'maybe'])
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
# Load the configured CSV and keep only its 'content' column (a Series);
# later cells treat each entry of this Series as one post.
data = load_data(csv_data_path)['content']
data.head()

0    Welcome to our very new community forum.  | “W...
1    Option1:   Sign up for an account at https://O...
2    Hi, I checked around, and I can’t seem to find...
3    Hi, First of all, thank you for providing such...
4    I came across the Mozilla Location Service her...
Name: content, dtype: object

# Content Preprocessing

In [14]:
def lemmatize(posts: pd.DataFrame, allowed_post_tags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each post to the lemmas of its content-bearing words.

    A spaCy pipeline is loaded on every call: ``en_core_web_sm`` when the
    notebook-level ``small_model`` flag is true, otherwise ``en_core_web_trf``.
    The parser and NER components are disabled.

    Args:
        posts: Iterable of post texts. Despite the annotation, the notebook
            passes a Series of strings; each item is fed to spaCy as-is.
        allowed_post_tags: Coarse part-of-speech tags (``token.pos_``) to keep.

    Returns:
        A list with one string per post: the lemmas of the retained tokens,
        joined by single spaces, in their original order.
    """
    model_name = "en_core_web_sm" if small_model else "en_core_web_trf"
    nlp = spacy.load(model_name, disable=["parser", 'ner'])

    def _lemmas_of(text):
        # Keep only tokens whose POS tag is allowed, replaced by their lemma.
        kept = (token.lemma_ for token in nlp(text) if token.pos_ in allowed_post_tags)
        return ' '.join(kept)

    return [_lemmas_of(post) for post in posts]

# Cast to str first so every entry handed to spaCy is a string.
lemmatized_data = lemmatize(data.astype(str))
lemmatized_data



['welcome very new community forum ’ good way begin say well introduce anonymous lead here exciting thing currently work say just want here as long ’re nice other',
 'option1 sign account directly download file option sign account sign dashboard use email token go download tab dashboard click download link download csv database edit ’re get rate_limtied error allow only download account day prevent abuse see error only able try again tomorrow use proper download manager/ browser',
 'check around seem find documentation different column datum mean ’m refer complete dataset available download gb e.g. range mcc sample let know thing document thank',
 'first thank provide such abundant amount datum worldwide curious way identify carrier use cell datum set download thank so much good',
 'come here say ’ build ’m confused dataset different related question ’m unclear use service identify device location doc indicate use',
 'know love know lot change change like other like have trouble improv

In [15]:
def tokenize_lemma(posts: list):
    """Split every lemmatized post into a list of word tokens.

    Args:
        posts: Iterable of strings, one per post.

    Returns:
        A list of token lists in the same order as ``posts``; each post is
        tokenized with gensim's ``simple_preprocess`` using ``deacc=True``.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in posts]

tokenized_data = tokenize_lemma(lemmatized_data)
# Drop stop words from every post.
# NOTE: `data` is rebound here — it was the raw 'content' Series and becomes a
# list of token lists from this cell onward.
data = [[word for word in phrase if word not in stopwords] for phrase in tokenized_data]
data

[['welcome',
  'new',
  'community',
  'forum',
  'good',
  'way',
  'begin',
  'say',
  'well',
  'introduce',
  'anonymous',
  'lead',
  'exciting',
  'thing',
  'currently',
  'work',
  'say',
  'want',
  'long',
  'nice'],
 ['option',
  'sign',
  'account',
  'directly',
  'download',
  'file',
  'option',
  'sign',
  'account',
  'sign',
  'dashboard',
  'use',
  'email',
  'token',
  'go',
  'download',
  'tab',
  'dashboard',
  'click',
  'download',
  'link',
  'download',
  'csv',
  'database',
  'edit',
  'get',
  'rate_limtied',
  'error',
  'allow',
  'download',
  'account',
  'day',
  'prevent',
  'abuse',
  'see',
  'error',
  'able',
  'try',
  'tomorrow',
  'use',
  'proper',
  'download',
  'manager',
  'browser'],
 ['check',
  'around',
  'seem',
  'find',
  'documentation',
  'different',
  'column',
  'datum',
  'mean',
  'refer',
  'complete',
  'dataset',
  'available',
  'download',
  'gb',
  'range',
  'mcc',
  'sample',
  'let',
  'know',
  'thing',
  'documen

# Generate Bigrams/Trigrams

In [16]:

# Fit the phrase detectors: the bigram model on the token lists, and the
# trigram model on those same lists after the bigram model has been applied.
bigrams_phrases = gensim.models.Phrases(data, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigrams_phrases[data], threshold=100)

# Phraser wrappers around the fitted models; these are the objects the
# make_bigrams / make_trigrams helpers index into.
bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Return a lazy generator that applies the module-level ``bigram`` model to each doc in ``texts``.

    The result is a one-shot generator: it can be iterated only once.
    """
    return (bigram[doc] for doc in texts)

def make_trigrams(texts):
    """Return a lazy generator that applies ``bigram`` and then ``trigram`` to each doc in ``texts``.

    The bigram model is applied inside this function, so ``texts`` is expected
    to be the plain token lists rather than output that already went through
    ``make_bigrams``. The result is a one-shot generator.
    """
    return (trigram[bigram[doc]] for doc in texts)

# Materialise the bigram pass so `data_bigrams` is still usable after this
# cell; as a bare generator it would be exhausted by the trigram pass below.
data_bigrams = list(make_bigrams(data))
# `data_bigrams` already has bigrams merged, so only the trigram model is
# applied here. Going through make_trigrams would run the bigram model a second
# time, which is not how the trigram model's training input was produced.
data_bigrams_trigrams = (trigram[doc] for doc in data_bigrams)
bgtg = list(data_bigrams_trigrams)
bgtg

[['welcome',
  'new',
  'community',
  'forum',
  'good',
  'way',
  'begin',
  'say',
  'well',
  'introduce',
  'anonymous',
  'lead',
  'exciting',
  'thing',
  'currently',
  'work',
  'say',
  'want',
  'long',
  'nice'],
 ['option',
  'sign',
  'account',
  'directly',
  'download',
  'file',
  'option',
  'sign',
  'account',
  'sign',
  'dashboard',
  'use',
  'email',
  'token',
  'go',
  'download',
  'tab',
  'dashboard',
  'click',
  'download',
  'link',
  'download',
  'csv',
  'database',
  'edit',
  'get',
  'rate_limtied',
  'error',
  'allow',
  'download',
  'account',
  'day',
  'prevent',
  'abuse',
  'see',
  'error',
  'able',
  'try',
  'tomorrow',
  'use',
  'proper',
  'download',
  'manager',
  'browser'],
 ['check',
  'around',
  'seem',
  'find',
  'documentation',
  'different',
  'column',
  'datum',
  'mean',
  'refer',
  'complete',
  'dataset',
  'available',
  'download',
  'gb',
  'range',
  'mcc',
  'sample',
  'let',
  'know',
  'thing',
  'documen

# Remove uninformative common words with TF-IDF

In [17]:
from gensim.models import TfidfModel

# Build the dictionary and bag-of-words corpus from the phrase-merged token lists.
id2word = corpora.Dictionary(bgtg)
corpus = [id2word.doc2bow(text) for text in bgtg]

tfidf = TfidfModel(corpus=corpus, id2word=id2word)

low_value = 0.03  # terms whose TF-IDF weight in a document is below this are dropped from it
words = []        # every dropped term (as text), in document order, for inspection

filtered_corpus = []
for bow in corpus:
    doc_tfidf = tfidf[bow]  # computed once per document
    scored_ids = {term_id for term_id, _ in doc_tfidf}

    # Terms scored by TF-IDF but below the threshold.
    low_value_words = [term_id for term_id, weight in doc_tfidf if weight < low_value]
    # Terms present in the document for which the model returned no weight at all.
    # This is computed BEFORE recording the drops, so `words` reflects the current
    # document rather than the previous one.
    words_missing_tfidf = [term_id for term_id, _ in bow if term_id not in scored_ids]

    words.extend(id2word[term_id] for term_id in low_value_words + words_missing_tfidf)

    dropped_ids = set(low_value_words) | set(words_missing_tfidf)
    filtered_corpus.append(
        [(term_id, count) for term_id, count in bow if term_id not in dropped_ids]
    )

# Downstream cells read the filtered bag-of-words under the name `corpus`.
corpus = filtered_corpus

# Generate id2word dictionary

In [18]:
# Superseded: the TF-IDF cell above already builds `id2word` and `corpus`, so this cell is kept only for reference.
# id2word = corpora.Dictionary(bgtg)

# corpus = []

# for text in bgtg:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# corpus

# LDA Visualization

In [19]:
# Train a 10-topic LDA model on the TF-IDF-filtered corpus.
# random_state is fixed so repeated runs use the same seed; alpha="auto" asks
# gensim to learn the document-topic prior from the corpus.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=10,
random_state=100, update_every=1, chunksize=100, passes=10, alpha="auto")

In [20]:
# Render the interactive pyLDAvis view of the trained model inline.
pyLDAvis.enable_notebook()
# mds='mmds' selects the layout method for the inter-topic map; R=30 is passed
# as the number of terms shown per topic.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
vis

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  default_term_info = default_term_info.sort_values(


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
