## Set the path to the input CSV file and run the initial imports

In [8]:
# Notebook configuration: input file, CSV delimiter and spaCy model size.
# NOTE(review): the path below is absolute and machine-specific; replace it before running elsewhere.
csv_data_path = "/Users/quinton/Documents/Projects/COMET_Prestudy/scripts/collection/formal_scripts/output_data_mlab_01_2017.csv" # **PLACE CSV FILEPATH HERE**
sep = "\t" # delimiter passed to pandas when reading/writing the CSV (a tab character)
small_model = True # True loads spaCy "en_core_web_sm", False loads "en_core_web_trf": smaller model for better performance, less accuracy

In [9]:
from pathlib import Path
from pprint import pprint
from typing import Iterator

import numpy as np
import pandas as pd

# Gensim imports
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# visualization
import pyLDAvis
import pyLDAvis.gensim_models

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/quinton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Functions to handle data loading and data writing

In [10]:
def load_data(filename) -> pd.DataFrame:
    """Read a delimited text file into a DataFrame.

    The field delimiter is taken from the notebook-level ``sep`` setting.

    Args:
        filename: Path (``str`` or ``Path``) of the file to read.

    Returns:
        The parsed file as a ``pd.DataFrame``.

    Raises:
        FileNotFoundError: If ``filename`` does not point to an existing
            regular file. The message names the offending path.
    """
    path = Path(filename)
    if not path.is_file():
        # Include the path so a wrong configuration is easy to diagnose.
        raise FileNotFoundError(f"CSV file not found: {path}")

    with open(path.resolve(), 'r', encoding='utf-8') as file:
        return pd.read_csv(file, sep=sep)

def load_data_by_year(directory: Path) -> Iterator[pd.DataFrame]:
    """Yield one DataFrame per ``*.csv`` file found directly inside ``directory``.

    Files are visited in sorted path order so repeated runs produce the same
    sequence. Each file is read with ``load_data``.

    Args:
        directory: Folder to scan (``Path`` or ``str``); sub-folders are not searched.

    Yields:
        The DataFrame loaded from each matching file.
    """
    # The pattern must be passed to glob() on the directory itself; the
    # previous form called Path.glob without a pattern and raised TypeError.
    for data_file in sorted(Path(directory).glob('*.csv')):
        yield load_data(data_file.resolve())

def write_data(filename: Path, data: pd.DataFrame):
    """Write ``data`` to ``filename`` as delimited text.

    The field delimiter is taken from the notebook-level ``sep`` setting. The
    DataFrame index is written as the first column (the ``to_csv`` default).

    Args:
        filename: Destination file (``Path`` or ``str``); overwritten if it exists.
        data: DataFrame to serialise.
    """
    # Open the filesystem path itself: ``as_uri()`` yields a ``file://`` URI
    # string, which ``open`` cannot use. ``newline=''`` is what pandas asks for
    # when it is handed an already-open text file.
    with open(Path(filename), 'w', encoding='utf-8', newline='') as f:
        data.to_csv(f, sep=sep)


In [11]:
# obtain the stop words
# Import the NLTK corpus under an alias: this cell rebinds the name `stopwords`
# to a plain list, so calling `stopwords.words(...)` on a second run of the
# cell would fail with AttributeError. The alias keeps the cell re-runnable.
from nltk.corpus import stopwords as nltk_stopwords

# NLTK's English stop words plus filler words specific to this corpus.
stopwords = nltk_stopwords.words("english")
stopwords = stopwords + ['thank', 'well', 'then', 'go', 'way', 'also']
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
# Load the CSV and keep only its 'content' column (a pandas Series).
# NOTE: the name `data` is rebound to a list of token lists in a later cell.
data = load_data(csv_data_path)['content']
data.head()

0    Hi,  I'm looking for monthly average mobile do...
1    The latest ookla speedtest app (just for apple...
2    We had some unusual results come back today fr...
3    Hi there,  I am working with a group that is i...
4    Hi, It looks like the -server option in ndt7-c...
Name: content, dtype: object

# Content Preprocessing

In [13]:
def lemmatize(posts: pd.DataFrame, allowed_post_tags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each post to the space-joined lemmas of its allowed parts of speech.

    A spaCy pipeline is loaded on every call: "en_core_web_sm" when the
    notebook-level ``small_model`` flag is truthy, otherwise "en_core_web_trf".
    The "parser" and "ner" components are disabled in both cases.

    Args:
        posts: Iterable of text strings, one per post. The annotation says
            ``pd.DataFrame``, but each iterated item is handed to the pipeline
            as a single text.
        allowed_post_tags: ``token.pos_`` values to keep. The default list is
            only read, never modified.

    Returns:
        A list with one string per post, in input order.
    """
    model_name = "en_core_web_sm" if small_model else "en_core_web_trf"
    nlp = spacy.load(model_name, disable=["parser", 'ner'])
    return [
        ' '.join(token.lemma_ for token in nlp(post) if token.pos_ in allowed_post_tags)
        for post in posts
    ]

# Cast every entry to str before lemmatizing (presumably so missing/non-text values do not reach spaCy; confirm).
lemmatized_data = lemmatize(data.astype(str))
# NOTE: the bare expression below displays the entire list of lemmatized posts.
lemmatized_data

['look monthly average mobile download speed datum country specifcally onwards early well though so far only able find datum help really greatful advice thank advance',
 'late ookla speedtest app just apple io presently finally track work latency responsiveness try pende',
 'unusual result come back today test pod extremely low chance way see pod hiccup today way see pod up down',
 'there work group interested use monitor performance various isp requirement project use tool follow base standard find so far involve find specifically mention follow base standard help provide more information link topic thank',
 'look option ndt7 client go work run test use default setting ndt7 client client$ go run download progress download complete upload progress s upload complete ndt-mlab3-dfw02.mlab-oti.measurement-lab.org e62d s % however get handshake error try run again use same server ~/godir ndt7 client client$ go run ndt mlab3 oti.measurement lab.org download fail websocket bad handshake downl

In [14]:
def tokenize_lemma(posts: list):
    """Tokenize each post with gensim's ``simple_preprocess``.

    Args:
        posts: List of text strings (here, the lemmatized posts).

    Returns:
        A list of token lists, one per post, in input order. Tokens are
        produced with ``deacc=True``.
    """
    return [gensim.utils.simple_preprocess(post, deacc=True) for post in posts]

# Split every lemmatized post into a list of tokens.
tokenized_data = tokenize_lemma(lemmatized_data)

# Drop stop words from each token list.
# NOTE: this rebinds `data` (previously the 'content' Series) to a list of token lists.
data = [[word for word in phrase if word not in stopwords] for phrase in tokenized_data]
data

[['look',
  'monthly',
  'average',
  'mobile',
  'download',
  'speed',
  'datum',
  'country',
  'specifcally',
  'onwards',
  'early',
  'though',
  'far',
  'able',
  'find',
  'datum',
  'help',
  'really',
  'greatful',
  'advice',
  'advance'],
 ['late',
  'ookla',
  'speedtest',
  'app',
  'apple',
  'io',
  'presently',
  'finally',
  'track',
  'work',
  'latency',
  'responsiveness',
  'try',
  'pende'],
 ['unusual',
  'result',
  'come',
  'back',
  'today',
  'test',
  'pod',
  'extremely',
  'low',
  'chance',
  'see',
  'pod',
  'hiccup',
  'today',
  'see',
  'pod'],
 ['work',
  'group',
  'interested',
  'use',
  'monitor',
  'performance',
  'various',
  'isp',
  'requirement',
  'project',
  'use',
  'tool',
  'follow',
  'base',
  'standard',
  'find',
  'far',
  'involve',
  'find',
  'specifically',
  'mention',
  'follow',
  'base',
  'standard',
  'help',
  'provide',
  'information',
  'link',
  'topic'],
 ['look',
  'option',
  'ndt',
  'client',
  'work',
  '

# Generate bigrams/trigrams 

In [15]:
# Train two phrase models: one on the token lists, and a second on the same
# corpus after the first model has been applied, so that it can extend detected
# pairs into longer phrases. The second model uses the default min_count.
bigrams_phrases = gensim.models.Phrases(data, min_count=5, threshold=100)
trigram_phrases = gensim.models.Phrases(bigrams_phrases[data], threshold=100)

# Wrap the trained models in Phraser objects, which are what the helpers below index into.
bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Return a lazy generator that applies the ``bigram`` model to each token list in ``texts``."""
    return (bigram[tokens] for tokens in texts)

def make_trigrams(texts):
    """Return a lazy generator that applies ``bigram`` and then ``trigram`` to each token list in ``texts``."""
    return (trigram[bigram[tokens]] for tokens in texts)

# Both helpers return lazy generators; nothing is computed until list() consumes them.
# NOTE(review): make_trigrams applies `bigram` itself, so passing it data_bigrams
# runs the bigram model twice on every document; confirm this is intended.
data_bigrams = make_bigrams(data)
data_bigrams_trigrams = make_trigrams(data_bigrams)
# Materialise the phrase-merged documents as a list of token lists.
bgtg = list(data_bigrams_trigrams)
bgtg

[['look',
  'monthly',
  'average',
  'mobile',
  'download',
  'speed',
  'datum',
  'country',
  'specifcally',
  'onwards',
  'early',
  'though',
  'far',
  'able',
  'find',
  'datum',
  'help',
  'really',
  'greatful',
  'advice',
  'advance'],
 ['late',
  'ookla',
  'speedtest',
  'app',
  'apple',
  'io',
  'presently',
  'finally',
  'track',
  'work',
  'latency',
  'responsiveness',
  'try',
  'pende'],
 ['unusual',
  'result',
  'come',
  'back',
  'today',
  'test',
  'pod',
  'extremely',
  'low',
  'chance',
  'see',
  'pod',
  'hiccup',
  'today',
  'see',
  'pod'],
 ['work',
  'group',
  'interested',
  'use',
  'monitor',
  'performance',
  'various',
  'isp',
  'requirement',
  'project',
  'use',
  'tool',
  'follow',
  'base',
  'standard',
  'find',
  'far',
  'involve',
  'find',
  'specifically',
  'mention',
  'follow',
  'base',
  'standard',
  'help',
  'provide',
  'information',
  'link',
  'topic'],
 ['look',
  'option',
  'ndt',
  'client',
  'work',
  '

## Remove uninformative common words with TF-IDF

In [16]:
from gensim.models import TfidfModel

# Build the vocabulary and the bag-of-words corpus from the phrase-merged documents.
id2word = corpora.Dictionary(bgtg)
corpus = [id2word.doc2bow(text) for text in bgtg]

tfidf = TfidfModel(corpus=corpus, id2word=id2word)

# A term is removed from a document when its TF-IDF weight there is below this threshold.
low_value = 0.03
# Log of removed terms: one entry per (document, term) removal.
words = []
words_missing_tfidf = []

filtered_corpus = []
for bow in corpus:
    weights = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in weights}
    low_value_words = [term_id for term_id, weight in weights if weight < low_value]
    # Ids present in the bag-of-words for which the TF-IDF model returned no
    # entry at all (presumably zero-weight terms; confirm against gensim docs).
    # Computed BEFORE it is used, so the log below describes this document
    # rather than the previous one.
    words_missing_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    drop_ids = set(drops)
    filtered_corpus.append([entry for entry in bow if entry[0] not in drop_ids])

# Downstream cells read the filtered bag-of-words under the name `corpus`.
corpus = filtered_corpus

# Generate id2word dictionary

In [17]:
# deprecated since tf-idf implementation
# id2word = corpora.Dictionary(bgtg)

# corpus = []

# for text in bgtg:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# corpus

# LDA Visualization

In [18]:
# Train a 7-topic LDA model on `corpus` with the `id2word` vocabulary.
# random_state=100 fixes the seed for repeatable runs; passes=10 iterates over
# the corpus ten times. Per the gensim docs, update_every=0 selects batch
# learning and alpha="auto" learns an asymmetric prior from the corpus.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=7,
random_state=100, update_every=0, chunksize=100, passes=10, alpha="auto")

In [19]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic view. Per the pyLDAvis docs, mds='mmds' selects
# the scaling method used to place topics in 2-D, and R=30 is the number of
# terms shown per topic.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
