In [None]:
# nltk
import nltk
nltk.download('stopwords')
import sqlite3, pandas as pd

def load_sql(db_name, tbl_name):
    """Load an entire table from a SQLite database file into a DataFrame.

    Args:
        db_name: Stem of the database file; the file opened is
            ``database/<db_name>.db`` relative to the current working directory.
        tbl_name: Name of the table to read. It is interpolated directly into
            the SQL text, so it must come from trusted code, never from
            user-supplied input.

    Returns:
        pandas.DataFrame: the result of ``SELECT * FROM <tbl_name>``.
    """
    con = sqlite3.connect(f"database/{db_name}.db")
    try:
        return pd.read_sql(f"SELECT * FROM {tbl_name}", con)
    finally:
        # Release the connection even when the query raises (e.g. the table
        # does not exist); otherwise the handle leaks for the kernel's lifetime.
        con.close()

In [None]:
import numpy as np
import json, glob

# Gensim
import gensim, gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models

# spaCy
import spacy
from nltk.corpus import stopwords

# visualisation
import pyLDAvis

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Bind the NLTK corpus reader to its own name before rebinding `stopwords` to
# the English word list. Calling `.words()` on `stopwords` itself only works
# once: after the first run `stopwords` is a list, so re-running the cell
# would raise AttributeError.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('english')

In [None]:
from nltk.tokenize import casual_tokenize
from collections import Counter

# Read the whole `tweets_v7` table out of database/tweets_v7.db.
df = load_sql(db_name='tweets_v7', tbl_name='tweets_v7')

In [None]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each text, keeping only tokens with an allowed part of speech.

    Args:
        texts: Iterable of raw text strings (for example a DataFrame column).
        allowed_postags: Container of coarse POS tags; a token is kept when its
            ``pos_`` is ``in`` this container. The default is an immutable
            tuple so the shared default can never be mutated between calls.

    Returns:
        list[str]: one string per input text, in input order, holding the
        space-joined lemmas of the retained tokens (``''`` when none remain).
    """
    # The parser and NER components are disabled: they are the computationally
    # expensive parts of the pipeline and their output is not used here.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # nlp.pipe streams the texts through the pipeline in batches and yields the
    # docs in input order, avoiding one full pipeline invocation per text.
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]

# Lemmatize the `text` column of the loaded frame with the default POS filter.
lemmatized_texts = lemmatization(texts=df['text'])

In [None]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    Each text is passed through ``simple_preprocess(..., deacc=True)`` and the
    resulting token lists are returned in input order, one list per text.
    """
    return [simple_preprocess(document, deacc=True) for document in texts]

# Turn each lemmatized string into its list of tokens.
data_words = gen_words(texts=lemmatized_texts)

In [None]:
# ----------------------------------------------------------------------------
#  BIGRAMS AND TRIGRAMS
# ----------------------------------------------------------------------------

In [None]:
# Phrase detection over the tokenized documents: adjacent tokens that co-occur
# often enough (controlled by `min_count` / `threshold`) are merged into one token.
bigrams_phrases = gensim.models.Phrases(data_words, min_count=3, threshold=50)
# The second model is trained on the bigram-merged documents so it can join an
# already-merged bigram with a neighbouring token. No `min_count` is passed
# here, so the library default applies.
trigram_phrases = gensim.models.Phrases(bigrams_phrases[data_words], threshold=50)

# Phraser wrappers around the trained models; these are what get applied to documents.
bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)


def make_bigrams(texts):
    """Apply the trained bigram model to every token list in ``texts``."""
    return [bigram[doc] for doc in texts]


def make_trigram(texts):
    """Apply the bigram model and then the trigram model to every token list.

    Because the bigram step happens inside this function, ``texts`` should be
    the raw token lists, not documents that were already bigram-merged.
    """
    return [trigram[bigram[doc]] for doc in texts]


data_bigrams = make_bigrams(data_words)
# Fed with the raw tokens: make_trigram performs the bigram pass itself, so
# passing `data_bigrams` would run the bigram model twice and no longer mirror
# how `trigram_phrases` was trained (on a single bigram pass).
data_bigrams_trigrams = make_trigram(data_words)

In [None]:
# ----------------------------------------------------------------------------
#  TF-IDF REMOVAL
# ----------------------------------------------------------------------------

In [76]:
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03             # tokens scoring below this TF-IDF weight are dropped
words = []                   # token strings removed from documents (one entry per removal)
low_value_words = []         # ids dropped from the current document for a low score
words_missing_in_tfidf = []  # ids in the current document with no TF-IDF entry

for i, bow in enumerate(corpus):
    # TF-IDF weight of every token the model returns for this document.
    scores = dict(tfidf[bow])

    low_value_words = [token_id for token_id, value in scores.items() if value < low_value]
    # Tokens present in the bag-of-words but absent from the transformed vector
    # (per the original note, the ones whose TF-IDF score is 0).
    words_missing_in_tfidf = [token_id for token_id, _ in bow if token_id not in scores]

    # Both lists are computed for THIS document before being combined, so the
    # drop list never carries ids over from the previous document.
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)

    drop_ids = set(drops)
    # Replacing an element by index keeps the list length unchanged, so it is
    # safe to do while enumerating `corpus`.
    corpus[i] = [(token_id, count) for token_id, count in bow if token_id not in drop_ids]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]


In [None]:
# id2word = corpora.Dictionary(data_words)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

In [83]:
# Hyper-parameters of the topic model, gathered in one place so they are easy
# to tweak; `random_state` is fixed so repeated runs use the same seed.
lda_kwargs = dict(
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
)

# Fit LDA on the filtered bag-of-words corpus using the dictionary built above.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_kwargs)

In [None]:
# ----------------------------------------------------------------------------
#  VISUALISING DATA
# ----------------------------------------------------------------------------

In [84]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualisation from the fitted model, the corpus
# it was trained on, and the matching dictionary.
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds='mmds',
    R=30,
)

# Last expression of the cell: displays the visualisation.
vis

  default_term_info = default_term_info.sort_values(
