## Topic modeling

#### NOTE:
This notebook is a rough draft.

In [6]:
# load the labelled climate text data from the project's data directory
import pandas as pd

climate_csv = '../data/Climate_Labelled.csv'
df_climate = pd.read_csv(climate_csv)

# last expression of the cell: rich display of the frame
df_climate

Unnamed: 0,text,youth_label
0,We’re doing EXACY:Y what Greta told us not to do.,1
1,"Not exactly related to the Climate Crisis, but...",0
2,"Whether you're an adult or not, you can [lobby...",1
3,It’s 3:48 am and all I can think is I gotta wa...,1
4,Worth noting that Person of the Year isn't rea...,0
...,...,...
31938,TLDR:\n\nAmerica Finally Has an Answer to the ...,0
31939,I like my EV for its 0-60 in less than 3.5sec....,0
31940,"I want affordable, reliable and easy to repair...",1
31941,"Great news! Will get an EV, to save the planet...",1


## Topic modeling

In [7]:
# imports

import warnings
warnings.filterwarnings('ignore')

from selenium import webdriver
from bs4 import BeautifulSoup
import requests, os, sys, json, csv, copy, operator
from collections import Counter

import pandas as pd
import numpy as np
import pickle, time
import seaborn as sns
from scipy.io import savemat, loadmat
from matplotlib import pyplot as plt
%matplotlib inline

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # ???
from sklearn.metrics import silhouette_score
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

import gensim
from gensim.utils import simple_preprocess
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import jaccard
from gensim.matutils import jensen_shannon

import pyLDAvis
from pyLDAvis import gensim as pyldagensim
from pyLDAvis import sklearn as pyldavis_sklearn

import networkx as nx
from networkx.algorithms import shortest_path

# add scripts directory to path
sys.path.insert(1, '../scripts/')


[nltk_data] Downloading package wordnet to /home/amandae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
import pickle
from selenium import webdriver
import time
import requests, re
import numpy as np
import json
from bs4 import BeautifulSoup
from scipy.io import loadmat, savemat
from sklearn.decomposition import LatentDirichletAllocation
from gensim.matutils import jensen_shannon
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import gensim

def isEnglish(s):
    """
    Return True if every character in a string is an ASCII character,
    else False.

    Despite the name, this is an ASCII check rather than a language check:
    accented letters, curly quotes, emoji, etc. all make the result False.

    Parameters
    ----------
    s : string

    Output
    ------
    y : bool
    True if the string contains only ASCII characters (the empty
    string counts as ASCII), False otherwise
    """
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeError:
        # UnicodeError is the common parent of both failure modes:
        # UnicodeDecodeError for ordinary non-ASCII text, and
        # UnicodeEncodeError for strings holding lone surrogates, which
        # cannot be encoded to UTF-8 at all and are certainly not ASCII.
        return False
    return True

def fix_text(txt):
    """
    Fixes a string by replacing or removing its non-ASCII characters.

    For each non-ASCII character, the character that follows it in the
    input decides what happens to it:

    * followed by 's'   -> replaced with an apostrophe (e.g. a curly
      quote in "it’s" becomes "it's")
    * followed by ' '   -> replaced with '-', but only while the text
      being built contains no apostrophe
    * anything else (including being the last character) -> removed

    The replacement is applied to every occurrence of that character in
    the text, so the first occurrence of a character determines the
    treatment of all its occurrences.

    Parameters
    ----------
    txt : string
    The string of text to be fixed

    Output
    ------
    txt : string
    The fixed string of text (returned unchanged if it is already
    pure ASCII)
    """
    if isEnglish(txt):
        return txt

    # Neighbour look-ups must use the untouched input. Looking them up in
    # the string being edited goes wrong as soon as a character has been
    # removed, because every later index is then shifted by one (e.g. the
    # "’" in "naïve it’s" would be dropped instead of becoming "'").
    source = txt
    last = len(source) - 1
    for i, s in enumerate(source):
        if isEnglish(s):
            continue
        following = source[i+1] if i < last else None
        if following == 's':
            replacement = "'"
        elif following == ' ' and "'" not in txt:
            # the apostrophe check deliberately looks at the text as edited
            # so far, so apostrophes inserted above also count
            replacement = '-'
        else:
            replacement = ''
        # no-op when an earlier occurrence of s was already handled
        txt = txt.replace(s, replacement)
    return txt


def get_jensen_shannon(components, ntopics):
    """
    Summarise the pairwise Jensen-Shannon values between topic rows.

    Parameters
    ----------
    components : 2-D array
    One row per topic; rows are handed to gensim's ``jensen_shannon``
    exactly as given.
    NOTE(review): when this receives sklearn's raw ``components_`` the rows
    are not normalised to sum to 1 -- confirm that is intended.

    ntopics : int
    Number of leading rows of ``components`` to compare

    Output
    ------
    (minimum, mean) of the value over every unordered pair of topics.
    np.min raises ValueError when there are fewer than two topics.
    """
    # every pair (a, b) with a > b, visited in the same order as a nested
    # loop over all (a, b) that skips a <= b
    pair_dists = [
        jensen_shannon(components[a, :], components[b, :])
        for a in range(ntopics)
        for b in range(a)
    ]
    return np.min(pair_dists), np.mean(pair_dists)

def get_jaccard(components, ntopics, top_fraction=0.10):
    """
    Summarise the pairwise Jaccard distances between the top terms of topics.

    For every topic the highest-weighted terms are selected, and gensim's
    ``jaccard`` is computed between the term-index sets of each pair of
    topics.

    Parameters
    ----------
    components : 2-D array
    One row per topic, one column per vocabulary term (higher value =
    more important term)

    ntopics : int
    Number of leading rows of ``components`` to compare

    top_fraction : float, optional
    Fraction of the vocabulary kept as each topic's "top terms"
    (default 0.10, i.e. the top 10%)

    Output
    ------
    (minimum, mean) of the distance over every unordered pair of topics.
    np.min raises ValueError when there are fewer than two topics.
    """
    # Size the top-term set from the matrix being analysed. The previous
    # version read len(dictionary) from a notebook global, which (a) fails
    # if that cell has not run yet and (b) measures a different vocabulary
    # than the columns of `components`.
    vocab_size = components.shape[1]
    topn = int(np.ceil(vocab_size * top_fraction))

    # column indices of each topic's terms, best first, truncated to topn
    top_terms = np.argsort(-1*components, axis=1)[:, :topn]

    jdists = [
        jaccard(top_terms[i, :], top_terms[j, :])
        for i in range(ntopics)
        for j in range(i)
    ]
    return np.min(jdists), np.mean(jdists)

class LDAwithCustomScore(LatentDirichletAllocation):
    """LatentDirichletAllocation whose ``score`` measures topic separation."""

    def score(self, X, y=None):
        """
        Return the smallest pairwise Jensen-Shannon value between the fitted
        topics, as computed by ``get_jensen_shannon``.

        ``X`` and ``y`` are accepted for API compatibility but are not used;
        the value depends only on the fitted ``components_``.
        """
        min_js, _mean_js = get_jensen_shannon(self.components_, self.n_components)
        return min_js

def get_stopwords():
    """Return gensim's built-in STOPWORDS collection as a new list."""
    # project-specific stop words are not added yet (placeholder kept):
    #stopwords.extend([])
    return list(gensim.parsing.preprocessing.STOPWORDS)

def lemmatize_stemming(text):
    """
    Lemmatize a single token as a verb with WordNet, then stem the lemma
    with the English Snowball stemmer, and return the result.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return SnowballStemmer("english").stem(lemma)

# Tokenize and lemmatize
def preprocess(text, stopwords):
    """
    Tokenize a text and reduce the kept tokens to their stems.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept
    only if it is longer than 3 characters and neither the token itself
    nor its stem (see ``lemmatize_stemming``) is a stop word.

    Parameters
    ----------
    text : string
    The raw text to process

    stopwords : iterable of strings
    Words to exclude

    Output
    ------
    result : list of strings
    The stems of the kept tokens, in order of appearance

    stem_dict : list of (stem, token) tuples
    For each kept token, its stem paired with the original token
    """
    # set membership is O(1); a list would be scanned for every token
    if isinstance(stopwords, (set, frozenset)):
        stopset = stopwords
    else:
        stopset = set(stopwords)

    result = []
    stem_dict = []
    for token in gensim.utils.simple_preprocess(text):
        # cheap checks first, so stemming only runs for candidate tokens
        if len(token) <= 3 or token in stopset:
            continue
        # stem once and reuse (previously recomputed up to three times)
        stem = lemmatize_stemming(token)
        if stem in stopset:
            continue
        result.append(stem)
        stem_dict.append((stem, token))

    return result, stem_dict


In [9]:
# preprocess every text; only the stemmed token lists are kept
# (the (stem, token) pairs returned alongside them are not used here)
stopwords = list(gensim.parsing.preprocessing.STOPWORDS)
processed_info = [
    preprocess(raw_text, stopwords)[0]
    for raw_text in df_climate['text'].values
]

In [10]:
# bag of words: build the gensim dictionary from the token lists,
# then encode each document with doc2bow
dictionary = gensim.corpora.Dictionary(processed_info)
bow_corpus = list(map(dictionary.doc2bow, processed_info))

In [12]:
# get stopwords
stopwords = get_stopwords()

# document-term counts over unigrams of the already-stemmed tokens;
# terms in more than 25% of documents or in fewer than 10 documents are dropped
countvec = CountVectorizer(ngram_range=(1,1), stop_words=stopwords, max_df=.25, min_df=10)
clean_text = [' '.join(text) for text in processed_info]

# Keep the document-term matrix sparse. Calling .todense() produced an
# np.matrix, which scikit-learn >= 1.2 rejects as estimator input and which
# needs documents x vocabulary cells of memory. LDA, GridSearchCV and
# pyLDAvis all accept the sparse matrix directly.
X = countvec.fit_transform(clean_text)

# total count of each term over the corpus, as a (n_terms, 1) column
wft = X.sum(axis=0).T

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement when available and fall back on older installations
if hasattr(countvec, 'get_feature_names_out'):
    allterms = list(countvec.get_feature_names_out())
else:
    allterms = countvec.get_feature_names()

In [13]:
# try different numbers of topics
myseed = 1
ntopics_list = [3, 5, 10]

# settings shared by every candidate model
lda_settings = dict(
    random_state=myseed,
    learning_decay=.7,
    doc_topic_prior=None,
    topic_word_prior=None,
)

scores = []
for ntopics in ntopics_list:
    print(ntopics)
    lda_model = LDAwithCustomScore(n_components=ntopics, **lda_settings)
    lda_model.fit_transform(X)
    scores += [lda_model.score(X)]

# note: after the loop, lda_model is the model fitted for the last entry
# of ntopics_list
print(scores)

5
[0.22530371935780366]


In [None]:
# grid search CV over the number of topics

# Define Search Param: candidate values for n_components
ntopics_list = [3, 5, 10]
search_params = dict(n_components=ntopics_list)

lda = LDAwithCustomScore(random_state=0)
model = GridSearchCV(estimator=lda, param_grid=search_params, cv=5)
model.fit(X)

# Best Model
best_lda_model = model.best_estimator_

# optional: persist the fitted search object
#file = open('../data/sklearn_model_cv_test.pkl', 'wb')
#pickle.dump(model,file)
#file.close()

In [15]:
# prepare data for pyldavis from the fitted model, the document-term
# matrix and the vectorizer that produced it
data = pyldavis_sklearn.prepare(
    lda_model,
    X,
    countvec,
    R=20,
    mds='tsne',
    sort_topics=False,
)

In [16]:
# Show the prepared pyLDAvis data inline in the notebook.
# Uncomment the next line to also write it to an HTML file:
#pyLDAvis.save_html(data,'khp_topics_viz1.html')
# enable_notebook() is called before display() so the result renders in the cell output
pyLDAvis.enable_notebook()
pyLDAvis.display(data)