<h1>Modelling</h1>

<h3>Importing Model Ready Data</h3>

In [129]:
import pandas as pd

model_tweets = pd.read_csv('../data/model_ready_data.csv')
model_tweets = model_tweets.fillna("")
model_tweets.head()

Unnamed: 0,sentiment,lang,hashtags,clean_text
0,1,en,,rt telglobalhealth africa is in the midst of a...
1,1,en,,rt globalhlthtwit dr moeti is head of who in a...
2,1,en,red4research,rt nhsrdforum thank you research note for crea...
3,1,en,,rt highwiretalk former pfizer vp and virologis...
4,1,en,,rt peterhotez i think it s important that we d...


In [130]:
model_tweets.shape

(6417, 4)

In [156]:
# 4492 1925
sentiment_analysis_tweet_data = model_tweets.copy(deep=True)
sentiment_analysis_tweet_data.drop(sentiment_analysis_tweet_data[sentiment_analysis_tweet_data['sentiment'] == -1].index, inplace=True)
sentiment_analysis_tweet_data.reset_index(drop=True, inplace=True)
tweet_train = sentiment_analysis_tweet_data.iloc[:4492,]
tweet_test = sentiment_analysis_tweet_data.iloc[4493:,]

<h3>Sentiment Analysis</h3>

In [157]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from sklearn.decomposition import NMF, LatentDirichletAllocation

#### Unigram Counts

In [158]:
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(tweet_train['clean_text'].values)

CountVectorizer()

In [159]:
X_train_unigram = unigram_vectorizer.transform(tweet_train['clean_text'].values)

#### Unigram Tf-Idf

In [160]:
unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X_train_unigram)


TfidfTransformer()

In [161]:
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)

#### Bigram Counts

In [162]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(tweet_train['clean_text'].values)

CountVectorizer(ngram_range=(1, 2))

In [163]:
X_train_bigram = bigram_vectorizer.transform(tweet_train['clean_text'].values)

#### Bigram Tf-Idf

In [164]:
bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)

TfidfTransformer()

In [165]:
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

In [166]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

In [167]:
def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y,train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)

    global_vars = globals()
    if(valid_score > global_vars['best_score']):
        global_vars['best_model'] = clf
        global_vars['best_model_name'] = title
        global_vars['best_score'] = valid_score

    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

In [168]:
y_train = tweet_train['sentiment'].values
y_train

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [169]:
best_model = ""
best_model_name = ""
best_score = 0

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_bigram, y_train, 'Bigram Counts')
train_and_show_scores(X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')

Unigram Counts
Train score: 1.0 ; Validation score: 0.98

Unigram Tf-Idf
Train score: 1.0 ; Validation score: 0.98

Bigram Counts
Train score: 1.0 ; Validation score: 0.98

Bigram Tf-Idf
Train score: 1.0 ; Validation score: 0.98



In [170]:
print(f'The best Model is {best_model_name} with a Validation score of: {round(best_score, 2)}')

The best Model is Unigram Tf-Idf with a Validation score of: 0.98


Testing

In [176]:
def run_test_using_model(best_model: SGDClassifier, model_type: str):
    unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
    unigram_vectorizer.fit(tweet_test['clean_text'].values)
    X_test_unigram = unigram_vectorizer.transform(tweet_test['clean_text'].values)

    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigram_vectorizer.fit(tweet_test['clean_text'].values)
    X_test_bigram = bigram_vectorizer.transform(tweet_test['clean_text'].values)

    y_test = tweet_test['sentiment'].values

    if(model_type == "Unigram Counts"):
        X_test = X_test_unigram

    elif(model_type == "Unigram Tf-Idf"):
        unigram_tf_idf_transformer = TfidfTransformer()
        unigram_tf_idf_transformer.fit(X_test_unigram)
        X_test_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_test_unigram)

        X_test = X_test_unigram_tf_idf

    elif(model_type == "Bigram Counts"):
        X_test = X_test_bigram

    else:
        bigram_tf_idf_transformer = TfidfTransformer()
        bigram_tf_idf_transformer.fit(X_test_bigram)

        X_test_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_test_bigram)
        X_test = X_test_bigram_tf_idf

   
    return best_model.score(X_test,y_test)


In [177]:
# type(tweet_test['clean_text'].values)
run_test_using_model(best_model, best_model_name)

ValueError: empty vocabulary; perhaps the documents only contain stop words

Saving generated Topic LDA Model

In [None]:
joblib.dump(best_model, '../trained_models/sentimentSGDmodel')
# then reload it with
# lda_model = joblib.load('lda_model.jl')

<h3>Topic Modelling</h3>

In [109]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [110]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\milky\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [111]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import joblib
%matplotlib inline

Stopwords Preparation

In [112]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

Loading Dataset

In [113]:
topic_model_data = model_tweets.copy(deep=True)
topic_model_data

Unnamed: 0,sentiment,lang,hashtags,clean_text
0,1,en,,rt telglobalhealth africa is in the midst of a...
1,1,en,,rt globalhlthtwit dr moeti is head of who in a...
2,1,en,red4research,rt nhsrdforum thank you research note for crea...
3,1,en,,rt highwiretalk former pfizer vp and virologis...
4,1,en,,rt peterhotez i think it s important that we d...
...,...,...,...,...
6412,-1,en,,pin code kamala nehru pmcgvaccines covaxin min...
6413,-1,en,covid19,minute interview with the inventor of mrna vac...
6414,-1,en,,rt shawajason liars you tried to load off your...
6415,-1,en,covid19,rt kalainh as of june th t amp t has administe...


Tokenizing words

In [114]:
def get_hastags_words_list():
    hashtagList = []
    for hashtags in topic_model_data.hashtags:
        if(hashtags != ""):
            hashtagList += hashtags.split(',')

    return hashtagList

hashtag = get_hastags_words_list()

data = [word for sentence in topic_model_data.clean_text for word in sentence.split(' ')]

In [115]:
hashtag[:5]

['red4research', 'wecandothis', 'covid19', 'wecandothis', 'cuban']

In [116]:
data[:10]

['rt',
 'telglobalhealth',
 'africa',
 'is',
 'in',
 'the',
 'midst',
 'of',
 'a',
 'full']

In [117]:
data_words = data + hashtag
data_words = [word for word in data_words if word != '']
data_words[:5]

['rt', 'telglobalhealth', 'africa', 'is', 'in']

Creating bigram and trigram models

In [118]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['r', 't']


Removing Stopwords, making bigrams and lemmatization

In [119]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [120]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[[]]


In [121]:
data_lemmatized = [word for word in data_lemmatized if word != []]
data_lemmatized[:5]

[['telglobalhealth'], ['africa'], ['midst'], ['full'], ['blow']]

Create Dictionary and Corpus needed for Topic Modeling

In [122]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus)

], [(64, 1)], [(7, 1)], [(765, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(441, 1)], [(64, 1)], [(64, 1)], [(844, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(7, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(374, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(278, 1)], [(7, 1)], [(844, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(720, 1)], [(441, 1)], [(64, 1)], [(64, 1)], [(194, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(1335, 1)], [(1335, 1)], [(64, 1)], [(2380, 1)], [(7, 1)], [(2381, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(2396, 1)], [(524, 1)], [(64, 1)], [(49, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(92, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(64, 1)], [(

In [123]:
# Readable View
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]]

[[('telglobalhealth', 1)],
 [('africa', 1)],
 [('midst', 1)],
 [('full', 1)],
 [('blow', 1)],
 [('third', 1)],
 [('wave', 1)],
 [('coronavirus', 1)],
 [('head', 1)],
 [('whoafro', 1)]]

Building the topic Model

In [124]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

KeyboardInterrupt: 

In [22]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.088*"yet" + 0.081*"doctor" + 0.048*"roger" + 0.000*"ce" + 0.000*"yard" + '
  '0.000*"baby" + 0.000*"girl" + 0.000*"guinea" + 0.000*"equatorial" + '
  '0.000*"chinaeconomy"'),
 (1,
  '0.642*"africa" + 0.091*"wave" + 0.071*"health" + 0.058*"good" + '
  '0.000*"deltavariant" + 0.000*"india" + 0.000*"israel" + 0.000*"uganda" + '
  '0.000*"dose" + 0.000*"expire"'),
 (2,
  '0.224*"public" + 0.051*"remember" + 0.007*"sell" + 0.000*"ce" + '
  '0.000*"baby" + 0.000*"girl" + 0.000*"guinea" + 0.000*"equatorial" + '
  '0.000*"chinaeconomy" + 0.000*"cute"'),
 (3,
  '0.054*"policy" + 0.054*"amazingly" + 0.000*"birth" + 0.000*"baby" + '
  '0.000*"girl" + 0.000*"guinea" + 0.000*"equatorial" + 0.000*"chinaeconomy" + '
  '0.000*"ce" + 0.000*"intense"'),
 (4,
  '0.253*"government" + 0.163*"head" + 0.051*"rise" + 0.031*"courageous" + '
  '0.000*"chinaeconomy" + 0.000*"equatorial" + 0.000*"guinea" + 0.000*"girl" + '
  '0.000*"unicefafrica" + 0.000*"baby"'),
 (5,
  '0.115*"pfizer" + 0.055*"atlanta

Evaluating trained topic model using perplexity and cherence score

In [23]:
# Compute Perplexity
perplexity_score = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', perplexity_score)  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)

Perplexity:  -23.871449319949598

Coherence Score:  0.7245872265477


Visualize the topic model

In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)


Saving generated Topic LDA Model

In [32]:
joblib.dump(lda_model, '../trained_models/topicLDAmodel')
# then reload it with
# lda_model = joblib.load('lda_model.jl')


['../trained_models/topicLDAmodel']

In [None]:
import pickle
description = {'sentiment_analysis':{'name':best_model_name,'score':best_score}, 'topic_modeling':{'perplexity_score':perplexity_score, 'coherence_score':coherence_lda}}
pickle.dump(description, 'trainedModelsData.pk')
# # then reload it with
# lda_model = pickle.load('lda_model.pk')