COS 802 Final Project <br> Student Number: u11028182 <br> Name:  Erika Scholtz

#### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import spacy

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel

import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Load and filter dataset

#### Load dataset

In [53]:
# Location of the extracted AylienCovid19 csv file --> change to match your machine
data_path = 'D:/COS802/AylienCovid19.csv'
df = pd.read_csv(data_path)
df.head()

Check how many news articles are available for each Country

In [None]:
# Number of articles per country code
location_counts = df['location'].value_counts()
location_counts

#### Filter Dataset

Create list of relevant economic impact keywords

In [4]:
# Terms that mark an article as relevant to economic impact.
# The filter cell joins them with '|' and matches them case-insensitively
# against the article body.
keywords = [
    'business', 'businesses',
    'economy', 'economic',
    'GDP', 'gross domestic product',
    'jobs', 'unemployment',
    'industry', 'trade',
    'shortage', 'panic buying',
    'stocks', 'stock',
    'market', 'investment',
]

Filter dataframe to include only news articles that contain one or more of the keywords

In [5]:
# Keep only articles whose body mentions at least one keyword (case-insensitive).
# na=False counts a missing body as "no match"; without it the mask contains NaN
# for such rows and the boolean indexing raises.
keyword_pattern = '|'.join(keywords)
df_f = df[df['body'].str.contains(keyword_pattern, case=False, na=False)]

Filter for specific countries and create text documents for each country based on a subset of the news articles. Countries were selected based on the number of articles available. South Africa was added as an additional country.

In [3]:
# Reproducible 10,000-article sample of article bodies per country
text_US = df_f.loc[df_f['location'] == 'US', 'body'].sample(n=10000, random_state=101)  # --> 232850 entries in full set
text_GB = df_f.loc[df_f['location'] == 'GB', 'body'].sample(n=10000, random_state=101)  # --> 102161 entries in full set
text_IN = df_f.loc[df_f['location'] == 'IN', 'body'].sample(n=10000, random_state=101)  # --> 80247 entries in full set
text_CA = df_f.loc[df_f['location'] == 'CA', 'body'].sample(n=10000, random_state=101)  # --> 19373 entries in full set
text_AU = df_f.loc[df_f['location'] == 'AU', 'body'].sample(n=10000, random_state=101)  # --> 19954 entries in full set
# South Africa is used in full (no sampling)
text_ZA = df_f.loc[df_f['location'] == 'ZA', 'body']  # --> 1834 entries in full set

Load subset for testing purposes

In [7]:
# Small random subset for quick experiments.
# This cell previously read from `df_f_c`, which is never defined in this notebook
# and therefore raised NameError on a fresh kernel; the keyword-filtered frame
# `df_f` is used instead.
# NOTE(review): confirm `df_f` is the intended source frame.
# random_state pins the sample so that re-running the notebook gives the same subset.
df_test = df_f[['id','body','location']].sample(20000, random_state=101)

## Create data preprocessing functions

#### Define list of stopwords

In [4]:
# Small English spaCy pipeline with the parser and NER components disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Add a few project-specific verbs to spaCy's default stopword set
extra_stopwords = {'say', 'go', 'know', 'think'}
nlp.Defaults.stop_words.update(extra_stopwords)
stopwords = nlp.Defaults.stop_words

#### Prepare data preprocessing functions

In [5]:
# ---------- Remove stopwords, run through language model, and lemmatize ---------

def clean_text(text_data, allowed_types=('NOUN','ADJ','VERB','ADV')):
    """Lemmatize and tokenize a collection of raw documents.

    Relies on the notebook-level spaCy pipeline ``nlp`` and the ``stopwords`` set.

    A token is kept when its part-of-speech tag is in ``allowed_types`` and neither
    its lower-cased surface form nor its lemma is in ``stopwords``. The kept lemmas
    of each document are joined with single spaces, and every joined string is then
    tokenized with ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Parameters
    ----------
    text_data : iterable of str
        Raw document texts (e.g. a pandas Series of article bodies).
    allowed_types : collection of str, optional
        spaCy coarse POS tags to keep. The default is an immutable tuple so the
        default object cannot be modified between calls.

    Returns
    -------
    lemma_text : list of str
        One space-joined string of kept lemmas per input document.
    words : list of list of str
        ``simple_preprocess`` tokens for each entry of ``lemma_text``.
    """
    lemma_text = []

    # nlp.pipe processes the documents as a batched stream rather than one
    # pipeline call per document; each Doc is then filtered as before.
    for doc in nlp.pipe(text_data):
        kept_lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_types
            and token.text.lower() not in stopwords
            and token.lemma_ not in stopwords
        ]
        lemma_text.append(" ".join(kept_lemmas))

    # ---------- Further preprocess data ----------
    words = [gensim.utils.simple_preprocess(text, deacc=True) for text in lemma_text]

    return lemma_text, words

## Create functions for bigram and trigram models and tf-idf

#### Create bigrams and trigrams

In [6]:
def tri_bigrams(text_data,min_count=5,threshold=100):
    """Merge frequent word pairs and triples in tokenised documents.

    A bigram ``Phrases`` model is trained on ``text_data``; a second ``Phrases``
    model is trained on the bigram-transformed documents so that it can join an
    already merged pair with a further token. Both models are frozen with
    ``Phraser`` and applied to every document.

    Parameters
    ----------
    text_data : list of list of str
        Tokenised documents.
    min_count : int, optional
        Passed to both ``Phrases`` models. It was previously given to the bigram
        model only, so the trigram model silently used the library default.
    threshold : float, optional
        Passed to both ``Phrases`` models.

    Returns
    -------
    list of list of str
        One token list per input document with detected phrases merged.
    """
    bigram_phrases = gensim.models.Phrases(text_data, min_count=min_count, threshold=threshold)
    trigram_phrases = gensim.models.Phrases(
        bigram_phrases[text_data],
        min_count=min_count,
        threshold=threshold,
    )

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    # Apply each model once per document. The earlier version ran the bigram model
    # a second time over text it had already merged, which was redundant work.
    return [trigram[bigram[doc]] for doc in text_data]

#### Create id2word and corpus

In [7]:
def corpus_prep(text_data):
    """Build a gensim dictionary and bag-of-words corpus from tokenised documents.

    Parameters
    ----------
    text_data : list of list of str
        Tokenised documents.

    Returns
    -------
    id2word : gensim.corpora.Dictionary
        Dictionary built from ``text_data``.
    corpus : list
        ``id2word.doc2bow(doc)`` for every document, in input order.
    """
    id2word = corpora.Dictionary(text_data)
    corpus = [id2word.doc2bow(tokens) for tokens in text_data]
    return id2word, corpus

#### TF-IDF

In [8]:
def tf_idf(corpus, id2word, threshold=0.03):
    """Drop low-information terms from a bag-of-words corpus using TF-IDF weights.

    A ``TfidfModel`` is fitted on ``corpus``. For every document, a term is removed
    when its TF-IDF weight is below ``threshold`` or when the model returns no
    weight for it at all.

    The filtering result is the same as before. The earlier version also collected
    the names of dropped words in a list that was never returned (and was built
    from the previous document's missing ids); that dead bookkeeping is removed.
    The TF-IDF vector is now computed once per document, and membership tests use
    sets.

    Parameters
    ----------
    corpus : list
        Bag-of-words documents (lists of ``(term_id, count)`` pairs).
        NOTE: the list is modified in place and also returned.
    id2word : gensim.corpora.Dictionary
        Dictionary matching ``corpus``; returned unchanged.
    threshold : float, optional
        Minimum TF-IDF weight a term needs to be kept. Defaults to the value that
        was previously hard-coded (0.03).

    Returns
    -------
    corpus : list
        The same list object, with each document filtered.
    id2word : gensim.corpora.Dictionary
        The dictionary that was passed in.
    """
    tfidf = TfidfModel(corpus=corpus, id2word=id2word)

    for i, bow in enumerate(corpus):
        weights = tfidf[bow]

        # ids the model scored at all, and the subset scored below the threshold
        scored_ids = {term_id for term_id, _ in weights}
        low_value_ids = {term_id for term_id, weight in weights if weight < threshold}

        corpus[i] = [
            entry for entry in bow
            if entry[0] in scored_ids and entry[0] not in low_value_ids
        ]

    return corpus, id2word

#### Finding the optimal number of topics

In [9]:
def opt_k(corpus,id2word,texts,save_name,k_values=None):
    """Train one LDA model per candidate topic count and record its c_v coherence.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus used to train and score every model.
    id2word : gensim.corpora.Dictionary
        Dictionary matching ``corpus``.
    texts : list of list of str
        Tokenised documents, passed to ``CoherenceModel``.
    save_name : str
        Output file stem; results are written to ``save_name + ".csv"``.
    k_values : iterable of int, optional
        Topic counts to evaluate. Defaults to ``range(10, 37, 2)``, the range that
        was previously hard-coded.

    Returns
    -------
    k : list of int
        Topic counts evaluated, in order.
    c : list of float
        Coherence score for each entry of ``k``.
    """
    if k_values is None:
        k_values = range(10, 37, 2)

    k = []
    c = []

    for num_topics in k_values:
        print('Test for k value of: '+str(num_topics))

        # gensim.models.LdaModel is the same class the earlier code reached through
        # the ldamulticore module; this is the single-process implementation.
        Lda_model = gensim.models.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=500,
                                           passes=20,
                                           alpha='auto')

        coher_model = gensim.models.coherencemodel.CoherenceModel(model=Lda_model,
                                                                  texts=texts,
                                                                  corpus=corpus,
                                                                  dictionary=id2word,
                                                                  coherence='c_v')

        k.append(num_topics)
        c.append(coher_model.get_coherence())

    # Persist the (k, coherence) pairs so the sweep need not be repeated
    pd.DataFrame(list(zip(k, c))).to_csv(save_name + ".csv",index=False)

    return k, c

## Build topic model for the United States (US)

#### US: Prepare the data

In [10]:
# ---------- Clean text ----------
# clean_text returns (lemmatized strings, token lists); the token lists feed the steps below
lemma_text_US, text_words_US = clean_text(text_US)

# print('Lemmatized: ',lemma_text_US[0][0:100],'\n')
# print('Tokenized: ',text_words_US[0][0:20])

# ---------- Create bigrams and trigrams, then build dictionary and bag-of-words corpus ----------
text_words_US_2 =  tri_bigrams(text_words_US,min_count=5,threshold=100)
id2word_US, corpus_US = corpus_prep(text_words_US_2)

# ---------- TF-IDF filtering of the corpus ----------
# tf_idf returns the dictionary it was given, so `id2words_US` (extra "s") is only a
# second name for `id2word_US`; the following cells keep using `id2word_US`.
corpus_US, id2words_US = tf_idf(corpus_US, id2word_US)



#### US: Find optimal number of topics

In [1]:
# Sweep candidate topic counts; opt_k also writes the (k, coherence) pairs to "US_k.csv"
k_US, c_US = opt_k(corpus_US,id2word_US,text_words_US_2,"US_k")

In [2]:
# Coherence as a function of the number of topics (US)
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")
plt.figure(figsize=(5,2.5))

plt.plot(k_US,c_US)

plt.title('Number of topics vs. coherence for US')
plt.xlabel('Number of topics')
plt.ylabel('Coherence')

# tight_layout was previously referenced without parentheses, which does nothing;
# call it after the title and labels exist so they are included in the layout
plt.tight_layout()
plt.savefig('US_Topics_K.png',bbox_inches = 'tight')

#### US: Final LDA Model and eval

In [2]:
# ---------- Train the final model with the chosen number of topics ----------
lda_model_US = gensim.models.ldamulticore.LdaModel(
    corpus=corpus_US,
    id2word=id2word_US,
    num_topics=14,
    random_state=100,
    update_every=1,
    chunksize=500,
    passes=20,
    alpha='auto',
)

# ---------- Save model -----------
lda_model_US.save("model_US.model")

# ---------- Evaluate model ----------
# log_perplexity returns the per-word likelihood bound rather than the perplexity itself
perplex_US = lda_model_US.log_perplexity(corpus_US)
print(perplex_US)

coher_model_lda_US = CoherenceModel(
    model=lda_model_US,
    texts=text_words_US_2,
    dictionary=id2word_US,
    corpus=corpus_US,
    coherence='c_v',
)
coherence_lda_US = coher_model_lda_US.get_coherence()
print(coherence_lda_US)

#### US: Visualise results

In [13]:
# Prepare the pyLDAvis view of the US model; the bare name on the last line displays it
vis_US = pyLDAvis.gensim_models.prepare(lda_model_US,corpus_US,id2word_US,mds='mmds',R=20)
vis_US

#### US: Write topic results to dataframe

In [15]:
topic_US = lda_model_US.show_topics(formatted=False,num_topics=14,num_words=20)

# One (topic id, word, probability) row for each of the top 20 words of every topic
topic_words_US = [
    (topic_id, word, prob)
    for topic_id in range(lda_model_US.num_topics)
    for word, prob in lda_model_US.show_topic(topic_id, topn=20)
]

df_out_US = pd.DataFrame(topic_words_US, columns=['Topic','Word','P'])
df_out_US.to_csv('US_Topics.csv',index=False)

## Country: GB

#### GB:Prep data

In [3]:
# ---------- Clean text ----------
# clean_text returns (lemmatized strings, token lists); the token lists feed the steps below
lemma_text_GB, text_words_GB = clean_text(text_GB)

# Preview of the first document after each stage
print('Lemmatized: ',lemma_text_GB[0][0:100],'\n')
print('Tokenized: ',text_words_GB[0][0:20])

# ---------- Create bigrams and trigrams, then build dictionary and bag-of-words corpus ----------
text_words_GB_2 =  tri_bigrams(text_words_GB,min_count=5,threshold=100)
id2word_GB, corpus_GB = corpus_prep(text_words_GB_2)

# ---------- TF-IDF filtering of the corpus ----------
# tf_idf returns the dictionary it was given, so `id2words_GB` (extra "s") is only a
# second name for `id2word_GB`; the following cells keep using `id2word_GB`.
corpus_GB, id2words_GB = tf_idf(corpus_GB, id2word_GB)

#### GB: Find optimal number of topics

In [6]:
# Sweep candidate topic counts; opt_k also writes the (k, coherence) pairs to "GB_k.csv"
k_GB, c_GB = opt_k(corpus_GB,id2word_GB,text_words_GB_2,"GB_k")

In [7]:
# Coherence as a function of the number of topics (GB)
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")
plt.figure(figsize=(5,2.5))

plt.plot(k_GB,c_GB)

plt.title('Number of topics vs. coherence for GB')
plt.xlabel('Number of topics')
plt.ylabel('Coherence')

# tight_layout was previously referenced without parentheses, which does nothing;
# call it after the title and labels exist so they are included in the layout
plt.tight_layout()
plt.savefig('GB_Topics_K.png',bbox_inches = 'tight')

#### GB: Final LDA Model and Eval

In [4]:
# ---------- Train the final model with the chosen number of topics ----------
lda_model_GB = gensim.models.ldamulticore.LdaModel(
    corpus=corpus_GB,
    id2word=id2word_GB,
    num_topics=18,
    random_state=100,
    update_every=1,
    chunksize=500,
    passes=20,
    alpha='auto',
)

# ---------- Save model -----------
lda_model_GB.save("model_GB.model")

# ---------- Evaluate model ----------
# log_perplexity returns the per-word likelihood bound rather than the perplexity itself
perplex_GB = lda_model_GB.log_perplexity(corpus_GB)
print(perplex_GB)

coher_model_lda_GB = CoherenceModel(
    model=lda_model_GB,
    texts=text_words_GB_2,
    dictionary=id2word_GB,
    corpus=corpus_GB,
    coherence='c_v',
)
coherence_lda_GB = coher_model_lda_GB.get_coherence()
print(coherence_lda_GB)

#### GB: Visualise topics

In [14]:
# Prepare the pyLDAvis view of the GB model; the bare name on the last line displays it
vis_GB = pyLDAvis.gensim_models.prepare(lda_model_GB,corpus_GB,id2word_GB,mds='mmds',R=20)
vis_GB

#### GB: Write topic results to dataframe

In [21]:
topic_GB = lda_model_GB.show_topics(formatted=False,num_topics=18,num_words=20)

# One (topic id, word, probability) row for each of the top 20 words of every topic
topic_words_GB = [
    (topic_id, word, prob)
    for topic_id in range(lda_model_GB.num_topics)
    for word, prob in lda_model_GB.show_topic(topic_id, topn=20)
]

df_out_GB = pd.DataFrame(topic_words_GB, columns=['Topic','Word','P'])
df_out_GB.to_csv('GB_Topics.csv',index=False)

## Country: IN

#### IN: Prep data

In [5]:
# ---------- Clean text ----------
# clean_text returns (lemmatized strings, token lists); the token lists feed the steps below
lemma_text_IN, text_words_IN = clean_text(text_IN)

# Preview of the first document after each stage
print('Lemmatized: ',lemma_text_IN[0][0:100],'\n')
print('Tokenized: ',text_words_IN[0][0:20])

# ---------- Create bigrams and trigrams, then build dictionary and bag-of-words corpus ----------
text_words_IN_2 =  tri_bigrams(text_words_IN,min_count=5,threshold=100)
id2word_IN, corpus_IN = corpus_prep(text_words_IN_2)

# ---------- TF-IDF filtering of the corpus ----------
# tf_idf returns the dictionary it was given, so `id2words_IN` (extra "s") is only a
# second name for `id2word_IN`; the following cells keep using `id2word_IN`.
corpus_IN, id2words_IN = tf_idf(corpus_IN, id2word_IN)

#### IN: Find optimal number of topics

In [11]:
# Sweep candidate topic counts; opt_k also writes the (k, coherence) pairs to "IN_k.csv"
k_IN, c_IN = opt_k(corpus_IN,id2word_IN,text_words_IN_2,"IN_k")

In [12]:
# Coherence as a function of the number of topics (IN)
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")
plt.figure(figsize=(5,2.5))

plt.plot(k_IN,c_IN)

plt.title('Number of topics vs. coherence for IN')
plt.xlabel('Number of topics')
plt.ylabel('Coherence')

# tight_layout was previously referenced without parentheses, which does nothing;
# call it after the title and labels exist so they are included in the layout
plt.tight_layout()
plt.savefig('IN_Topics_K.png',bbox_inches = 'tight')

#### IN: Final model and eval

In [6]:
# ---------- Train the final model with the chosen number of topics ----------
lda_model_IN = gensim.models.ldamulticore.LdaModel(
    corpus=corpus_IN,
    id2word=id2word_IN,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=500,
    passes=20,
    alpha='auto',
)

# ---------- Save model -----------
lda_model_IN.save("model_IN.model")

# ---------- Evaluate model ----------
# log_perplexity returns the per-word likelihood bound rather than the perplexity itself
perplex_IN = lda_model_IN.log_perplexity(corpus_IN)
print(perplex_IN)

coher_model_lda_IN = CoherenceModel(
    model=lda_model_IN,
    texts=text_words_IN_2,
    dictionary=id2word_IN,
    corpus=corpus_IN,
    coherence='c_v',
)
coherence_lda_IN = coher_model_lda_IN.get_coherence()
print(coherence_lda_IN)

#### Visualise results

In [15]:
# Prepare the pyLDAvis view of the IN model; the bare name on the last line displays it
vis_IN = pyLDAvis.gensim_models.prepare(lda_model_IN,corpus_IN,id2word_IN,mds='mmds',R=20)
vis_IN

#### Write topic results to dataframe

In [27]:
topic_IN = lda_model_IN.show_topics(formatted=False,num_topics=20,num_words=20)

# One (topic id, word, probability) row for each of the top 20 words of every topic
topic_words_IN = [
    (topic_id, word, prob)
    for topic_id in range(lda_model_IN.num_topics)
    for word, prob in lda_model_IN.show_topic(topic_id, topn=20)
]

df_out_IN = pd.DataFrame(topic_words_IN, columns=['Topic','Word','P'])
df_out_IN.to_csv('IN_Topics.csv',index=False)

## Country: CA

#### CA: Prep data

In [7]:
# ---------- Clean text ----------
# clean_text returns (lemmatized strings, token lists); the token lists feed the steps below
lemma_text_CA, text_words_CA = clean_text(text_CA)

# Preview of the first document after each stage
print('Lemmatized: ',lemma_text_CA[0][0:100],'\n')
print('Tokenized: ',text_words_CA[0][0:20])

# ---------- Create bigrams and trigrams, then build dictionary and bag-of-words corpus ----------
text_words_CA_2 =  tri_bigrams(text_words_CA,min_count=5,threshold=100)
id2word_CA, corpus_CA = corpus_prep(text_words_CA_2)

# ---------- TF-IDF filtering of the corpus ----------
# tf_idf returns the dictionary it was given, so `id2words_CA` (extra "s") is only a
# second name for `id2word_CA`; the following cells keep using `id2word_CA`.
corpus_CA, id2words_CA = tf_idf(corpus_CA, id2word_CA)

#### CA: Find optimal number of topics

In [16]:
# Sweep candidate topic counts; opt_k also writes the (k, coherence) pairs to "CA_k.csv"
k_CA, c_CA = opt_k(corpus_CA,id2word_CA,text_words_CA_2,"CA_k")

In [17]:
# Coherence as a function of the number of topics (CA)
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")
plt.figure(figsize=(5,2.5))

plt.plot(k_CA,c_CA)

plt.title('Number of topics vs. coherence for CA')
plt.xlabel('Number of topics')
plt.ylabel('Coherence')

# tight_layout was previously referenced without parentheses, which does nothing;
# call it after the title and labels exist so they are included in the layout
plt.tight_layout()
plt.savefig('CA_Topics_K.png',bbox_inches = 'tight')

#### CA: Final LDA Model and eval

In [8]:
# ---------- Train the final model with the chosen number of topics ----------
lda_model_CA = gensim.models.ldamulticore.LdaModel(
    corpus=corpus_CA,
    id2word=id2word_CA,
    num_topics=16,
    random_state=100,
    update_every=1,
    chunksize=500,
    passes=20,
    alpha='auto',
)

# ---------- Save model -----------
lda_model_CA.save("model_CA.model")

# ---------- Evaluate model ----------
# log_perplexity returns the per-word likelihood bound rather than the perplexity itself
perplex_CA = lda_model_CA.log_perplexity(corpus_CA)
print(perplex_CA)

coher_model_lda_CA = CoherenceModel(
    model=lda_model_CA,
    texts=text_words_CA_2,
    dictionary=id2word_CA,
    corpus=corpus_CA,
    coherence='c_v',
)
coherence_lda_CA = coher_model_lda_CA.get_coherence()
print(coherence_lda_CA)

#### Visualise results

In [16]:
# Prepare the pyLDAvis view of the CA model; the bare name on the last line displays it
vis_CA = pyLDAvis.gensim_models.prepare(lda_model_CA,corpus_CA,id2word_CA,mds='mmds',R=20)
vis_CA

#### Write topic results to dataframe

In [None]:
topic_CA = lda_model_CA.show_topics(formatted=False,num_topics=16,num_words=20)

# One (topic id, word, probability) row for each of the top 20 words of every topic
topic_words_CA = [
    (topic_id, word, prob)
    for topic_id in range(lda_model_CA.num_topics)
    for word, prob in lda_model_CA.show_topic(topic_id, topn=20)
]

df_out_CA = pd.DataFrame(topic_words_CA, columns=['Topic','Word','P'])
df_out_CA.to_csv('CA_Topics.csv',index=False)

## Country: AU

#### AU: Prep data

In [9]:
# ---------- Clean text ----------
# clean_text returns (lemmatized strings, token lists); the token lists feed the steps below
lemma_text_AU, text_words_AU = clean_text(text_AU)

# Preview of the first document after each stage
print('Lemmatized: ',lemma_text_AU[0][0:100],'\n')
print('Tokenized: ',text_words_AU[0][0:20])

# ---------- Create bigrams and trigrams, then build dictionary and bag-of-words corpus ----------
text_words_AU_2 =  tri_bigrams(text_words_AU,min_count=5,threshold=100)
id2word_AU, corpus_AU = corpus_prep(text_words_AU_2)

# ---------- TF-IDF filtering of the corpus ----------
# tf_idf returns the dictionary it was given, so `id2words_AU` (extra "s") is only a
# second name for `id2word_AU`; the following cells keep using `id2word_AU`.
corpus_AU, id2words_AU = tf_idf(corpus_AU, id2word_AU)

#### AU: Find optimal number of topics

In [None]:
# Sweep candidate topic counts; opt_k also writes the (k, coherence) pairs to "AU_k.csv"
k_AU, c_AU = opt_k(corpus_AU,id2word_AU,text_words_AU_2,"AU_k")

In [None]:
# Coherence as a function of the number of topics (AU)
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")
plt.figure(figsize=(5,2.5))

plt.plot(k_AU,c_AU)

plt.title('Number of topics vs. coherence for AU')
plt.xlabel('Number of topics')
plt.ylabel('Coherence')

# tight_layout was previously referenced without parentheses, which does nothing;
# call it after the title and labels exist so they are included in the layout
plt.tight_layout()
plt.savefig('AU_Topics_K.png',bbox_inches = 'tight')

#### Final model and eval

In [10]:
# ---------- Train the final model with the chosen number of topics ----------
lda_model_AU = gensim.models.ldamulticore.LdaModel(
    corpus=corpus_AU,
    id2word=id2word_AU,
    num_topics=22,
    random_state=100,
    update_every=1,
    chunksize=500,
    passes=20,
    alpha='auto',
)

# ---------- Save model -----------
lda_model_AU.save("model_AU.model")

# ---------- Evaluate model ----------
# log_perplexity returns the per-word likelihood bound rather than the perplexity itself
perplex_AU = lda_model_AU.log_perplexity(corpus_AU)
print(perplex_AU)

coher_model_lda_AU = CoherenceModel(
    model=lda_model_AU,
    texts=text_words_AU_2,
    dictionary=id2word_AU,
    corpus=corpus_AU,
    coherence='c_v',
)
coherence_lda_AU = coher_model_lda_AU.get_coherence()
print(coherence_lda_AU)

#### Visualise results

In [17]:
# Prepare the pyLDAvis view of the AU model; the bare name on the last line displays it
vis_AU = pyLDAvis.gensim_models.prepare(lda_model_AU,corpus_AU,id2word_AU,mds='mmds',R=20)
vis_AU

#### Write topic results to dataframe

In [None]:
topic_AU = lda_model_AU.show_topics(formatted=False,num_topics=22,num_words=20)

# One (topic id, word, probability) row for each of the top 20 words of every topic
topic_words_AU = [
    (topic_id, word, prob)
    for topic_id in range(lda_model_AU.num_topics)
    for word, prob in lda_model_AU.show_topic(topic_id, topn=20)
]

df_out_AU = pd.DataFrame(topic_words_AU, columns=['Topic','Word','P'])
df_out_AU.to_csv('AU_Topics.csv',index=False)

## Country: ZA

#### ZA: Prep data

In [11]:
# ---------- Clean text ----------
# clean_text returns (lemmatized strings, token lists); the token lists feed the steps below
lemma_text_ZA, text_words_ZA = clean_text(text_ZA)

# Preview of the first document after each stage
print('Lemmatized: ',lemma_text_ZA[0][0:100],'\n')
print('Tokenized: ',text_words_ZA[0][0:20])

# ---------- Create bigrams and trigrams, then build dictionary and bag-of-words corpus ----------
text_words_ZA_2 =  tri_bigrams(text_words_ZA,min_count=5,threshold=100)
id2word_ZA, corpus_ZA = corpus_prep(text_words_ZA_2)

# ---------- TF-IDF filtering of the corpus ----------
# tf_idf returns the dictionary it was given, so `id2words_ZA` (extra "s") is only a
# second name for `id2word_ZA`; the following cells keep using `id2word_ZA`.
corpus_ZA, id2words_ZA = tf_idf(corpus_ZA, id2word_ZA)

#### ZA: Find optimal number of k

In [None]:
# Sweep candidate topic counts; opt_k also writes the (k, coherence) pairs to "ZA_k.csv"
k_ZA, c_ZA = opt_k(corpus_ZA,id2word_ZA,text_words_ZA_2,"ZA_k")

In [None]:
# Coherence as a function of the number of topics (ZA)
sns.set(font_scale = 1.1)
sns.set_style("whitegrid")
plt.figure(figsize=(5,2.5))

plt.plot(k_ZA,c_ZA)

plt.title('Number of topics vs. coherence for ZA')
plt.xlabel('Number of topics')
plt.ylabel('Coherence')

# tight_layout was previously referenced without parentheses, which does nothing;
# call it after the title and labels exist so they are included in the layout
plt.tight_layout()
plt.savefig('ZA_Topics_K.png',bbox_inches = 'tight')

#### Final model and eval

In [12]:
# ---------- Train the final model with the chosen number of topics ----------
lda_model_ZA = gensim.models.ldamulticore.LdaModel(
    corpus=corpus_ZA,
    id2word=id2word_ZA,
    num_topics=22,
    random_state=100,
    update_every=1,
    chunksize=500,
    passes=20,
    alpha='auto',
)

# ---------- Save model -----------
lda_model_ZA.save("model_ZA.model")

# ---------- Evaluate model ----------
# log_perplexity returns the per-word likelihood bound rather than the perplexity itself
perplex_ZA = lda_model_ZA.log_perplexity(corpus_ZA)
print(perplex_ZA)

coher_model_lda_ZA = CoherenceModel(
    model=lda_model_ZA,
    texts=text_words_ZA_2,
    dictionary=id2word_ZA,
    corpus=corpus_ZA,
    coherence='c_v',
)
coherence_lda_ZA = coher_model_lda_ZA.get_coherence()
print(coherence_lda_ZA)

#### Visualise results

In [18]:
# Prepare the pyLDAvis view of the ZA model; the bare name on the last line displays it
vis_ZA = pyLDAvis.gensim_models.prepare(lda_model_ZA,corpus_ZA,id2word_ZA,mds='mmds',R=20)
vis_ZA

#### ZA: Write topic results to dataframe

In [None]:
topic_ZA = lda_model_ZA.show_topics(formatted=False,num_topics=22,num_words=20)

# One (topic id, word, probability) row for each of the top 20 words of every topic
topic_words_ZA = [
    (topic_id, word, prob)
    for topic_id in range(lda_model_ZA.num_topics)
    for word, prob in lda_model_ZA.show_topic(topic_id, topn=20)
]

df_out_ZA = pd.DataFrame(topic_words_ZA, columns=['Topic','Word','P'])
df_out_ZA.to_csv('ZA_Topics.csv',index=False)