In [1]:
# packages to store and manipulate data
import pandas as pd
import numpy as np

# plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

# model building package
import sklearn
import spacy, gensim

# package to clean text
import re
import string

from preprocessor import TwitterPreprocessor

In [2]:
tweets = pd.read_csv('data/tweets.csv', index_col=0)

In [3]:
tweets.head()

Unnamed: 0,created_at,full_text,is_retweet,retweeted,mentioned,hashtags
0,Sat Aug 24 05:28:19 +0000 2019,I have taken it up with @fccpcnigeria . Hopefu...,False,[],"['@fccpcnigeria', '@fccpcnigeria']",[]
1,Sat Aug 24 05:22:35 +0000 2019,@sanctitybona @enugudisco @PowerUpNG @nepawaha...,False,[],"['@sanctitybona', '@enugudisco', '@PowerUpNG',...",[]
2,Fri Aug 09 18:01:30 +0000 2019,@ChickenRepublic Dear @fccpcnigeria kindly advise,False,[],"['@ChickenRepublic', '@fccpcnigeria']",[]
3,Fri Aug 09 17:48:03 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[]
4,Fri Aug 09 17:47:12 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[]


In [4]:
tweets.shape

(200000, 6)

## Topic Modelling

Topic modeling is a type of statistical modeling for discovering the abstract “topics” that occur in a collection of documents. In our case, the collection of document is a collection or tweets.

They are various algorithms used for topic modelling. Here we would look at **LDA** (Latent Dirichlet Allocation). 

LDA represents documents as mixtures of topics that spit out words with certain probabilities. It assumes that documents are produced in the following fashion: when writing each document, you

- Decide on the number of words N the document will have (say, according to a Poisson distribution).
- Choose a topic mixture for the document (according to a Dirichlet distribution over a fixed set of K topics). For example, assuming that we have the two food and cute animal topics above, you might choose the document to consist of 1/3 food and 2/3 cute animals.
- Generate each word w_i in the document by:
    - First picking a topic (according to the multinomial distribution that you sampled above; for example, you might pick the food topic with 1/3 probability and the cute animals topic with 2/3 probability).
    - Using the topic to generate the word itself (according to the topic’s multinomial distribution). For example, if we selected the food topic, we might generate the word “broccoli” with 30% probability, “bananas” with 15% probability, and so on.
    
**It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.**

### Cleaning Unstructured Text Data

The most important thing we need to do to help our topic modelling algorithm is to pre-clean up the tweets. Looking at the tweets you may notice that they are very untidy, with non-standard English, capitalisation, links, hashtags, @users and punctuation and emoticons everywhere. In order to properly apply topic modelling we need to remove most of this and massage our data into a more standard form before finally turning it into vectors.

In [5]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

Below are some functions to remove web-links from the tweets. We will also remove retweets and mentions. We remove these because it is unlikely that they will help us form meaningful topics.

**We would like to know the general things which people are talking about, not who they are talking about or to and not the web links they are sharing.**


Next, we stem the words in the list. This is essentially where we knock the end off the words. We do this so that similar words will be recognised as the same word by the algorithm

In [6]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
lemma = nltk.wordnet.WordNetLemmatizer()
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`'
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

# cleaning master function
def clean_tweet(tweet, bigrams=True):
    p = TwitterPreprocessor(tweet)
    # preprocess test
    p.fully_preprocess()
    tweet = p.text
    return tweet

In [7]:
tweets['clean_tweet'] = tweets.full_text.apply(clean_tweet)

In [8]:
tweets.head()

Unnamed: 0,created_at,full_text,is_retweet,retweeted,mentioned,hashtags,clean_tweet
0,Sat Aug 24 05:28:19 +0000 2019,I have taken it up with @fccpcnigeria . Hopefu...,False,[],"['@fccpcnigeria', '@fccpcnigeria']",[],taken hopefully resolved nobody protects consu...
1,Sat Aug 24 05:22:35 +0000 2019,@sanctitybona @enugudisco @PowerUpNG @nepawaha...,False,[],"['@sanctitybona', '@enugudisco', '@PowerUpNG',...",[],people lucky theyve taken transformer office p...
2,Fri Aug 09 18:01:30 +0000 2019,@ChickenRepublic Dear @fccpcnigeria kindly advise,False,[],"['@ChickenRepublic', '@fccpcnigeria']",[],dear kindly advise
3,Fri Aug 09 17:48:03 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[],
4,Fri Aug 09 17:47:12 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[],


In [9]:
def lemmatization(tweet, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    doc = nlp(" ".join(tweet)) 
    texts_out = " ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags])
        
    return texts_out

def tweet_to_words(tweet):
    yield(gensim.utils.simple_preprocess(str(tweet), deacc=True))  # deacc=True removes punctuations

In [10]:
tweets['tokenized_tweet'] = tweets.clean_tweet.apply(tweet_to_words)

In [11]:
tweets.head()

Unnamed: 0,created_at,full_text,is_retweet,retweeted,mentioned,hashtags,clean_tweet,tokenized_tweet
0,Sat Aug 24 05:28:19 +0000 2019,I have taken it up with @fccpcnigeria . Hopefu...,False,[],"['@fccpcnigeria', '@fccpcnigeria']",[],taken hopefully resolved nobody protects consu...,<generator object tweet_to_words at 0x128c5d850>
1,Sat Aug 24 05:22:35 +0000 2019,@sanctitybona @enugudisco @PowerUpNG @nepawaha...,False,[],"['@sanctitybona', '@enugudisco', '@PowerUpNG',...",[],people lucky theyve taken transformer office p...,<generator object tweet_to_words at 0x128c5d450>
2,Fri Aug 09 18:01:30 +0000 2019,@ChickenRepublic Dear @fccpcnigeria kindly advise,False,[],"['@ChickenRepublic', '@fccpcnigeria']",[],dear kindly advise,<generator object tweet_to_words at 0x128c5d8d0>
3,Fri Aug 09 17:48:03 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[],,<generator object tweet_to_words at 0x128c5d950>
4,Fri Aug 09 17:47:12 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[],,<generator object tweet_to_words at 0x128c5d9d0>


In [None]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
tweets['lemmatized_tweet'] = tweets.clean_tweet.apply(lemmatization)

In [None]:
tweets.head()

In [11]:
pd.options.display.max_colwidth = 200

In [12]:
tweets.clean_tweet.unique().size

146761

#### Applying LDA

Now that we have clean text we can apply some processing to turn the clean tweets into vectors and then build a model.

To turn the text into a matrix*, where each row in the matrix encodes which words appeared in each individual tweet. 


We will also filter the words `max_df=0.9` means we discard any words that appear in >90% of tweets. In this dataset I don’t think there are any words that are that common but it is good practice. We will also filter words using `min_df=25`, so words that appear in less than 25 tweets will be discarded. We discard high appearing words since they are too common to be meaningful in topics. We discard low appearing words because we won’t have a strong enough signal and they will just introduce noise to our model.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=250, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(tweets['clean_tweet']) #.toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

In [15]:
tf.shape

(200000, 1267)

In [16]:
tf_feature_names

['abeg',
 'abi',
 'able',
 'abuja',
 'acc',
 'access',
 'accessbank',
 'account',
 'accountbank',
 'accountcredited',
 'accountdebited',
 'accountdetail',
 'accountid',
 'accountnumber',
 'accountplease',
 'accountsince',
 'accountusername',
 'accountyet',
 'acct',
 'across',
 'act',
 'action',
 'activate',
 'activated',
 'activategb',
 'activatewawu',
 'active',
 'actually',
 'add',
 'address',
 'adm',
 'admin',
 'adminpaicipate',
 'advise',
 'africa',
 'african',
 'afternoon',
 'agency',
 'agent',
 'ago',
 'ah',
 'aide',
 'aiel',
 'aiime',
 'aint',
 'air',
 'ale',
 'alex',
 'allow',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'always',
 'ambassador',
 'ame',
 'among',
 'amount',
 'amp',
 'annoying',
 'another',
 'answer',
 'anymore',
 'anyone',
 'anything',
 'apc',
 'apologize',
 'apologizeinconvenience',
 'apology',
 'app',
 'appreciate',
 'appropriately',
 'area',
 'arena',
 'arenagame',
 'around',
 'arsenal',
 'asap',
 'ask',
 'asked',
 'asking',
 'assist',
 'assistance',

The tf matrix is similar to the hashtag_vector_df dataframe. Each row is a tweet and each column is a word. The numbers in each position tell us how many times this word appears in this tweet.

Next we create the model object. Lets start by randomly choosing 10 topics.

In [17]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 10

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=45) # random state for reproducibility

In [20]:
model.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=45, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

Next we will inspect our topics that we generated and try to extract meaningful information from them.

In [18]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [22]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,bet9ja,26732.8,power,2796.5,bbnaija,4653.8,please,7545.2,nigeria,11614.2,’,11781.5,us,4978.3,account,17494.9,bank,6303.2,bet9ja,8210.5
1,code,4500.0,one,2375.0,na,3782.7,money,3624.9,the,4356.0,see,2890.2,thank,4316.1,bet9ja,6743.6,polaris,3541.1,bbnaija,4921.9
2,play,3591.0,still,1945.0,mercy,3149.9,help,3142.6,buhari,4245.8,what,2588.3,kindly,4076.0,id,4507.0,call,2707.2,win,3004.7
3,game,3491.8,game,1568.6,wawu,2822.1,since,3079.8,dont,3713.2,time,2295.7,hello,3408.8,please,4061.0,bet365,2541.1,big,2700.1
4,odds,2642.4,pay,1563.9,activate,2754.4,kindly,2438.7,know,3655.7,it,2262.1,we,3009.4,credited,3411.5,ecobank,2230.1,bubu,2564.2
5,win,2341.9,yes,1526.3,dey,2665.7,issue,1936.8,like,3149.0,money,1726.6,dm,2960.9,bank,3379.7,bwin,2179.1,😂,2424.9
6,bet,1950.2,last,1410.4,go,2502.3,yet,1887.3,people,2979.9,even,1703.8,please,2896.8,username,2906.1,just,2042.3,winning,2227.3
7,ticket,1655.2,coming,1390.5,ike,2442.4,cc,1695.7,you,2627.5,people,1558.4,number,2645.8,money,2771.4,williamhill,1999.1,de,2218.1
8,please,1576.0,day,1385.4,tacha,2329.9,response,1617.5,one,1837.1,they,1499.4,team,2240.4,pls,2672.4,bet9ja,1893.6,games,1861.0
9,pls,1558.2,light,1307.1,house,2085.4,good,1585.2,amp,1809.0,you,1474.7,complaint,1934.2,bet,2670.3,whatsapp,1785.4,brother,1809.1


### Non-negative Matrix Factorisation (NMF).

In [19]:
from sklearn.decomposition import NMF

In [20]:
model_nmf = NMF(n_components=number_of_topics, random_state=42, alpha=.1, l1_ratio=.5, verbose=True)

In [21]:
model_nmf.fit(tf)

violation: 1.0
violation: 0.3021239366955817
violation: 0.19072065088007276
violation: 0.08587871308382568
violation: 0.03927028734613173
violation: 0.021774903955333665
violation: 0.013004158187007883
violation: 0.008266471839519807
violation: 0.0059314822937103695
violation: 0.004782416037180024
violation: 0.00418750562293099
violation: 0.00388801740057672
violation: 0.003749348712213601
violation: 0.0036777117088381325
violation: 0.0036291640214035975
violation: 0.0035926761485269416
violation: 0.003563656294413012
violation: 0.0035398232790722134
violation: 0.0035196295938652116
violation: 0.003502161702992927
violation: 0.0034868221976557577
violation: 0.003473123830455181
violation: 0.003460716102122241
violation: 0.003449322185886566
violation: 0.0034387118953286165
violation: 0.0034287209992343
violation: 0.003419201086678527
violation: 0.00341005836283664
violation: 0.0034012178895608744
violation: 0.0033926197854820214
violation: 0.003384216282293012
violation: 0.003375964856

NMF(alpha=0.1, beta_loss='frobenius', init=None, l1_ratio=0.5, max_iter=200,
    n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=True)

In [22]:
no_top_words = 10
nmf_topics = display_topics(model_nmf, tf_feature_names, no_top_words)

In [23]:
nmf_topics.head(20)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,’,21.8,account,18.3,betja,22.7,now,4.7,please,17.3,bank,15.3,game,10.5,nigeria,7.8,u,12.2,win,8.3
1,buhari,1.2,betjaaccount,3.6,code,2.2,✅✅✅,4.7,help,4.8,polaris,2.7,whatsapp,4.4,money,7.4,kindly,5.5,odds,6.8
2,‘,0.8,id,2.8,coin,1.6,today,4.2,pleasehelp,2.6,polarisbank,2.2,call,4.1,one,5.9,hello,5.1,prediction,6.4
3,osinbajo,0.6,money,2.6,betjaaccount,1.5,bet,4.0,wawu,2.6,first,1.2,today,3.7,dont,4.7,complaint,3.0,bet,5.5
4,”,0.5,credited,2.5,betjacode,1.4,match,3.0,activate,2.5,ba,1.2,inbox,3.0,get,4.7,hellokindly,2.6,daily,4.9
5,let,0.4,debited,2.2,betjacoin,1.3,fixed,3.0,gb,1.8,access,1.1,available,2.8,people,4.6,team,2.4,sure,3.7
6,“,0.4,username,2.0,play,1.2,winning,2.9,plan,1.6,firstbank,1.0,guaranteed,2.7,like,4.5,enable,2.4,…,3.4
7,know,0.3,bankaccount,1.8,playbetja,0.8,fixedmatch,2.8,line,1.5,ecobank,1.0,odds,2.7,go,4.4,twitter,2.4,betting,3.2
8,–,0.3,pls,1.5,game,0.7,big,2.7,thanks,1.4,accessbank,1.0,whatsappcall,2.6,day,4.1,regard,2.4,day,2.9
9,nigeria,0.3,deposit,1.5,–,0.6,oppounity,2.5,wawugb,1.2,bankaccount,0.9,inboxwhatsapp,2.5,know,3.6,suppo,2.3,winbet,2.8
