In [1]:
# packages to store and manipulate data
import pandas as pd
import numpy as np

# plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

# model building package
import sklearn

# package to clean text
import re

In [2]:
tweets = pd.read_csv('data/tweets.csv', index_col=0)

In [3]:
tweets.head()

Unnamed: 0,created_at,full_text,is_retweet,retweeted,mentioned,hashtags
0,Sat Aug 24 05:28:19 +0000 2019,I have taken it up with @fccpcnigeria . Hopefu...,False,[],"['@fccpcnigeria', '@fccpcnigeria']",[]
1,Sat Aug 24 05:22:35 +0000 2019,@sanctitybona @enugudisco @PowerUpNG @nepawaha...,False,[],"['@sanctitybona', '@enugudisco', '@PowerUpNG',...",[]
2,Fri Aug 09 18:01:30 +0000 2019,@ChickenRepublic Dear @fccpcnigeria kindly advise,False,[],"['@ChickenRepublic', '@fccpcnigeria']",[]
3,Fri Aug 09 17:48:03 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[]
4,Fri Aug 09 17:47:12 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpc...,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_N...",[]


In [4]:
tweets.shape

(200000, 6)

## Topic Modelling

Topic modeling is a type of statistical modeling for discovering the abstract “topics” that occur in a collection of documents. In our case, the collection of document is a collection or tweets.

They are various algorithms used for topic modelling. Here we would look at **LDA** (Latent Dirichlet Allocation). 

LDA represents documents as mixtures of topics that spit out words with certain probabilities. It assumes that documents are produced in the following fashion: when writing each document, you

- Decide on the number of words N the document will have (say, according to a Poisson distribution).
- Choose a topic mixture for the document (according to a Dirichlet distribution over a fixed set of K topics). For example, assuming that we have the two food and cute animal topics above, you might choose the document to consist of 1/3 food and 2/3 cute animals.
- Generate each word w_i in the document by:
    - First picking a topic (according to the multinomial distribution that you sampled above; for example, you might pick the food topic with 1/3 probability and the cute animals topic with 2/3 probability).
    - Using the topic to generate the word itself (according to the topic’s multinomial distribution). For example, if we selected the food topic, we might generate the word “broccoli” with 30% probability, “bananas” with 15% probability, and so on.
    
**It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.**

### Cleaning Unstructured Text Data

The most important thing we need to do to help our topic modelling algorithm is to pre-clean up the tweets. Looking at the tweets you may notice that they are very untidy, with non-standard English, capitalisation, links, hashtags, @users and punctuation and emoticons everywhere. In order to properly apply topic modelling we need to remove most of this and massage our data into a more standard form before finally turning it into vectors.

In [5]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [7]:
# download the stopwords if you have not used "nltk" before.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/sasu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Below are some functions to remove web-links from the tweets. We will also remove retweets and mentions. We remove these because it is unlikely that they will help us form meaningful topics.

**We would like to know the general things which people are talking about, not who they are talking about or to and not the web links they are sharing.**


Next, we stem the words in the list. This is essentially where we knock the end off the words. We do this so that similar words will be recognised as the same word by the algorithm

In [6]:
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

In [7]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=True):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [8]:
tweets['clean_tweet'] = tweets.full_text.apply(clean_tweet)

In [9]:
pd.options.display.max_colwidth = 200

In [10]:
tweets.head()

Unnamed: 0,created_at,full_text,is_retweet,retweeted,mentioned,hashtags,clean_tweet
0,Sat Aug 24 05:28:19 +0000 2019,"I have taken it up with @fccpcnigeria . Hopefully, it can be resolved. Nobody protects consumers in this country. Hopefully the revamped @fccpcnigeria can prove me wrong.",False,[],"['@fccpcnigeria', '@fccpcnigeria']",[],taken hope resolv nobodi protect consum countri hope revamp prove wrong taken_hope hope_resolv resolv_nobodi nobodi_protect protect_consum consum_countri countri_hope hope_revamp revamp_prove pro...
1,Sat Aug 24 05:22:35 +0000 2019,@sanctitybona @enugudisco @PowerUpNG @nepawahalang @csm_ng @fccpcnigeria You people are lucky that they've taken your transformer to their office. As for the people of Ezenwagbara and Ndukuba Stre...,False,[],"['@sanctitybona', '@enugudisco', '@PowerUpNG', '@nepawahalang', '@csm_ng', '@fccpcnigeria']",[],peopl lucki taken transform offic peopl ezenwagbara ndukuba street ariaria aba transform still sit tight stand _peopl peopl_lucki lucki_taken taken_transform transform_offic offic_peopl peopl_ez...
2,Fri Aug 09 18:01:30 +0000 2019,@ChickenRepublic Dear @fccpcnigeria kindly advise,False,[],"['@ChickenRepublic', '@fccpcnigeria']",[],dear kindli advis _dear dear_kindli kindli_advis
3,Fri Aug 09 17:48:03 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpcnigeria https://t.co/W8KYztwJhV,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_NG', '@fccpcnigeria']",[],_
4,Fri Aug 09 17:47:12 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpcnigeria,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_NG', '@fccpcnigeria']",[],_


In [11]:
tweets.head()

Unnamed: 0,created_at,full_text,is_retweet,retweeted,mentioned,hashtags,clean_tweet
0,Sat Aug 24 05:28:19 +0000 2019,"I have taken it up with @fccpcnigeria . Hopefully, it can be resolved. Nobody protects consumers in this country. Hopefully the revamped @fccpcnigeria can prove me wrong.",False,[],"['@fccpcnigeria', '@fccpcnigeria']",[],taken hope resolv nobodi protect consum countri hope revamp prove wrong taken_hope hope_resolv resolv_nobodi nobodi_protect protect_consum consum_countri countri_hope hope_revamp revamp_prove pro...
1,Sat Aug 24 05:22:35 +0000 2019,@sanctitybona @enugudisco @PowerUpNG @nepawahalang @csm_ng @fccpcnigeria You people are lucky that they've taken your transformer to their office. As for the people of Ezenwagbara and Ndukuba Stre...,False,[],"['@sanctitybona', '@enugudisco', '@PowerUpNG', '@nepawahalang', '@csm_ng', '@fccpcnigeria']",[],peopl lucki taken transform offic peopl ezenwagbara ndukuba street ariaria aba transform still sit tight stand _peopl peopl_lucki lucki_taken taken_transform transform_offic offic_peopl peopl_ez...
2,Fri Aug 09 18:01:30 +0000 2019,@ChickenRepublic Dear @fccpcnigeria kindly advise,False,[],"['@ChickenRepublic', '@fccpcnigeria']",[],dear kindli advis _dear dear_kindli kindli_advis
3,Fri Aug 09 17:48:03 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpcnigeria https://t.co/W8KYztwJhV,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_NG', '@fccpcnigeria']",[],_
4,Fri Aug 09 17:47:12 +0000 2019,@ChickenRepublic @CocaCola @CocaCola_NG @fccpcnigeria,False,[],"['@ChickenRepublic', '@CocaCola', '@CocaCola_NG', '@fccpcnigeria']",[],_


In [12]:
tweets.clean_tweet.unique().size

154911

#### Applying LDA

Now that we have clean text we can apply some processing to turn the clean tweets into vectors and then build a model.

To turn the text into a matrix*, where each row in the matrix encodes which words appeared in each individual tweet. 


We will also filter the words `max_df=0.9` means we discard any words that appear in >90% of tweets. In this dataset I don’t think there are any words that are that common but it is good practice. We will also filter words using `min_df=25`, so words that appear in less than 25 tweets will be discarded. We discard high appearing words since they are too common to be meaningful in topics. We discard low appearing words because we won’t have a strong enough signal and they will just introduce noise to our model.

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=250, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(tweets['clean_tweet']) #.toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

In [14]:
tf.shape

(200000, 1657)

In [15]:
tf_feature_names

['#',
 '#_',
 '#apuestas',
 '#bbnaija',
 '#bbnaija_',
 '#bbnaija_#bbnaija',
 '#bbnaija_#betja',
 '#bbnaijapepperdem',
 '#bbnaijapepperdem_#tacha',
 '#bbnaijia',
 '#bbnajia',
 '#bet',
 '#bet_#bet',
 '#bet_#betfair',
 '#bet_#betja',
 '#bet_#betting',
 '#bet_#soccer',
 '#bet_#williamhill',
 '#betclic',
 '#betfair',
 '#betfair_#bet',
 '#betin',
 '#betja',
 '#betja_',
 '#betja_#bet',
 '#betja_#nairabet',
 '#betja_#sportpesa',
 '#betja_#sportybet',
 '#betja_#wgb',
 '#betjabbn',
 '#betjabbn_',
 '#betjalive',
 '#betking',
 '#betpawa',
 '#betting',
 '#bettingexpert',
 '#bettingtips',
 '#betway',
 '#bigbronaija',
 '#bigbrothernaija',
 '#bitcoin',
 '#blackbet',
 '#bovada',
 '#bovada_#merrybet',
 '#bovada_#nairabet',
 '#bwin',
 '#bwin_#bet',
 '#bwin_#bovada',
 '#completesports',
 '#completesports_#soccer',
 '#fixedmatch',
 '#fixedmatches',
 '#football',
 '#inplay',
 '#liverpool',
 '#merrybet',
 '#merrybet_#naira',
 '#mfm',
 '#mfm_#completesports',
 '#money',
 '#mufc',
 '#naija',
 '#naijabet',
 '#n

The tf matrix is similar to the hashtag_vector_df dataframe. Each row is a tweet and each column is a word. The numbers in each position tell us how many times this word appears in this tweet.

Next we create the model object. Lets start by randomly choosing 10 topics.

In [16]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 10

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=45) # random state for reproducibility

In [17]:
model.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=45, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

Next we will inspect our topics that we generated and try to extract meaningful information from them.

In [18]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [19]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,nigeria,6304.4,account,18850.2,activ,4045.3,u,6645.3,#betja,8288.5,#bbnaija,16929.1,play,5594.6,thank,8874.5,pleas,7901.0,_,25667.5
1,i,3301.1,betja,6291.6,_,3143.6,like,5529.9,bank,4781.4,’,11119.1,betja,4425.4,kindli,4003.4,money,6612.0,betja,22370.7
2,day,2901.4,#,6221.3,wawu,2885.1,one,5150.3,game,4650.9,betja,4003.6,game,2565.9,hello,3148.9,sinc,3845.3,_betja,5954.2
3,nigeria_,2263.4,credit,5357.8,plan,2547.0,peopl,4619.7,today,3897.1,coin,3896.9,wait,2135.7,us,3051.2,_pleas,3599.5,code,4973.4
4,😂😂😂,2155.1,pleas,5246.4,de,2369.1,nigeria,4550.3,polari,3533.1,buhari,3683.9,support,2127.2,_hello,2846.4,need,3372.9,odd,4240.9
5,bill,2120.9,’t,4963.0,gb,2151.1,know,4506.9,win,3265.9,merci,3232.9,twitter,1998.5,_thank,2728.3,custom,3333.4,win,3000.1
6,cc,2046.3,pl,4709.8,pleas,1773.3,😂,4308.3,bubu,3143.8,it,3108.1,see,1851.4,thank_,2047.0,guy,3041.0,betja_,2929.0
7,pay,1866.3,id,4491.5,line,1728.2,go,4088.5,_,3012.5,tacha,3013.5,play_betja,1762.1,dm,1937.1,bank,2972.4,bet,2860.2
8,god,1859.7,betja_account,3900.1,la,1583.1,make,3129.0,#bet,2914.5,vote,2444.6,check,1599.9,via,1734.9,day,2803.1,na,2636.1
9,cc_,1801.7,help,3721.3,#williamhill,1392.7,even,2992.2,polari_bank,2804.8,–,2413.1,regard,1340.4,pleas,1730.3,get,2740.3,betja_code,2605.1


### Non-negative Matrix Factorisation (NMF).

In [20]:
from sklearn.decomposition import NMF

In [21]:
model_nmf = NMF(n_components=number_of_topics, random_state=42, alpha=.1, l1_ratio=.5, verbose=True)

In [22]:
model_nmf.fit(tf)

violation: 1.0
violation: 0.4125309653456096
violation: 0.22587590937223354
violation: 0.09908776087575762
violation: 0.04728187465794097
violation: 0.025186510707268787
violation: 0.015004612526747016
violation: 0.00990048329864382
violation: 0.006980543067290575
violation: 0.0052174143042793995
violation: 0.004152801798311493
violation: 0.003500578697361405
violation: 0.0030945337193856045
violation: 0.0028453535430729687
violation: 0.002695012062577527
violation: 0.0026074058560839127
violation: 0.00255722750358941
violation: 0.0025276251579136504
violation: 0.0025083441833912484
violation: 0.002494320637015608
violation: 0.0024836594433845357
violation: 0.0024746806092538903
violation: 0.0024666895565368394
violation: 0.0024593138632941895
violation: 0.002452357650392068
violation: 0.0024456370960473516
violation: 0.0024390471094232246
violation: 0.0024326288179197025
violation: 0.0024263339757669873
violation: 0.0024201464318963435
violation: 0.002414113367404087
violation: 0.0024

NMF(alpha=0.1, beta_loss='frobenius', init=None, l1_ratio=0.5, max_iter=200,
    n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=True)

In [23]:
no_top_words = 10
display_topics(model_nmf, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,_,25.1,betja,22.7,#betja,7.8,account,17.8,#bbnaija,18.6,’,17.3,bet,5.2,bank,16.0,pleas,15.2,’t,14.2
1,#,1.1,_betja,3.6,#bet,6.4,credit,4.2,#betja,2.1,it,5.1,today,5.1,polari,3.8,_pleas,5.7,don,6.0
2,odd,1.0,odd,2.8,#soccer,4.8,#,4.0,coin,1.9,buhari,2.4,big,4.9,polari_bank,3.6,thank,5.1,can,2.2
3,₦,0.9,win,2.4,game,4.0,betja_account,3.7,#bbnaija_,1.3,nigeria,2.0,win,4.7,bank_,3.6,help,4.8,u,1.8
4,win,0.6,bet,2.3,#bwin,4.0,money,3.0,merci,1.2,osinbajo,1.3,now,4.4,_polari,1.7,kindli,4.0,i,1.7
5,_odd,0.5,code,2.3,#williamhill,3.9,debit,2.9,betja_coin,1.2,that,1.2,✅✅✅,4.3,first,1.2,us,3.7,didn,1.5
6,odd_,0.5,play,2.2,call,3.1,id,2.7,#bbnaija_#bbnaija,1.2,let,1.1,game,3.9,access,1.0,activ,3.3,know,1.3
7,#betja,0.5,game,1.9,whatsapp,2.8,deposit,2.6,tacha,1.1,presid,0.7,💥💥,3.7,access_bank,0.9,hello,3.2,won,1.1
8,fc,0.4,betja_,1.9,admin,2.6,id_,2.3,ike,1.1,–,0.7,fix,3.1,first_bank,0.9,_hello,3.0,like,1.1
9,win_,0.4,_odd,1.8,inbox,2.5,usernam,2.1,housem,1.1,aid,0.6,match,2.9,zenith,0.8,wawu,2.5,money,1.0
