## Biterm Topic Modelling

#### Dependencies

- **Biterm** ( pip install biterm )
- **pyLDAvis** ( pip install pyLDAvis )
- **Scikit-Learn** ( pip install scikit-learn )
- **Pandas** ( pip install pandas )
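- **xlrd** ( pip install xlrd ) — from version 2.0 onward it reads legacy `.xls` files only
- **openpyxl** ( pip install openpyxl ) — used by pandas to read the `.xlsx` files in `./Data/`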

In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import os
from biterm.utility import vec_to_biterms
from biterm.cbtm import oBTM
# Switch pyLDAvis into notebook mode up front; presumably this is what lets the
# result of pyLDAvis.prepare(...) in the last cell render inline (see pyLDAvis docs).
pyLDAvis.enable_notebook()


In [10]:
%time

data_location = './Data/'
tweet_files = os.listdir(data_location)
print('*'*50)
print("*"*25,"Files","*"*25)
print("\n".join([t for t in tweet_files]))
print('*'*50)

print('Creating Courpus from data...')

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 13.1 µs
**************************************************
************************* Files *************************
edu.xlsx
fakenews.xlsx
psl18.xlsx
sports1.xlsx
psl.xlsx
hrt.xlsx
disease.xlsx
diabet.xlsx
fashion.xlsx
nov.xlsx
gw.xlsx
sports.xlsx
cncr.xlsx
music.xlsx
climate.xlsx
envir.xlsx
**************************************************
Creating Courpus from data...


In [11]:
%time

corpus = np.array([],dtype="object")

for each_file in tweet_files:
    df = pd.read_excel(data_location+each_file)
    print(each_file,'=> Total Tweets => ',df.size, ' sample size => ','100','\n')
    corpus = np.append(corpus,"".join([i for i in df['Text'][:100]])) 


CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 11.7 µs
edu.xlsx => Total Tweets =>  60000  sample size =>  100 

fakenews.xlsx => Total Tweets =>  12000  sample size =>  100 

psl18.xlsx => Total Tweets =>  8376  sample size =>  100 

sports1.xlsx => Total Tweets =>  19824  sample size =>  100 

psl.xlsx => Total Tweets =>  3456  sample size =>  100 

hrt.xlsx => Total Tweets =>  12000  sample size =>  100 

disease.xlsx => Total Tweets =>  12000  sample size =>  100 

diabet.xlsx => Total Tweets =>  948  sample size =>  100 

fashion.xlsx => Total Tweets =>  21576  sample size =>  100 

nov.xlsx => Total Tweets =>  12000  sample size =>  100 

gw.xlsx => Total Tweets =>  27396  sample size =>  100 

sports.xlsx => Total Tweets =>  19824  sample size =>  100 

cncr.xlsx => Total Tweets =>  12000  sample size =>  100 

music.xlsx => Total Tweets =>  41988  sample size =>  100 

climate.xlsx => Total Tweets =>  73200  sample size =>  100 

envir.xlsx => Total Tweets =>  60000  s

In [12]:
%time
vec = CountVectorizer(stop_words='english')
X = vec.fit_transform(corpus).toarray()
vocab = np.array(vec.get_feature_names())
biterms = vec_to_biterms(X)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [14]:
%time

btm = oBTM(num_topics=10, V=vocab)
topics = btm.fit_transform(biterms, iterations=1)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.06 µs



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [33:35<00:00, 2015.93s/it][A


[[0.3611136495210005,
  0.07174714922963471,
  0.0708076608676302,
  0.06812899366725007,
  0.07142508402133654,
  0.06526501781124447,
  0.07521420463267647,
  0.0738907957269915,
  0.07056709617241223,
  0.07184034834982339],
 [0.3110752729521798,
  0.07978510839525062,
  0.08896878931001995,
  0.07578020218658187,
  0.06874817876625422,
  0.07385121703812801,
  0.06797625022653234,
  0.07917282212600722,
  0.07922662904100931,
  0.07541552995803673],
 [0.27604818685440025,
  0.07858946624264146,
  0.08657560644357622,
  0.07897017961923905,
  0.07175982270098298,
  0.0834344937028739,
  0.08595779689508481,
  0.07692803310632725,
  0.0782021718476546,
  0.08353424258721938],
 [0.24834434763728636,
  0.08294939759994768,
  0.0820797470296308,
  0.08599250245760398,
  0.08655423178042146,
  0.0832526765107382,
  0.08467642661852105,
  0.08182008404930195,
  0.0855192259802342,
  0.07881136033631439],
 [0.2881577779049368,
  0.07838105440615108,
  0.07843335240882934,
  0.0771362077640

In [20]:
# Build the interactive pyLDAvis view of the fitted model.
# The former bare `%timeit` line had no statement and did nothing, so it is
# dropped. It is deliberately not replaced with `%%timeit`, which would re-run
# the cell many times and discard the value that has to be displayed.
pyLDAvis.prepare(
    btm.phi_wz.T,                  # topic-term distributions
    topics,                        # document-topic distributions
    np.count_nonzero(X, axis=1),   # per-document length (distinct terms present)
    vocab,                         # term labels, aligned with X's columns
    np.sum(X, axis=0),             # corpus-wide frequency of each term
)