In [2]:
#!pip install tqdm
#!pip install pyLDAvis

In [125]:
# Run in python console
#import nltk; nltk.download('stopwords')

# Run in terminal or command prompt
#python3 -m spacy download en

In [124]:
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML

import datetime

from collections import Counter

import pickle

#from wordcloud import WordCloud, STOPWORDS
from nltk.stem import PorterStemmer
# spacy for lemmatization
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NLTK Stop words
#from nltk.corpus import stopwords
#stop_words = stopwords.words('english')

In [15]:
#Itemise date and create a counter
def return_calander_wk(x, y, z):
    """Return the ISO calendar week number of a date.

    Args:
        x: year.
        y: month.
        z: day of the month.

    Returns:
        The week component (second element) of ``datetime.date.isocalendar()``.
    """
    # isocalendar() yields (ISO year, ISO week, ISO weekday); only the week is needed.
    _, iso_week, _ = datetime.date(x, y, z).isocalendar()
    return iso_week

def extend_weeks(x):
    """Render a week number as text, prefixing values below 10 with a '0'.

    Args:
        x: week number.

    Returns:
        The formatted string, e.g. 5 -> '05' and 12 -> '12'.
    """
    prefix = '0' if x < 10 else ''
    return '{}{}'.format(prefix, x)

def tokeniser(text):
    """Split a text string into a list of word tokens.

    Tokenisation is delegated to ``gensim.utils.simple_preprocess`` with
    ``deacc=True``.

    Args:
        text: the string to tokenise.

    Returns:
        The list of tokens produced by ``simple_preprocess``.
    """
    # An earlier version also ran the text through the spaCy pipeline
    # (`nlp(text)`) but never used the result; that call is dropped so the
    # function no longer pays for a spaCy pass per message or depends on a
    # global `nlp` being defined before it is called.
    return gensim.utils.simple_preprocess(text, deacc=True)

def remove_stop_words(token_list):
    """Return the tokens of ``token_list`` that are not stop words.

    Each token is looked up in the vocabulary of the notebook-level ``nlp``
    object and kept only when its lexeme's ``is_stop`` flag is false. Order
    and duplicates are preserved.

    Args:
        token_list: iterable of token strings.

    Returns:
        A new list containing the surviving tokens.
    """
    return [token for token in token_list if not nlp.vocab[token].is_stop]

def stemmer(token_list):
    """Return a new list holding the Porter stem of every token.

    Args:
        token_list: iterable of token strings.

    Returns:
        List of stems, in the same order as the input.
    """
    porter = PorterStemmer()
    return [porter.stem(token) for token in token_list]

def is_name(word):
    """Classify a token as a known person name, a known team, or neither.

    Args:
        word: the token to look up (exact, case-sensitive match).

    Returns:
        2 if ``word`` is in the list of common names, 1 if it is in the list
        of common teams, 0 otherwise. A name match wins over a team match.
    """
    common_names = ('damian', 'damo', 'stefan', 'paul', 'dave', 'david', 'phil', 'rob',
                    'jack', 'sam', 'sambo', 'james', 'hamish', 'verity', 'zac', 'holly')

    # NOTE(review): 'west ham' contains a space, so it can only match when the
    # caller passes a multi-word string -- confirm this is intended.
    common_teams = ('unit', 'arsen', 'liverpool', 'spurs', 'brighton', 'west ham')

    if word in common_names:
        return 2
    if word in common_teams:
        return 1
    return 0

def text_cleaning(x):
    """Remove every character that is not an ASCII letter, digit or space, then lower-case.

    Args:
        x: input string.

    Returns:
        The cleaned, lower-cased string.
    """
    kept_characters = re.sub(r'[^A-Z a-z0-9]+', '', x)
    return kept_characters.lower()


In [109]:
# Load the prepared full-thread export and preview the first three rows.
# The preview shows one message per row with 'date' (dd/mm/yyyy), 'time',
# 'text' and 'name' columns.
# NOTE(review): the preview also shows 'Unnamed: 0.1' / 'Unnamed: 0' columns,
# presumably index columns written out by earlier saves -- confirm whether
# they are still needed.
df=pd.read_csv('data/PREP_combo_thread_data_20200516.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,date,time,text,name
0,1,29/03/2018,09:00,hebiiiiib,DaveY
1,2,29/03/2018,09:01,hibeb,JackG
2,3,29/03/2018,09:01,let's just see. we need paul to make a clean ...,DaveY


In [110]:
# Derive day / month / year / week columns from the dd/mm/yyyy date strings.
# Only the week-of-year is stored; no combined year+week key is built here.
splits = df['date'].str.split("/", expand = True)

# Split column 0 is the day, 1 the month and 2 the year.
df['day'], df['month'], df['year'] = (splits[part].astype(int) for part in range(3))

# ISO week number per row, then rendered as a two-character string.
df['week'] = df.apply(lambda row: return_calander_wk(row.year, row.month, row.day), axis=1)
df['week'] = df['week'].apply(extend_weeks)

# Make sure every message is a str before the regex-based cleaning runs on it.
df['text'] = df['text'].astype(str)

In [111]:
# Disabled: aggregation of the text per yearweek
#df_agg = df.groupby(['yearweek'])['text'].apply(''.join).reset_index()

# Notebook-level stemmer instance
ps = PorterStemmer()

# spaCy English language object; tokeniser() and remove_stop_words() read it
# as a global.
nlp = English()

# Per message: strip punctuation and lower-case -> tokenise -> drop stop words.
df['words'] = (
    df['text']
    .apply(text_cleaning)
    .apply(tokeniser)
    .apply(remove_stop_words)
)
# Disabled optional steps (common-word removal, stemming, word counts)
#df_agg['words'] = df_agg['words'].apply(lambda x: remove_common_words(x,common_words))
#df['words'] = df['words'].apply(lambda x: stemmer(x))

#df_agg['word_count_dict'] = df_agg['words'].apply(lambda x: CountFrequency(x))

# Remove rows whose text is only a placeholder for shared media or a deleted
# message (note the leading space in each marker).
df = df.drop(df.index[df['text'].isin([' MediaShared', ' DeletedMsg', ' Deleted Message'])])

# One token list per remaining message.
data_words = df['words'].tolist()

In [112]:
# Peek at the token lists of the first ten remaining messages.
df['words'].head(10)

0                                           [hebiiiiib]
1                                               [hibeb]
2                      [lets, need, paul, clean, break]
3                                [yeah, big, deteather]
4     [electing, trump, doesnt, sit, froust, neutral...
5                                     [feeling, maundy]
6                                    [maundy, thursday]
7                              [people, numbers, thier]
8                                      [trolly, matrix]
10                  [numbers, dont, saved, phone, hoot]
Name: words, dtype: object

In [113]:
# Bigram detector trained on the raw token lists, plus its Phraser wrapper
# (a faster way to apply the detected phrases to a sentence).
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram detector trained on the bigram-transformed token lists, plus its
# Phraser wrapper. No min_count is passed here, so the library default applies.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [115]:
# Show the token list of one message before any phrase model is applied
print(data_words[22])

['tbh', 'prefer', 'stay', 'facebook', 'main', 'reason', 'cant', 'interact', 'whatsapp', 'pc', 'reduc', 'function', 'mainli', 'phil', 'send']


In [116]:
# Show the same message after the bigram and then the trigram phraser.
# The recorded output is identical to the raw tokens, i.e. no phrase was
# merged in this particular message.
print(trigram_mod[bigram_mod[data_words[22]]])

['tbh', 'prefer', 'stay', 'facebook', 'main', 'reason', 'cant', 'interact', 'whatsapp', 'pc', 'reduc', 'function', 'mainli', 'phil', 'send']


In [117]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts, stop_words=None):
    """Tokenise each document and drop its stop words.

    Args:
        texts: iterable of documents; each one is converted with ``str()`` and
            tokenised with gensim's ``simple_preprocess``.
        stop_words: optional collection of words to remove. When omitted, the
            spaCy English ``STOP_WORDS`` set imported by this notebook is used.

    Returns:
        A list with one list of kept tokens per document.
    """
    # This notebook never defines a global `stop_words` (the NLTK lines that
    # would create it are commented out), so relying on that global raised a
    # NameError on any call. Fall back to the spaCy stop list instead.
    if stop_words is None:
        stop_words = STOP_WORDS
    blocked = frozenset(stop_words)  # constant-time membership tests
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Apply the notebook-level ``bigram_mod`` phraser to every document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with the phraser's output for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with the trigram phraser's output for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise documents, keeping only tokens with an allowed part-of-speech tag.

    Tag reference: https://spacy.io/api/annotation

    Each document's tokens are joined with single spaces and passed through
    the notebook-level ``nlp`` object; a token's ``lemma_`` is kept when its
    ``pos_`` is listed in ``allowed_postags``.

    NOTE(review): the result is only non-empty if ``nlp`` actually assigns
    ``pos_`` values -- confirm which pipeline is loaded when this is called.

    Args:
        texts: iterable of token lists.
        allowed_postags: part-of-speech tags to keep.

    Returns:
        A list with one list of lemmas per document.
    """
    return [
        [tok.lemma_ for tok in nlp(" ".join(sent)) if tok.pos_ in allowed_postags]
        for sent in texts
    ]

In [127]:
# Stop-word removal through remove_stopwords() is disabled here; data_words
# already had stop words removed when the 'words' column was built.
#data_words_nostops = remove_stopwords(data_words)

# Form Bigrams: merge detected bigrams into single tokens for every message
texts = make_bigrams(data_words)

# Loading of a trained spaCy 'en' model is left disabled below.
#! python3 -m spacy download en
#nlp = spacy.load('en', disable=['ner'])
# NOTE(review): English() is instantiated directly instead of loading a
# trained model; presumably this pipeline has no part-of-speech tagger, in
# which case token.pos_ is empty and lemmatization() keeps no tokens -- confirm.
nlp = English()

# Do lemmatization keeping only noun, adj, vb, adv
# NOTE(review): data_lemmatized is not used by the dictionary or the corpus
# below; both are built from `texts`.
data_lemmatized = lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create Dictionary: maps each token found in `texts` to an integer id
id2word = corpora.Dictionary(texts)

# Term Document Frequency: one bag-of-words list of (token id, count) pairs per message
corpus = [id2word.doc2bow(text) for text in texts]

In [129]:
#data_lemmatized

In [86]:
# Spot-check one message: its bag-of-words vector next to its token list
i = 523

print(corpus[i])
print(texts[i])
# Disabled: human-readable variant showing the token instead of its id
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:22]]

[(125, 1), (163, 1), (597, 1), (775, 1), (862, 1), (1153, 1), (1154, 1), (1155, 1), (1156, 2)]
['christma', 'parti', 'savil', 'savil', 'owe', 'quid', 'georg', 'best', 'stori', 'right']


In [98]:
# Number of documents (messages) in the bag-of-words corpus
len(corpus)

52670

In [141]:
%%time

# Restrict training to the first 5,000 messages; id2word is still the
# dictionary built from all messages.
# NOTE(review): rtexts is not referenced by any later cell visible in this
# notebook.
rcorpus = corpus[:5000]
rtexts = texts[:5000]

# Build LDA model with 20 topics on the subset. random_state is fixed so that
# repeated runs use the same seed. See the gensim LdaModel documentation for
# the meaning of the remaining training parameters.
lda_model = gensim.models.ldamodel.LdaModel(corpus=rcorpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

CPU times: user 33.5 s, sys: 4.1 ms, total: 33.5 s
Wall time: 33.5 s


In [142]:
# Apply the trained model to the 5,000-message training subset.
# doc_lda is not referenced by any later cell visible in this notebook.
doc_lda = lda_model[rcorpus]

In [143]:
# Print the top keywords of each topic (the model above is built with num_topics=20)
pprint(lda_model.print_topics())

[(0,
  '0.035*"have" + 0.027*"doesnt" + 0.025*"possibl" + 0.015*"theyv" + '
  '0.009*"gone" + 0.009*"problem" + 0.005*"damo" + 0.004*"sit" + 0.003*"data" '
  '+ 0.000*"elect"'),
 (1,
  '0.157*"yeah" + 0.044*"cant" + 0.024*"damian" + 0.011*"includ" + '
  '0.008*"small" + 0.003*"interest" + 0.000*"hear" + 0.000*"game" + '
  '0.000*"wait" + 0.000*"thing"'),
 (2,
  '0.049*"best" + 0.036*"big" + 0.016*"phil" + 0.014*"manag" + 0.011*"ill" + '
  '0.007*"base" + 0.006*"phone" + 0.003*"add" + 0.001*"main" + '
  '0.001*"breakfast"'),
 (3,
  '0.031*"way" + 0.007*"jack" + 0.006*"gump" + 0.005*"week" + 0.000*"go" + '
  '0.000*"england" + 0.000*"get" + 0.000*"glori" + 0.000*"fight" + '
  '0.000*"game"'),
 (4,
  '0.027*"dave" + 0.017*"theyr" + 0.001*"gif" + 0.000*"corner" + 0.000*"love" '
  '+ 0.000*"coach" + 0.000*"apart" + 0.000*"chadli" + 0.000*"winger" + '
  '0.000*"januzaj"'),
 (5,
  '0.000*"capita" + 0.000*"nippi" + 0.000*"gratin" + 0.000*"fondant" + '
  '0.000*"dauphinois" + 0.000*"somesort" +

In [146]:
# Compute Perplexity on the training subset
# NOTE(review): the printed value is whatever log_perplexity() returns; gensim
# documents it as a per-word likelihood bound rather than the perplexity
# itself, so "lower is better" may not apply to this number -- confirm.
print('\nPerplexity: ', lda_model.log_perplexity(rcorpus))

# Compute Coherence Score ('c_v' measure)
# NOTE(review): coherence is evaluated against the full `texts`, while the
# model was trained on rcorpus (the first 5,000 messages) and the matching
# `rtexts` slice is left unused -- confirm which reference texts are intended.
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -26.564629849497766

Coherence Score:  0.3986827192143076


In [147]:
#%%time
# Visualize the topics
# Enable inline rendering, prepare the pyLDAvis view from the trained model,
# the 5,000-message training subset and the dictionary, then display it as
# the cell's output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, rcorpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
