In [99]:
#!pip install tqdm
#!pip install pyLDAvis
#! pip install boto3
#! pip install tensorflow
#! pip install "tensorflow_hub>=0.6.0"
#! pip install tensorflow==2.0.0
#!pip install stop_words

In [151]:
import numpy as np
import pandas as pd
import boto3

from sklearn.cluster import AgglomerativeClustering, SpectralClustering, KMeans, AffinityPropagation, MeanShift, DBSCAN, Birch

import tensorflow_hub as hub

import os

from sklearn import metrics

import re

import matplotlib.pyplot as plt

import collections
from collections import Counter
from stop_words import get_stop_words

In [None]:
#Itemise date and create a counter
def return_calander_wk(x,y,z):
    return datetime.date(x,y,z).isocalendar()[1]

def extend_weeks(x):
    if x < 10:
        x = '0{}'.format(x)
    else:
        x
    return '{}'.format(x)

def tokeniser(text):
    #  "nlp" Object is used to create documents with linguistic annotations.
    my_doc = nlp(text)

    # Create list of word tokens
    token_list = gensim.utils.simple_preprocess(text, deacc=True)

    return token_list

def remove_stop_words(token_list):
    # Create list of word tokens after removing stopwords
    filtered_text =[] 

    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_text.append(word) 
            
    return filtered_text 

def stemmer(token_list):
    #Prepare stemming objects
    ps = PorterStemmer()
    
    stemm_list = []
    
    for j in token_list:
        stemm_list.append(ps.stem(j))
        
    return stemm_list

def is_name(word):
    #Prepare stemming objects

    common_names = ['damian','damo','stefan','paul','dave','david','phil','rob','jack','sam','sambo','james','hamish','verity','zac','holly']
    
    common_teams = ['unit','arsen','liverpool','spurs','brighton','west ham']
    
    name_flag = 0
    
    if word in common_teams:
        name_flag = 1
    if word in common_names:
        name_flag = 2
    
    return name_flag

def text_cleaning(x):
    return re.sub(r'[^A-Z a-z0-9]+', '', x).lower()


def clean_url(x):
    return re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE)


### Load the embedding model and vectorise the data

There are two types of universal sentence encoder:
1. Universal sentence encoder, which is trained with a deep averaging network (DAN) encoder
2. Universal sentence encoder "large", which is trained with a Transformer encoder

They both have different versions, 1. V1 to V4 and 2. V1 to V5.
Main differences are that the early versions are trained with TensorFlow 1 and later versions are trained with TensorFlow 2.

1\. has the advantage that it can be used for transfer learning too; otherwise, I imagine performance to be similar.

In [205]:
# Define function to download the right model from S3 (the model can generally loaded from URL but on this AWS they have been blacklisted)
def downloadDirectoryFroms3(bucket,remoteDirectoryName):
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(bucket) 
    for object in bucket.objects.filter(Prefix = remoteDirectoryName):
        if not os.path.exists(os.path.dirname(object.key.split(project_folder)[-1])):
            os.makedirs(os.path.dirname(object.key.split(project_folder)[-1]))
        if object.key.split(project_folder)[-1][-1] != '/': #avoids trying to copy files from an empty folder
            bucket.download_file(object.key,object.key.split(project_folder)[-1])

In [206]:
bucket = 'bt-data-science-playground'

project_folder = 'nps-score-verbatim-text-analysis/'

model_filename = 'model_objects_UniversalSentenceEncoders/4' # look on S3 for available ones or download other versions from the Google's webpages

# Download the relevant model objects
downloadDirectoryFroms3(bucket, project_folder+model_filename)

embed = hub.load(model_filename)

In [207]:
#Load data from the full thread
df=pd.read_csv('data/PREP_combo_thread_data_20200516.csv')
df.head(3)

#Word Cloud specific prep
redacted_df = df.drop(df[df['text'] == ' MediaShared'].index)
redacted_df = redacted_df.drop(redacted_df[redacted_df['text'] == ' DeletedMsg'].index)
redacted_df = redacted_df.drop(redacted_df[redacted_df['text'] == ' Deleted Message'].index)

redacted_df.text = redacted_df.text.astype(str)

redacted_df['text'] = redacted_df['text'].apply(lambda x: clean_url(x))

In [208]:
# Get data as all the verbatim text in a single list
data = redacted_df['text'].values.tolist()

ids = redacted_df['text'].index

In [209]:
embeddings = embed(data)

print(embeddings.shape)

(52670, 512)


In [210]:
train = pd.DataFrame(np.array(embeddings))

In [220]:
from sklearn.metrics import silhouette_score

range_n_clusters = list (range(2,25))

for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters)
    preds = clusterer.fit_predict(train)
    centers = clusterer.cluster_centers_

    score = silhouette_score(train, preds)
    print("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

KeyboardInterrupt: 

In [236]:
#Use Clustering ALgorithm to Cluster Embeddings
n_topics_2 = 24
#n_samples=
labels_KM_final = KMeans(n_clusters = n_topics_2, random_state = 0).fit_predict(train)

In [237]:
#Create DataFrame with Data and Labels
columns = ['Topic', 'text']
output_samp = pd.DataFrame()
output_samp['Topic'] = labels_KM_final
output_samp['text'] = data

In [238]:
topic_count = Counter(output_samp['Topic'])

for i in range(0,(n_topics_2)):
    print({i}, topic_count[i]) 

{0} 1237
{1} 1777
{2} 2223
{3} 2641
{4} 1891
{5} 4158
{6} 1941
{7} 365
{8} 1444
{9} 775
{10} 1595
{11} 1551
{12} 2162
{13} 746
{14} 3167
{15} 3121
{16} 3773
{17} 1438
{18} 2190
{19} 2293
{20} 3719
{21} 2312
{22} 2894
{23} 3257


In [239]:
for number in range(0,(n_topics_2)):
    data_numb = output_samp[output_samp['Topic'] == number]['text'].values
    np.set_printoptions(threshold=np.inf)
    
    stop_words = get_stop_words('en')
    newStopWords = ['service', 'wasnt', 'resolved', 'resolve', 'store', 'ee','phone', 'didnt', 'told', 'good', 'helpful', 
                   'get', 'will', 'sim', 'old', 'new', 'friendly', 'contract', 'deal', 'now', 'customer', 'im', 'app', 'went'
                    , 'sent', 'one', 'go', 'said', 'just', 'dont', 'got', 'still', 'ask', 'wanted','yeah','thats','like',
                   'lol','thats']
    stop_words.extend(newStopWords)
    df_frame = pd.DataFrame({ 'data_words': [str(data_numb)]})
    df_frame['data_words'] = df_frame['data_words'].str.replace("[^\w\s]", "").str.lower()
    df_frame['data_words'] = df_frame['data_words'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))
    most_common_ = df_frame['data_words'].str.split(expand=True).stack().value_counts() 
    print('\n', most_common_[0:10])


 brexit      181
vote        170
labour      131
tories       83
election     75
tory         75
lib          73
people       72
think        69
trump        67
dtype: int64

 ah         68
know       59
think      59
see        49
thought    40
ive        39
well       37
cant       36
need       26
though     26
dtype: int64

 shit        323
fucking     218
fuck        209
cunt         72
troll        48
butthurt     47
cant         45
trolling     43
fucked       42
really       40
dtype: int64

 football    274
league      222
team        211
game        209
cup         167
win         163
season      147
world       133
players     128
can         114
dtype: int64

 next      18
fake      17
news      17
also      17
true      15
2         13
10        11
think     10
back      10
thanks    10
dtype: int64

 back      66
big       66
news      65
time      55
people    52
great     52
bbc       49
can       47
first     47
also      46
dtype: int64

 eat       98
food      91
me

In [224]:
output_samp.loc[output_samp['Topic'] == 0].head(50)

Unnamed: 0,Topic,text
0,0,hebiiiiib
1,0,hibeb
13,0,+12813868870
20,0,facebexit
31,0,damo?
36,0,👍🏿
37,0,🐄👈🏻
39,0,🍆
40,0,🤷🏻‍♀
42,0,💩


In [235]:
#Create Dataset for each cluster
data = output_samp['text'].values
data_0 = output_samp[output_samp['Topic'] == 0]['text'].values # > topic and URLs
data_1 = output_samp[output_samp['Topic'] == 1]['text'].values # large General conversation
data_2 = output_samp[output_samp['Topic'] == 2]['text'].values # NAN
data_3 = output_samp[output_samp['Topic'] == 3]['text'].values # Football
data_4 = output_samp[output_samp['Topic'] == 4]['text'].values # Short conversation
data_5 = output_samp[output_samp['Topic'] == 5]['text'].values # < 3 word response
np.set_printoptions(threshold=np.inf)

In [234]:
data_5

array([' only', ' what is this', ' please explain', ' lol', ' perfect',
       ' i think it is', ' whatever', ' 👍', ' it’s a good one', ' lol',
       ' haha', ' ah. so good', ' lol', ' oh no', ' ?', ' anyone?',
       ' lol amazing', ' *is', ' hahahaha', ' nah i’m in loondon',
       ' there is', ' 👌', ' haha yeah', ' tonight', ' it begins',
       ' bollocks', ' o rly', ' 👍', ' fair', ' not necessarily..',
       ' he’s', ' 👍', ' i have it currently', ' this is mine', ' correct',
       ' only just', ' hard to say', ' would like that', ' very deep',
       ' lol', ' so hard', ' perfect', ' lol out loud', ' or maybe:',
       ' ___', ' nah from leamington', " not that i'm aware of",
       ' yeah it has.', " that's encouraging", ' as in right now?', ' 👍',
       ' what?', ' yes', ' ah so it begins', ' yeah', ' nice.', ' lol',
       ' just scored', ' ah - 1-1', ' ?', ' i was thinking first',
       ' fucking two', ' i ❤ acid', ' kek', ' nah...', ' not for me',
       ' honest mate', '

In [177]:
#Create Dataset for each cluster - x24
data = output_samp['text'].values
data_0 = output_samp[output_samp['Topic'] == 0]['text'].values # POLITICS
data_1 = output_samp[output_samp['Topic'] == 1]['text'].values
data_2 = output_samp[output_samp['Topic'] == 2]['text'].values # Swearing
data_3 = output_samp[output_samp['Topic'] == 3]['text'].values # Generic Sports
data_4 = output_samp[output_samp['Topic'] == 4]['text'].values
data_5 = output_samp[output_samp['Topic'] == 5]['text'].values
data_6 = output_samp[output_samp['Topic'] == 6]['text'].values
data_7 = output_samp[output_samp['Topic'] == 7]['text'].values
data_8 = output_samp[output_samp['Topic'] == 8]['text'].values
data_9 = output_samp[output_samp['Topic'] == 9]['text'].values
data_10 = output_samp[output_samp['Topic'] == 10]['text'].values
data_11 = output_samp[output_samp['Topic'] == 11]['text'].values
data_12 = output_samp[output_samp['Topic'] == 12]['text'].values
data_13 = output_samp[output_samp['Topic'] == 13]['text'].values
data_14 = output_samp[output_samp['Topic'] == 14]['text'].values
data_15 = output_samp[output_samp['Topic'] == 15]['text'].values
data_16 = output_samp[output_samp['Topic'] == 16]['text'].values
data_17 = output_samp[output_samp['Topic'] == 17]['text'].values
data_18 = output_samp[output_samp['Topic'] == 18]['text'].values
data_19 = output_samp[output_samp['Topic'] == 19]['text'].values # > (+URLS)
data_20 = output_samp[output_samp['Topic'] == 20]['text'].values
data_21 = output_samp[output_samp['Topic'] == 21]['text'].values
data_22 = output_samp[output_samp['Topic'] == 22]['text'].values
data_23 = output_samp[output_samp['Topic'] == 23]['text'].values
data_24 = output_samp[output_samp['Topic'] == 24]['text'].values
np.set_printoptions(threshold=np.inf)

In [110]:
#Prepare yearweek counter
splits = df['date'].str.split("/", expand = True)

df['day'] = splits[0].astype(int)
df['month'] = splits[1].astype(int)
df['year'] = splits[2].astype(int)

df['week'] = df.apply(lambda x:return_calander_wk(x.year, x.month, x.day), axis=1)
df['week'] = df['week'].apply(lambda x:extend_weeks(x))

df['text'] = df['text'].astype(str)

In [111]:
#Aggregate words per yearweek
#df_agg = df.groupby(['yearweek'])['text'].apply(''.join).reset_index()

#Prepare stemming objects
ps = PorterStemmer()

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

df['words'] = df['text'].apply(lambda x: text_cleaning(x))
df['words'] = df['words'].apply(lambda x: tokeniser(x))
df['words'] = df['words'].apply(lambda x: remove_stop_words(x))
#df_agg['words'] = df_agg['words'].apply(lambda x: remove_common_words(x,common_words))
#df['words'] = df['words'].apply(lambda x: stemmer(x))

#df_agg['word_count_dict'] = df_agg['words'].apply(lambda x: CountFrequency(x))

#Word Cloud specific prep
df = df.drop(df[df['text'] == ' MediaShared'].index)
df = df.drop(df[df['text'] == ' DeletedMsg'].index)
df = df.drop(df[df['text'] == ' Deleted Message'].index)

data_words = list(df['words'])

In [117]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out