# Aspect Identifying
### Running through the tweets, based on Harry Wang's Document Clustering
<p> We take the tweets out of the data, clean them of HTML, hashtags and mentions, then run the text through a tokenizer before identifying the most common words/phrases throughout the tweets. Those common words/phrases are then the aspects on which we base the clustering of the tweets. As of right now, the model uses single terms with a fixed dictionary; the hope is to move towards a dynamic dictionary that grows as new data is received.</p>

In [1]:
import preprocessor as p #https://pypi.org/project/tweet-preprocessor/
import numpy as np
import pandas as pd
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import string
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Read the tweets

In [2]:
# Load the tweets CSV and keep only the column that holds the tweet text.
df = pd.read_csv('data/Tweets.csv')
tweets = df['text']

In [3]:
# Number of tweets loaded from the CSV.
len(tweets)

14640

In [4]:
# Pull out the first tweet as a sample to try the cleaner on.
tweet = tweets[0]

In [5]:
# Show the sample tweet before cleaning.
tweet

'@VirginAmerica What @dhepburn said.'

In [6]:
# Run the sample through tweet-preprocessor's clean(); the recorded output
# below shows the @mentions removed from this tweet.
tweet = p.clean(tweet)

In [7]:
# Show the sample tweet after cleaning.
tweet

'What said.'

In [8]:
#tweets

In [9]:
# Clean every tweet: parse away any HTML markup first, then let
# tweet-preprocessor's clean() process the remaining plain text.
# Written as a comprehension so the loop variable stays local and does not
# overwrite the sample `tweet` inspected in the cells above.
new_tweets = [
    p.clean(BeautifulSoup(raw_tweet, 'html.parser').getText())
    for raw_tweet in tweets
]

In [10]:
# Rebind `tweets` to the cleaned list; every later cell works on the cleaned
# text rather than on the raw `df.text` Series.
tweets = new_tweets
#tweets

In [11]:
# NOTE(review): this rebinds `stopwords` -- imported in the first cell as the
# nltk corpus reader -- to a plain list of English stop words. No visible cell
# reads this list afterwards; the TF-IDF vectorizer is configured with
# stop_words='english' instead. Confirm whether the list is still needed.
stopwords = nltk.corpus.stopwords.words('english')

# English Snowball stemmer shared by tokenize_and_stem().
stemmer = SnowballStemmer("english")

In [12]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every word-like token.

    The text is split into sentences and then into word tokens with nltk.
    Tokens that contain no ASCII letter (pure numbers, punctuation) are
    discarded; each remaining token is passed through the module-level
    ``stemmer``.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list
        Stems in the order their tokens appear in ``text``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Keep only tokens with at least one ASCII letter.
            if re.search('[a-zA-Z]', token) is None:
                continue
            stems.append(stemmer.stem(token))
    return stems

In [13]:
def tokenize_only(text):
    """Tokenize ``text`` into lowercased word-like tokens, without stemming.

    The text is split into sentences and then into word tokens with nltk,
    and every token is lowercased. Tokens that contain no ASCII letter
    (pure numbers, punctuation) are discarded.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lowercased tokens in the order they appear in ``text``.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # Keep only tokens with at least one ASCII letter.
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [14]:
# Flatten the per-tweet token lists into two corpus-wide vocabularies:
# one with the stems, one with the lowercased original tokens.
totalvocab_stemmed = [
    stem
    for tweet_text in tweets
    for stem in tokenize_and_stem(tweet_text)
]
totalvocab_tokenized = [
    token
    for tweet_text in tweets
    for token in tokenize_only(tweet_text)
]

In [15]:
# Lookup table indexed by stem, holding the lowercased original token in
# 'words', so a stem can be mapped back to a readable word later on.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.1, min_df=.001,
                                   stop_words='english', max_features=1000,
                                   strip_accents='unicode', use_idf=True, 
                                   tokenizer=tokenize_and_stem, ngram_range=(1,2))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(tweets) #fit the vectorizer to tweets

print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
#print(terms)
len(terms)



Wall time: 8.78 s
(14640, 1000)


1000

In [17]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 8.09 s


In [18]:
# Pair every cleaned tweet with the cluster id K-means assigned to it.
Tweets = { 'tweet': tweets, 'cluster': clusters }

# NOTE(review): `index = [clusters]` passes a list containing the label list,
# which pandas treats as a list of index arrays (presumably yielding a
# one-level MultiIndex) -- confirm whether a plain `index = clusters` was intended.
frame = pd.DataFrame(Tweets, index = [clusters])

#print(frame)

frame['cluster'].value_counts() #number of tweets per cluster (clusters from 0 to 4)


1    10759
3     1211
0     1158
4      892
2      620
Name: cluster, dtype: int64

In [19]:
# Empty table that will hold, per cluster, its id, its top term and an aspect name.
clusterwords = pd.DataFrame(columns = ['Cluster', 'Word', "Name"])

In [20]:
print("Top terms per cluster:")

# For every centroid, the term (column) indices ordered from the largest
# centroid weight to the smallest.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Rows are collected first and the frame is built once at the end, so
# re-running this cell does not append duplicate rows to `clusterwords`.
cluster_rows = []
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    # Only the single highest-weighted term of each cluster is used.
    for ind in order_centroids[i, :1]:
        # terms[ind] is a stem (or a space-separated bigram of stems); look the
        # stems up in vocab_frame and take the first matching original word.
        word = ' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
        print(word, end=',')
        # Every cluster starts out as "Miscellaneous" until it is named.
        cluster_rows.append([i, word, "Miscellaneous"])
    print() #add whitespace
    print() #add whitespace

clusterwords = pd.DataFrame(cluster_rows, columns=['Cluster', 'Word', 'Name'])


Top terms per cluster:
Cluster 0 words: help,

Cluster 1 words: hour,

Cluster 2 words: service,

Cluster 3 words: 's,

Cluster 4 words: cancelled,



In [21]:
# Fixed dictionary mapping each aspect name to the top terms that identify it.
cluster_assignments = {
    'Customer Service': ['service', 'customer', 'phone'],
    'Delays': ['delayed', 'hour', 'hold', 'waited', 'late'],
    'Cancelled Flight(s)': ['cancelled', 'rebook'],
    'Please Help': ['help', 'please', 'need'],
    'Physical location': ['bag', 'gate', 'check', 'agent', 'lost', 'connecting'],
}


def get_key(val):
    """Return the aspect name whose term list contains ``val``.

    Aspects are searched in the order they appear in ``cluster_assignments``
    and the first match wins. Returns ``None`` when no aspect lists ``val``.
    """
    for aspect, aspect_terms in cluster_assignments.items():
        if val in aspect_terms:
            return aspect
    return None

In [26]:
# Give every cluster a human-readable aspect name based on its top term and
# copy that name onto the cluster's tweets as the 'reason' column.
for i in range(num_clusters):
    # Match on the 'Cluster' column only, not on any column holding the value i.
    in_cluster = clusterwords['Cluster'] == i
    cluster_terms = clusterwords.loc[in_cluster, 'Word']

    cluster_name = "Miscellaneous"
    if not cluster_terms.empty:
        # The stored word carries a leading space from the printing cell;
        # remove spaces before looking it up.
        top_word = cluster_terms.values[0].replace(" ", "")
        # get_key returns None for words missing from cluster_assignments.
        cluster_name = get_key(top_word) or "Miscellaneous"

    clusterwords.loc[in_cluster, 'Name'] = cluster_name
    frame.loc[(frame['cluster'] == i), 'reason'] = cluster_name

In [27]:
# Show each cluster with its top term and the aspect name assigned to it.
clusterwords

Unnamed: 0,Cluster,Word,Name
0,0,help,Please Help
1,1,hour,Delays
2,2,service,Customer Service
3,3,'s,Miscellaneous
4,4,cancelled,Cancelled Flight(s)


In [28]:
# Show the cleaned tweets with their cluster id and assigned aspect name ('reason').
frame

Unnamed: 0,tweet,cluster,reason
1,What said.,1,Delays
1,plus you've added commercials to the experienc...,1,Delays
1,I didn't today... Must mean I need to take ano...,1,Delays
3,"it's really aggressive to blast obnoxious ""ent...",3,Miscellaneous
3,and it's a really big bad thing about it,3,Miscellaneous
...,...,...,...
1,thank you we got on a different flight to Chic...,1,Delays
3,leaving over minutes Late Flight. No warnings ...,3,Miscellaneous
0,Please bring American Airlines to,0,Please Help
1,"you have my money, you change my flight, and d...",1,Delays


In [24]:
#https://codereview.stackexchange.com/questions/249329/finding-the-most-frequent-words-in-pandas-dataframe
# for i in range(num_clusters):
#     words = clusterwords[clusterwords.isin([i]).any(axis=1)]
#     word_count = Counter(" ".join(words.Word).split()).most_common(10)
#     word_frequency = pd.DataFrame(word_count, columns = ['Word', 'Frequency'])
#     print(word_frequency)
#     print("Cluster ", i, "'s most frequent word is: ", word_frequency.Word)

In [25]:
#import sklearn.externals.joblib
import joblib

# Persist the fitted KMeans model to disk.
joblib.dump(km,  'doc_cluster.pkl')

# Reload the model from the file just written.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
km = joblib.load('doc_cluster.pkl')