In [156]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Read the tweets table from the local data directory.
# (An alternative input used previously was "../input/sample.csv".)
data = pd.read_csv(filepath_or_buffer='data/twcs.csv')

In [157]:
# Sanity check: size of the loaded table as (rows, columns).
data.shape

(1904290, 7)

In [158]:
# Inbound tweets that are not a reply to any other tweet.
# NOTE(review): this selects the same rows as `first_inbound` in the next cell and
# is not referenced by any cell visible here -- confirm whether it is still needed.
initial_inbound_tweets = data.loc[data['in_response_to_tweet_id'].isna() & data['inbound']]

In [159]:
# Conversation openers: inbound tweets that do not reply to any earlier tweet.
first_inbound = data.loc[data['inbound'] & data['in_response_to_tweet_id'].isna()]

# Pair every opener (columns suffixed _x) with each tweet sent in reply to it
# (columns suffixed _y).
inbounds_and_outbounds = first_inbound.merge(
    data,
    left_on='tweet_id',
    right_on='in_response_to_tweet_id',
)
print(inbounds_and_outbounds.columns)

# Keep only the pairs whose reply is not inbound, i.e. the reply came from the company.
inbounds_and_outbounds = inbounds_and_outbounds[~inbounds_and_outbounds['inbound_y']]

Index(['tweet_id_x', 'author_id_x', 'inbound_x', 'created_at_x', 'text_x',
       'response_tweet_id_x', 'in_response_to_tweet_id_x', 'tweet_id_y',
       'author_id_y', 'inbound_y', 'created_at_y', 'text_y',
       'response_tweet_id_y', 'in_response_to_tweet_id_y'],
      dtype='object')


In [160]:
# Number of opener/reply pairs per replying account.
total_company_tweets = inbounds_and_outbounds.groupby("author_id_y")["tweet_id_x"].count()

# The five replying accounts with the highest volume.
total_company_tweets.sort_values(ascending=False).head(5)

author_id_y
AppleSupport    56197
AmazonHelp      55389
Uber_Support    25602
Delta           20492
SpotifyCares    18152
Name: tweet_id_x, dtype: int64

In [161]:
# NOTE(review): despite its name, this frame holds the conversations answered by
# 'AmazonHelp', not AppleSupport. The name is kept because later cells reference it.
apple_support = inbounds_and_outbounds.loc[inbounds_and_outbounds['author_id_y'].eq('AmazonHelp')]

In [162]:
def format_tweet(tweet):
    """Strip Twitter entities and digits from the text of a tweet.

    The text is split on whitespace; every word that starts with '@' (handle),
    '#' (hashtag) or 'http' (link) is dropped and the remaining words are
    re-joined with single spaces. All digit characters are then removed.
    Letters and punctuation are left as they are, and a word is only dropped
    when the prefix is its very first character (so '.@user' is kept).
    """
    entity_prefixes = ('@', '#', 'http')
    kept_words = (word for word in tweet.split() if not word.startswith(entity_prefixes))
    without_entities = " ".join(kept_words)

    # Only digits are filtered out here; other non-alphabetic characters survive.
    return ''.join(filter(lambda char: not char.isdigit(), without_entities))

# Clean the opening tweet of every selected conversation.
formatted_tweets = apple_support['text_x'].map(format_tweet)

# Quick comparison pre-formatting & post-formatting, using the first tweet.
print(apple_support['text_x'].iloc[0], "->", formatted_tweets.iloc[0], sep="\n")

Dear @AmazonHelp  It feels  really devastating that once you promise something then you break it. I didn't get my cashback yet. See it. https://t.co/6mQg6QpQdc
->
Dear It feels really devastating that once you promise something then you break it. I didn't get my cashback yet. See it.


In [170]:
import re
no_features = 10000

# Both vectorizers get the same settings: ignore terms found in more than 95% of
# the tweets or in fewer than 2 tweets, keep at most `no_features` terms, and
# drop English stop words.
vectorizer_options = dict(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')


def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's terms, ordered by matrix column.

    `get_feature_names()` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out()` (added in 1.0). Prefer the new method and fall
    back to the old one so the cell runs on either side of that change.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# TF-IDF weighted document-term matrix.
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf = tfidf_vectorizer.fit_transform(formatted_tweets)
tfidf_feature_names = _vocabulary_terms(tfidf_vectorizer)

# LDA is a probabilistic graphical model over word counts, so it is given raw
# term counts rather than TF-IDF weights.
tf_vectorizer = CountVectorizer(**vectorizer_options)
tf = tf_vectorizer.fit_transform(formatted_tweets)
tf_feature_names = _vocabulary_terms(tf_vectorizer)

In [171]:
no_topics = 10

# Factorise the TF-IDF matrix with NMF.
# NOTE(review): the `alpha` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2 (replaced by `alpha_W` / `alpha_H`, which are scaled differently), so this
# call needs an older scikit-learn -- confirm the target version before migrating.
nmf = NMF(
    n_components=no_topics,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=1,
)
nmf.fit(tfidf)

# Fit LDA on the raw term counts using the online learning method.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
lda.fit(tf)

In [172]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the strongest terms and example documents of every topic.

    Parameters
    ----------
    H : 2-D array indexed as [topic, term]; one row of term weights per topic.
    W : 2-D array indexed as [document, topic]; one column of weights per topic.
    feature_names : sequence mapping a term column of H to its text.
    documents : object supporting positional `.iloc[...]` lookup (e.g. a pandas
        Series) whose positions match the rows of W.
    no_top_words : number of highest-weighted terms to list per topic.
    no_top_documents : number of highest-weighted documents to show per topic.

    Nothing is returned; the report is written to standard output.
    """
    for topic_number, term_weights in enumerate(H, start=1):
        print("Topic {}".format(topic_number))

        # Term columns ordered from the largest weight to the smallest.
        strongest_terms = term_weights.argsort()[::-1][:no_top_words]
        term_list = ", ".join(feature_names[term] for term in strongest_terms)
        print("Feature Names: {}".format(term_list))

        # Document rows ordered by their weight for the current topic.
        strongest_docs = np.argsort(W[:, topic_number - 1])[::-1][:no_top_documents]
        for example_number, doc_position in enumerate(strongest_docs, start=1):
            print("Example #{}: {}".format(example_number, documents.iloc[doc_position]))
        print()

In [173]:
# For each model: W holds the per-document topic weights returned by transform(),
# H holds the fitted per-topic term weights (components_).
nmf_W, nmf_H = nmf.transform(tfidf), nmf.components_
lda_W, lda_H = lda.transform(tf), lda.components_

# How many terms and example tweets to print for each topic.
no_top_words = no_top_documents = 5

In [174]:
# NMF topics, illustrated with the original (unformatted) tweet text.
display_topics(
    H=nmf_H,
    W=nmf_W,
    feature_names=tfidf_feature_names,
    documents=apple_support['text_x'],
    no_top_words=no_top_words,
    no_top_documents=no_top_documents,
)

Topic 1
Feature Names: order, placed, id, cancelled, cancel
Example #1: @AmazonHelp Where's my order?!!!!!
Example #2: Why can't I order toasted ravioli @84042
Example #3: #AmazonPrime #Amazonin @84039 #order #delivered but not #received #deliveryissue #nofollowup #complaintmade Order#407-1918105-8965111
Example #4: @84039 my order no is

404-6253992-4618766
Example #5: @84039  order Quary #405-7877725-5465914

Topic 2
Feature Names: amazon, india, pay, account, balance
Example #1: Amazon Studentの2000円クーポン
使えないんだけどなんで？
貰えるのを前提に登録したのに…
ちゃんとAmazon販売配送商品なのに…
かれこれ2時間格闘してる泣きそう
Example #2: Amazon primeはいいぞ。 https://t.co/LpWUpytkz0
Example #3: 昨日からスピッツしばり。

Amazon Music本当に便利だからこのためだけにprime会員になってもいいレベルだと思うの…

#AmazonPrime https://t.co/jBwAmbAdjb
Example #4: Amazon bar行きたかった人生であった。
Example #5: amazon bar良さげやな

Topic 3
Feature Names: prime, membership, days, time, member
Example #1: @84042 How is this Prime? https://t.co/zPzOXmJiq2
Example #2: Prime readingとkindle unlimitedの区別ってどうされるの？両会員だけどアンリミ

In [175]:
# LDA topics, illustrated with the original (unformatted) tweet text.
display_topics(
    H=lda_H,
    W=lda_W,
    feature_names=tf_feature_names,
    documents=apple_support['text_x'],
    no_top_words=no_top_words,
    no_top_documents=no_top_documents,
)

Topic 1
Feature Names: account, time, amazon, money, deliver
Example #1: @amazonhelp do you make most of your money by purposely making it impossible for customers to receive help on incorrect orders?
Example #2: Has @85465 increased the rate from 499pa to 999pa? Is it even for existing customers? @84039
Example #3: Too many times now, my prime orders have been delayed, @84039! Your reliability is becoming questionable.
#AmazonPrime
Example #4: Always used to have really reliable prime deliveries from @84028 but the last two have been a mess 🤨
Example #5: .@84042 sooo if some1 hacked my acct, changed e-mail address, purchased things on this acct, what should I do? Your cust svc no help at all

Topic 2
Feature Names: just, delivery, time, package, come
Example #1: @84042 The language barrier with your CSR’s SUCKS! I have called 4 times talked to 4 different people who can’t understand shit I say!
Example #2: @84398 misses delivery window once again to continue their streak of not a sing

In [176]:
import pyLDAvis
import pyLDAvis.sklearn
import warnings

# Hide DeprecationWarnings so they do not clutter the rendered visualisation.
warnings.filterwarnings("ignore", category=DeprecationWarning)
pyLDAvis.enable_notebook()

# The LDA model was fitted on the raw count matrix `tf` built by `tf_vectorizer`,
# so the visualisation must be given that same matrix and vectorizer. Passing the
# TF-IDF matrix instead makes the document lengths, term frequencies and
# document-topic distributions come from data the model was never trained on.
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

In [177]:
pyLDAvis.enable_notebook()

# The NMF model was fitted on the TF-IDF matrix `tfidf` built by `tfidf_vectorizer`,
# so it is visualised with that same matrix and vectorizer rather than the raw
# counts prepared for LDA.
# NOTE(review): pyLDAvis is designed around LDA; the term frequencies shown for
# NMF are sums of TF-IDF weights, not word counts -- interpret with care.
pyLDAvis.sklearn.prepare(nmf, tfidf, tfidf_vectorizer)

  return dists / dists.sum(axis=1)[:, None]
  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
  relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
  relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
