In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import nltk

<ul><li>The inbound feature is important because it allows you to distinguish between company and customer.</li>
<li>Obviously the 'text' function is the main source of information.</li>
<li>I will apply natural language processing techniques. In my opinion, it is possible to address the problem in two ways:</li>
    <ul><li>Catalog customer tweets and identify the most discussed topics;</li>
        <li>Create a model that adapts to the tweets of the companies and try to answer a specific topic of a customer (to do)</li></ul>
</ul>    

# 1. Geting Data

In [2]:
df = pd.read_csv('twcs.csv')

### 1. Data Exploration

In [3]:
df.sample(10)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
1037970,1149252,390440,True,Sun Oct 15 17:06:48 +0000 2017,@AskPlayStation how come my audio with the Pla...,1149251.0,
1027912,1137919,388026,True,Tue Oct 24 02:07:46 +0000 2017,Damn @115937 is from San Jose and my brother ...,1137918.0,
1905093,767149,ChaseSupport,False,Sat Oct 07 14:57:29 +0000 2017,@303346 1/2 We're sorry to hear about your rec...,,767148.0
970880,1077275,374155,True,Fri Oct 13 14:19:58 +0000 2017,@AppleSupport Thanks a million. It is just res...,1077272.0,1077276.0
1419301,1561934,AmazonHelp,False,Sat Nov 04 09:57:28 +0000 2017,@482486 This feature is not available yet. I'l...,1561935.0,1561936.0
1217598,1343129,267514,True,Mon Oct 16 08:01:10 +0000 2017,@115873 @Uber_Support @4018 Their numbers didn...,1343128.0,1343130.0
1512284,1659600,Uber_Support,False,Tue Oct 17 14:49:10 +0000 2017,@160540 Here to help! Send us a note here; htt...,,1659601.0
817103,910578,336276,True,Fri Oct 13 18:44:03 +0000 2017,@115858 That doesn’t close me out of apps and ...,910577.0,910579.0
905203,1005402,335124,True,Sun Oct 22 19:21:38 +0000 2017,@GWRHelp I've already rung your helpline. This...,,1005401.0
252198,290519,AppleSupport,False,Mon Oct 09 15:58:31 +0000 2017,@185186 Let's look into this particular issue ...,,290518.0


In [4]:
df.shape

(2811774, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 7 columns):
tweet_id                   int64
author_id                  object
inbound                    bool
created_at                 object
text                       object
response_tweet_id          object
in_response_to_tweet_id    float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 131.4+ MB


### 1.2 Checking missing data

In [6]:
df.isnull().sum()

tweet_id                         0
author_id                        0
inbound                          0
created_at                       0
text                             0
response_tweet_id          1040629
in_response_to_tweet_id     794335
dtype: int64

# 2. Feature Extraction

In [7]:
#first inbound = Richiesta iniziale di un cliente
first_inbound = df[pd.isnull(df.in_response_to_tweet_id) & df.inbound]

inbOutb = pd.merge(first_inbound, df, left_on='tweet_id', 
                                  right_on='in_response_to_tweet_id').sample(frac=1)

# Filter to only outbound replies (from companies)
inbOutb = inbOutb[inbOutb.inbound_y ^ True]

In [8]:
inbOutb.shape


(794299, 14)

<b>Now the dataset is doubled in size (features), as each line contains:</b>
<ul><li>a customer request (text_x)</li>
    <li>a reply from the company (text_y)</li>
    <li>Related related features</li></ul>

<b>From which we can easily verify that:</b>
<ul><li>the 'inbound_x' feature is always True;</li>
    <li>the 'inbound_y' feature is always False;</li>

In [9]:
inbOutb.isnull().sum()

tweet_id_x                        0
author_id_x                       0
inbound_x                         0
created_at_x                      0
text_x                            0
response_tweet_id_x               0
in_response_to_tweet_id_x    794299
tweet_id_y                        0
author_id_y                       0
inbound_y                         0
created_at_y                      0
text_y                            0
response_tweet_id_y          530528
in_response_to_tweet_id_y         0
dtype: int64

The <b>'in_response_to_tweet_id_x'</b> feature is totally composed of <b>NaN.</b> So this feature will be dropped.

### 2.1 Drop useless features.

In [10]:
inbOutb.columns

Index(['tweet_id_x', 'author_id_x', 'inbound_x', 'created_at_x', 'text_x',
       'response_tweet_id_x', 'in_response_to_tweet_id_x', 'tweet_id_y',
       'author_id_y', 'inbound_y', 'created_at_y', 'text_y',
       'response_tweet_id_y', 'in_response_to_tweet_id_y'],
      dtype='object')

In [11]:
toDrop = ['tweet_id_x', 'inbound_x','response_tweet_id_x', 'in_response_to_tweet_id_x', 
          'tweet_id_y', 'inbound_y','response_tweet_id_y', 'in_response_to_tweet_id_y']

In [12]:
inbOutb.drop(toDrop, axis=1, inplace=True)
print('inbOutb shape: ', inbOutb.shape)

inbOutb shape:  (794299, 6)


In [13]:
inbOutb.sample(5)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y
277971,361414,Fri Oct 13 03:12:48 +0000 2017,@hulu_support I'm getting this message when I ...,hulu_support,Sat Oct 14 16:03:24 +0000 2017,@361414 Sorry for the delay! Try power cycling...
635209,644104,Thu Nov 09 21:17:05 +0000 2017,@AmericanAir gate k16 at OHare today was a dam...,AmericanAir,Thu Nov 09 21:43:02 +0000 2017,"@644104 We don't want you feeling this way, ou..."
732023,717611,Thu Nov 16 00:11:03 +0000 2017,hey @117735 @AppleSupport ... both myself and ...,AppleSupport,Thu Nov 16 01:12:31 +0000 2017,@717611 Thank you for reaching out. Please nav...
202570,299257,Mon Oct 23 19:28:06 +0000 2017,That's over three hours in two days spent on t...,VirginAtlantic,Mon Oct 23 21:15:02 +0000 2017,@299257 @299258 Sorry to hear you're not happy...
830041,790310,Mon Nov 27 16:02:00 +0000 2017,@115911 @TMobileHelp Beste T Mobile ik heb sin...,TMobileHelp,Mon Nov 27 16:05:49 +0000 2017,"@790310 Hey there, welcome to T-Force! We woul..."


# 3. Text Processing

In [14]:
inbOutb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 794299 entries, 538902 to 769548
Data columns (total 6 columns):
author_id_x     794299 non-null object
created_at_x    794299 non-null object
text_x          794299 non-null object
author_id_y     794299 non-null object
created_at_y    794299 non-null object
text_y          794299 non-null object
dtypes: object(6)
memory usage: 42.4+ MB


In [15]:
inbOutb.shape

(794299, 6)

### 3.1 Lower Casing

Lower casing is a common text preprocessing technique. The idea is to convert the input text into same casing format so that 'text', 'Text' and 'TEXT' are treated the same way.

This is more helpful for text featurization techniques like frequency, tfidf as it helps to combine the same words together thereby reducing the duplication and get correct counts / tfidf values.

This may not be helpful when we do tasks like Part of Speech tagging (where proper casing gives some information about Nouns and so on) and Sentiment Analysis (where upper casing refers to anger and so on)

In [16]:
def remove_uppercase(text):
    text_lowercase = ' '.join(x.lower() for x in text.split())# It will discard all uppercases
    return text_lowercase

In [17]:
inbOutb['text_x_clean'] = inbOutb['text_x'].apply(lambda x: remove_uppercase(x))
inbOutb['text_y_clean'] = inbOutb['text_y'].apply(lambda x: remove_uppercase(x))
#in modo da poter rimuovere i nomi delle compagnie
inbOutb['author_id_y'] = inbOutb['author_id_y'].apply(lambda x: remove_uppercase(x)) 

In [18]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean
538902,568875,Wed Oct 18 09:54:26 +0000 2017,Hey @GloCare your LTE data service is becoming...,glocare,Wed Oct 18 13:08:38 +0000 2017,"@568875 Afternoon Tari, may we know the exact ...",hey @glocare your lte data service is becoming...,"@568875 afternoon tari, may we know the exact ..."
127066,230636,Fri Dec 01 14:34:28 +0000 2017,"If I can’t access tracking, how will I know wh...",amazonhelp,Fri Dec 01 15:05:00 +0000 2017,"@230636 Apologies for the concern, Stephanie! ...","if i can’t access tracking, how will i know wh...","@230636 apologies for the concern, stephanie! ..."
132865,236039,Fri Dec 01 20:57:53 +0000 2017,soooo glad i pay for amazon prime for my packa...,amazonhelp,Fri Dec 01 20:59:40 +0000 2017,@236039 Did we miss the delivery date in the c...,soooo glad i pay for amazon prime for my packa...,@236039 did we miss the delivery date in the c...


### 3.2 Remove Puntuaction

Punctuation can provide grammatical context to a sentence which supports our understanding. But for our vectorizer which counts the number of words and not the context, it does not add value, so we remove all special characters. eg: How are you?->How are you.

In [19]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
#Function to remove Punctuation
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])# It will discard all punctuations
    return text_nopunct
inbOutb['text_x_clean'] = inbOutb['text_x_clean'].apply(lambda x: remove_punct(x))
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].apply(lambda x: remove_punct(x))
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean
538902,568875,Wed Oct 18 09:54:26 +0000 2017,Hey @GloCare your LTE data service is becoming...,glocare,Wed Oct 18 13:08:38 +0000 2017,"@568875 Afternoon Tari, may we know the exact ...",hey glocare your lte data service is becoming ...,568875 afternoon tari may we know the exact is...
127066,230636,Fri Dec 01 14:34:28 +0000 2017,"If I can’t access tracking, how will I know wh...",amazonhelp,Fri Dec 01 15:05:00 +0000 2017,"@230636 Apologies for the concern, Stephanie! ...",if i can’t access tracking how will i know whe...,230636 apologies for the concern stephanie cou...
132865,236039,Fri Dec 01 20:57:53 +0000 2017,soooo glad i pay for amazon prime for my packa...,amazonhelp,Fri Dec 01 20:59:40 +0000 2017,@236039 Did we miss the delivery date in the c...,soooo glad i pay for amazon prime for my packa...,236039 did we miss the delivery date in the co...


### 3.3 Removing usernames
rimuovere nomi utenti, compagnie da text_x_clean<br>
rimuovere nomi utenti, compagnie da text_y_clean

In [21]:
companies = inbOutb['author_id_y'].unique()

inbOutb['text_x_clean'] = inbOutb['text_x_clean'].str.replace('\d+', '')
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].str.replace('\d+', '')

inbOutb['text_x_clean'] = inbOutb['text_x_clean'].str.replace('|'.join(companies), '')
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].str.replace('|'.join(companies), '')

inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean
538902,568875,Wed Oct 18 09:54:26 +0000 2017,Hey @GloCare your LTE data service is becoming...,glocare,Wed Oct 18 13:08:38 +0000 2017,"@568875 Afternoon Tari, may we know the exact ...",hey your lte data service is becoming terribl...,afternoon tari may we know the exact issue be...
127066,230636,Fri Dec 01 14:34:28 +0000 2017,"If I can’t access tracking, how will I know wh...",amazonhelp,Fri Dec 01 15:05:00 +0000 2017,"@230636 Apologies for the concern, Stephanie! ...",if i can’t access tracking how will i know whe...,apologies for the concern stephanie could you...
132865,236039,Fri Dec 01 20:57:53 +0000 2017,soooo glad i pay for amazon prime for my packa...,amazonhelp,Fri Dec 01 20:59:40 +0000 2017,@236039 Did we miss the delivery date in the c...,soooo glad i pay for amazon prime for my packa...,did we miss the delivery date in the confirma...


In [22]:
inbOutb.shape

(794299, 8)

### 3.4 Removal of stopwords
Stopwords are commonly occuring words in a language like 'the', 'a' and so on. They can be removed from the text most of the times, as they don't provide valuable information for downstream analysis. In cases like Part of Speech tagging, we should not remove them as provide very valuable information about the POS.

In [23]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

<b>Similarly we can also get the list for other languages as well and use them.

In [24]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

inbOutb['text_wo_stop'] = inbOutb['text_x_clean'].apply(lambda x: remove_stopwords(x))
inbOutb['text_wo_stop'] = inbOutb['text_y_clean'].apply(lambda y: remove_stopwords(y))

inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop
538902,568875,Wed Oct 18 09:54:26 +0000 2017,Hey @GloCare your LTE data service is becoming...,glocare,Wed Oct 18 13:08:38 +0000 2017,"@568875 Afternoon Tari, may we know the exact ...",hey your lte data service is becoming terribl...,afternoon tari may we know the exact issue be...,afternoon tari may know exact issue experience...
127066,230636,Fri Dec 01 14:34:28 +0000 2017,"If I can’t access tracking, how will I know wh...",amazonhelp,Fri Dec 01 15:05:00 +0000 2017,"@230636 Apologies for the concern, Stephanie! ...",if i can’t access tracking how will i know whe...,apologies for the concern stephanie could you...,apologies concern stephanie could please confi...
132865,236039,Fri Dec 01 20:57:53 +0000 2017,soooo glad i pay for amazon prime for my packa...,amazonhelp,Fri Dec 01 20:59:40 +0000 2017,@236039 Did we miss the delivery date in the c...,soooo glad i pay for amazon prime for my packa...,did we miss the delivery date in the confirma...,miss delivery date confirmation email check ht...


### 3.5  Checking Most Common Worlds
Previously, we just removed commonly occurring words in a general sense. We can also remove commonly occurring words from our text data First, let’s check the 10 most frequently occurring words in our text data then take call to remove or retain.

In [25]:
#common worlds

freqX = pd.Series(' '.join(inbOutb['text_x_clean']).split()).value_counts()[:10]
freqY = pd.Series(' '.join(inbOutb['text_y_clean']).split()).value_counts()[:10]
print('FREQ X: \n',freqX,'\nFREQ Y: \n', freqY)

FREQ X: 
 to     450771
i      402429
the    394086
my     310607
a      289704
and    250549
is     210765
for    199934
on     184365
you    172960
dtype: int64 
FREQ Y: 
 to      597010
you     562841
the     438571
your    355927
we      300935
us      281678
for     280735
can     262541
a       257072
this    245768
dtype: int64


In [26]:
#removing them
freqX = list(freqX.index)
freqY = list(freqY.index)
inbOutb['text_x_clean'] = inbOutb['text_x_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in freqX))
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in freqY))

### 3.6 Checking Most Rare Worlds

In [27]:
rareX = pd.Series(' '.join(inbOutb['text_x_clean']).split()).value_counts()[-100:]
rareY = pd.Series(' '.join(inbOutb['text_y_clean']).split()).value_counts()[-100:]
print('RARE X: \n',rareX,'\nRARE Y: \n', rareY)

RARE X: 
 httpstcoryaustq                                                                                                                                                                                                                 1
orz                                                                                                                                                                                                                             1
’order                                                                                                                                                                                                                          1
возврата                                                                                                                                                                                                                        1
httpstcowjgxwcckd                                                                     

In [28]:
#removing them
rareX = list(rareX.index)
rareY = list(rareY.index)
inbOutb['text_x_clean'] = inbOutb['text_x_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in rareX))
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in rareY))

# 4. Tokenizing

Tokenizing separates text into units such as sentences or words. It gives structure to previously unstructured text.

In [29]:
import re

# Function to Tokenize words
def tokenize(text):
    tokens = re.split('\W+', text) #W+ means that either a word character (A-Za-z0-9_) or a dash (-) can go there.
    return tokens


In [30]:
inbOutb['text_x_tokenized'] = inbOutb['text_x_clean'].apply(lambda x: tokenize(x.lower())) 
inbOutb['text_y_tokenized'] = inbOutb['text_y_clean'].apply(lambda x: tokenize(x.lower()))
#We convert to lower.

In [31]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop,text_x_tokenized,text_y_tokenized
538902,568875,Wed Oct 18 09:54:26 +0000 2017,Hey @GloCare your LTE data service is becoming...,glocare,Wed Oct 18 13:08:38 +0000 2017,"@568875 Afternoon Tari, may we know the exact ...",hey your lte data service becoming terrible do...,afternoon tari may know exact issue being expe...,afternoon tari may know exact issue experience...,"[hey, your, lte, data, service, becoming, terr...","[afternoon, tari, may, know, exact, issue, bei..."
127066,230636,Fri Dec 01 14:34:28 +0000 2017,"If I can’t access tracking, how will I know wh...",amazonhelp,Fri Dec 01 15:05:00 +0000 2017,"@230636 Apologies for the concern, Stephanie! ...",if can’t access tracking how will know when th...,apologies concern stephanie could please confi...,apologies concern stephanie could please confi...,"[if, can, t, access, tracking, how, will, know...","[apologies, concern, stephanie, could, please,..."
132865,236039,Fri Dec 01 20:57:53 +0000 2017,soooo glad i pay for amazon prime for my packa...,amazonhelp,Fri Dec 01 20:59:40 +0000 2017,@236039 Did we miss the delivery date in the c...,soooo glad pay amazon prime package take days not,did miss delivery date in confirmation email c...,miss delivery date confirmation email check ht...,"[soooo, glad, pay, amazon, prime, package, tak...","[did, miss, delivery, date, in, confirmation, ..."


### 4.1 Remove StopWords

Stopwords are common words that will likely appear in any text. They don’t tell us much about our data so we remove them. eg: silver or lead is fine for me-> silver, lead, fine.

In [32]:
import nltk

stopword = nltk.corpus.stopwords.words('english') # All English Stopwords

# Function to remove Stopwords
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]# To remove all stopwords
    return text

inbOutb['text_x_tokenized'] = inbOutb['text_x_tokenized'].apply(lambda x: remove_stopwords(x))
inbOutb['text_y_tokenized'] = inbOutb['text_y_tokenized'].apply(lambda x: remove_stopwords(x))

In [33]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop,text_x_tokenized,text_y_tokenized
538902,568875,Wed Oct 18 09:54:26 +0000 2017,Hey @GloCare your LTE data service is becoming...,glocare,Wed Oct 18 13:08:38 +0000 2017,"@568875 Afternoon Tari, may we know the exact ...",hey your lte data service becoming terrible do...,afternoon tari may know exact issue being expe...,afternoon tari may know exact issue experience...,"[hey, lte, data, service, becoming, terrible, ...","[afternoon, tari, may, know, exact, issue, exp..."
127066,230636,Fri Dec 01 14:34:28 +0000 2017,"If I can’t access tracking, how will I know wh...",amazonhelp,Fri Dec 01 15:05:00 +0000 2017,"@230636 Apologies for the concern, Stephanie! ...",if can’t access tracking how will know when th...,apologies concern stephanie could please confi...,apologies concern stephanie could please confi...,"[access, tracking, know, arrived, arrive, got,...","[apologies, concern, stephanie, could, please,..."
132865,236039,Fri Dec 01 20:57:53 +0000 2017,soooo glad i pay for amazon prime for my packa...,amazonhelp,Fri Dec 01 20:59:40 +0000 2017,@236039 Did we miss the delivery date in the c...,soooo glad pay amazon prime package take days not,did miss delivery date in confirmation email c...,miss delivery date confirmation email check ht...,"[soooo, glad, pay, amazon, prime, package, tak...","[miss, delivery, date, confirmation, email, ch..."


# 5. Stemming

<b>Tokenizing separates text into units such as sentences or words. It gives structure to previously unstructured text.

Stemming helps reduce a word to its stem form. It often makes sense to treat related words in the same way. It removes suffices, like “ing”, “ly”, “s”, etc. by a simple rule-based approach.
It reduces the corpus of words but often the actual words get neglected. eg: Entitling,Entitled->Entitl

In [34]:
from nltk.stem.porter import PorterStemmer

ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [35]:
# inbOutb['text_x_stemmed'] = inbOutb['text_x_nostop'].apply(lambda x: stemming(x))
# inbOutb['text_y_stemmed'] = inbOutb['text_y_nostop'].apply(lambda x: stemming(x))

# 6. Lemmatizing

Lemmatizing derives the canonical form (‘lemma’) of a word. i.e the root form. It is better than stemming as it uses a dictionary-based approach i.e a morphological analysis to the root word.eg: Entitling, Entitled->Entitle <br>
In Short, Stemming is typically faster as it simply chops off the end of the word, without understanding the context of the word. Lemmatizing is slower and more accurate as it takes an informed analysis with the context of the word in mind.

In [36]:
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

In [37]:
inbOutb['text_x_lemmatized'] = inbOutb['text_x_tokenized'].apply(lambda x: lemmatizing(x))
inbOutb['text_y_lemmatized'] = inbOutb['text_y_tokenized'].apply(lambda x: lemmatizing(x))

In [38]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop,text_x_tokenized,text_y_tokenized,text_x_lemmatized,text_y_lemmatized
538902,568875,Wed Oct 18 09:54:26 +0000 2017,Hey @GloCare your LTE data service is becoming...,glocare,Wed Oct 18 13:08:38 +0000 2017,"@568875 Afternoon Tari, may we know the exact ...",hey your lte data service becoming terrible do...,afternoon tari may know exact issue being expe...,afternoon tari may know exact issue experience...,"[hey, lte, data, service, becoming, terrible, ...","[afternoon, tari, may, know, exact, issue, exp...","[hey, lte, data, service, becoming, terrible, ...","[afternoon, tari, may, know, exact, issue, exp..."
127066,230636,Fri Dec 01 14:34:28 +0000 2017,"If I can’t access tracking, how will I know wh...",amazonhelp,Fri Dec 01 15:05:00 +0000 2017,"@230636 Apologies for the concern, Stephanie! ...",if can’t access tracking how will know when th...,apologies concern stephanie could please confi...,apologies concern stephanie could please confi...,"[access, tracking, know, arrived, arrive, got,...","[apologies, concern, stephanie, could, please,...","[access, tracking, know, arrived, arrive, got,...","[apology, concern, stephanie, could, please, c..."
132865,236039,Fri Dec 01 20:57:53 +0000 2017,soooo glad i pay for amazon prime for my packa...,amazonhelp,Fri Dec 01 20:59:40 +0000 2017,@236039 Did we miss the delivery date in the c...,soooo glad pay amazon prime package take days not,did miss delivery date in confirmation email c...,miss delivery date confirmation email check ht...,"[soooo, glad, pay, amazon, prime, package, tak...","[miss, delivery, date, confirmation, email, ch...","[soooo, glad, pay, amazon, prime, package, tak...","[miss, delivery, date, confirmation, email, ch..."


In [39]:
# inbOutb.to_csv('inbOutb.csv')

# 7. Count Vectorizer

In [40]:
## Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
questions = inbOutb['text_x_clean'].dropna()
q = np.array(questions)

In [41]:
# Initialise the count vectorizer with the English stop words
countV = CountVectorizer(stop_words='english',
                         max_features=10000)

# Fit and transform the processed titles
bagQuestions = countV.fit_transform(q)

print('BOW Questions: ',bagQuestions.shape)

BOW Questions:  (794299, 10000)


Wordcloud with the most common words in customer requests

# 8. Positive Vs Negative Tweet

In [42]:
import matplotlib.pyplot as plt

%matplotlib inline

tweets = pd.read_csv('twcs.csv')
inbound_tweets = tweets[tweets.inbound == True]
inbound_tweets['timestamp'] = pd.to_datetime(inbound_tweets['created_at']).dt.date


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### 8.1 Explore the dataset for inbound tweets:

In [43]:
display(inbound_tweets.head())
display(inbound_tweets.columns)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,timestamp
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,2017-10-31
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,2017-10-31
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,2017-10-31
6,8,115712,True,Tue Oct 31 21:45:10 +0000 2017,@sprintcare is the worst customer service,9610.0,,2017-10-31
8,12,115713,True,Tue Oct 31 22:04:47 +0000 2017,@sprintcare You gonna magically change your co...,111314.0,15.0,2017-10-31


Index(['tweet_id', 'author_id', 'inbound', 'created_at', 'text',
       'response_tweet_id', 'in_response_to_tweet_id', 'timestamp'],
      dtype='object')

### 8.2 Collect quantities by author:

In [None]:
count_by_author_id = inbound_tweets.groupby(['author_id']) \
    .count()[['tweet_id']] \
    .sort_values(['tweet_id'], ascending = False) \
    .rename({'tweet_id': 'qty'}, axis='columns')
plt.figure()
plt.plot(np.arange(len(count_by_author_id.index.values)), count_by_author_id.qty )
plt.xlabel('Author #')
plt.ylabel('Tweets Qty.')
plt.grid(True)
plt.show()

### 8.3 Quantities by Date:

In [None]:
count_by_date = inbound_tweets.groupby(['timestamp']) \
    .count()[['tweet_id']] \
    .sort_values(['timestamp'], ascending = True) \
    .rename({'tweet_id': 'qty'}, axis='columns')

display(count_by_date.head())
plt.figure()
plt.plot(count_by_date.index.values, count_by_date.qty )
plt.xlabel('Date')
plt.ylabel('Tweets Qty.')
plt.grid(True)
plt.gcf().autofmt_xdate()
plt.show()

### 8.4 Process the sentiment analysis using NLTK

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()

inbound_tweets['score'] = inbound_tweets.text.apply(lambda x: sentiment_analyzer.polarity_scores(x)['compound'])
display(inbound_tweets.head())

### 8.5 Check the Sentiment Analysis with an histogram of scores

In [None]:
hist_bins = 30
plt.figure(figsize=[10,5])
x = inbound_tweets[['score']]
scores_hist,edges = np.histogram(x, bins=hist_bins)
plt.bar(edges[:-1], scores_hist)
plt.grid(True)
plt.xlabel('Score')
plt.ylabel('Tweets Count')
plt.show()

### 8.6 Define limit for Pos. vs Neg. comment and compute results

In [None]:
limit_pos = 0.5
limit_neg = -0.5
limit_neg_per_author = 3
#plt.plot(inbound_tweets.index.values, inbound_tweets['score'], 'r')
#plt.axhline(0.5, 'g.')
inbound_tweets['is_pos'] = (inbound_tweets[['score']] > limit_pos)
inbound_tweets['is_neg'] = (inbound_tweets[['score']] < limit_neg)
inbound_tweets['is_other'] = ~inbound_tweets.is_pos & ~inbound_tweets.is_neg

def count_true(x):
    return np.sum(x == True)

scores_by_date = inbound_tweets.groupby(['timestamp']) \
    .agg({'tweet_id':'count', 'is_pos': count_true, 'is_neg': count_true, 'is_other': count_true}) \
    .rename({'tweet_id':'total'}, axis='columns')
display(scores_by_date.head())
x = scores_by_date.index.values
y1 = scores_by_date['is_pos']
y2 = scores_by_date['is_neg']

scores_by_author = inbound_tweets.groupby(['author_id']) \
    .agg({'tweet_id':'count','is_neg': count_true}) \
    .rename({'tweet_id':'total'}, axis='columns')
scores_by_author = scores_by_author.loc[scores_by_author['is_neg'] >= limit_neg_per_author]
display(scores_by_author.head())

#plt.scatter(x, y, alpha=0.5)
plt.figure(figsize=[10,5])
plt.plot(x, y1,'g-', x, y2, 'r--')
plt.xlabel('Date')
plt.ylabel('Qty.')
plt.legend(['Pos.Tweet', 'Neg.Tweet'])
plt.grid(True)
plt.show()
