In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import nltk

<ul><li>The inbound feature is important because it allows you to distinguish between company and customer.</li>
<li>Obviously the 'text' function is the main source of information.</li>
<li>I will apply natural language processing techniques. In my opinion, it is possible to address the problem in two ways:</li>
    <ul><li>Catalog customer tweets and identify the most discussed topics;</li>
        <li>Create a model that adapts to the tweets of the companies and try to answer a specific topic of a customer (to do)</li></ul>
</ul>    

# 1. Geting Data

In [2]:
df = pd.read_csv('twcs.csv')

### 1. Data Exploration

In [3]:
df.sample(10)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
195711,230120,170830,True,Wed Oct 04 19:00:53 +0000 2017,@AmazonHelp Already marked a email but you guy...,2.3011823012123008e+17,230123.0
203956,238736,AmazonHelp,False,Wed Oct 04 18:25:09 +0000 2017,@172787 Ideally it should have happened by now...,238737.0,238738.0
738949,825747,ChaseSupport,False,Wed Oct 18 01:22:58 +0000 2017,"@160010 Please DM your full name and zip code,...",825748.0,825746.0
745275,832893,219463,True,Fri Oct 13 07:33:31 +0000 2017,"@AmazonHelp Eure Reaktion erklaert, warum die ...",832894.0,832892.0
1248421,1377772,Uber_Support,False,Sat Oct 28 03:17:47 +0000 2017,"@440746 Here to help! Send us a note here, htt...",,1377773.0
1841527,1998603,AirbnbHelp,False,Fri Nov 03 19:20:20 +0000 2017,@590946 We want to help with this cancellation...,,1998604.0
1091642,1207672,403562,True,Wed Oct 25 09:14:06 +0000 2017,@SpotifyCares Can I have some help about stopp...,1207671.0,
2344108,2510790,AskPlayStation,False,Mon Oct 30 04:49:57 +0000 2017,@715674 Hi there. Glad to help! Please check y...,,2510791.0
2562239,2733137,sprintcare,False,Mon Nov 20 16:20:06 +0000 2017,@561268 You will be contacted. Team @116447,27331382733139.0,2733140.0
241614,279371,182606,True,Mon Oct 09 22:47:00 +0000 2017,@AmazonHelp https://t.co/nojv3hwcA9,279372.0,279370.0


In [4]:
df.shape

(2811774, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 7 columns):
tweet_id                   int64
author_id                  object
inbound                    bool
created_at                 object
text                       object
response_tweet_id          object
in_response_to_tweet_id    float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 131.4+ MB


### 1.2 Checking missing data

In [6]:
df.isnull().sum()

tweet_id                         0
author_id                        0
inbound                          0
created_at                       0
text                             0
response_tweet_id          1040629
in_response_to_tweet_id     794335
dtype: int64

# 2. Feature Extraction

In [7]:
#first inbound = Richiesta iniziale di un cliente
first_inbound = df[pd.isnull(df.in_response_to_tweet_id) & df.inbound]

inbOutb = pd.merge(first_inbound, df, left_on='tweet_id', 
                                  right_on='in_response_to_tweet_id').sample(frac=1)

# Filter to only outbound replies (from companies)
inbOutb = inbOutb[inbOutb.inbound_y ^ True]

In [8]:
inbOutb.shape


(794299, 14)

<b>Now the dataset is doubled in size (features), as each line contains:</b>
<ul><li>a customer request (text_x)</li>
    <li>a reply from the company (text_y)</li>
    <li>Related related features</li></ul>

<b>From which we can easily verify that:</b>
<ul><li>the 'inbound_x' feature is always True;</li>
    <li>the 'inbound_y' feature is always False;</li>

In [9]:
inbOutb.isnull().sum()

tweet_id_x                        0
author_id_x                       0
inbound_x                         0
created_at_x                      0
text_x                            0
response_tweet_id_x               0
in_response_to_tweet_id_x    794299
tweet_id_y                        0
author_id_y                       0
inbound_y                         0
created_at_y                      0
text_y                            0
response_tweet_id_y          530528
in_response_to_tweet_id_y         0
dtype: int64

The <b>'in_response_to_tweet_id_x'</b> feature is totally composed of <b>NaN.</b> So this feature will be dropped.

### 2.1 Drop useless features.

In [10]:
inbOutb.columns

Index(['tweet_id_x', 'author_id_x', 'inbound_x', 'created_at_x', 'text_x',
       'response_tweet_id_x', 'in_response_to_tweet_id_x', 'tweet_id_y',
       'author_id_y', 'inbound_y', 'created_at_y', 'text_y',
       'response_tweet_id_y', 'in_response_to_tweet_id_y'],
      dtype='object')

In [11]:
toDrop = ['tweet_id_x', 'inbound_x','response_tweet_id_x', 'in_response_to_tweet_id_x', 
          'tweet_id_y', 'inbound_y','response_tweet_id_y', 'in_response_to_tweet_id_y']

In [12]:
inbOutb.drop(toDrop, axis=1, inplace=True)
print('inbOutb shape: ', inbOutb.shape)

inbOutb shape:  (794299, 6)


In [13]:
inbOutb.sample(5)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y
568464,591714,Fri Nov 03 15:33:42 +0000 2017,SMH dog I️ did that I️ in the keyboard shit an...,AppleSupport,Fri Nov 03 20:29:00 +0000 2017,@591714 We'd be happy to look into what's goin...
398372,456007,Thu Nov 02 14:42:57 +0000 2017,@117157 Why is you mobile app not working. I k...,AskCiti,Thu Nov 02 16:06:00 +0000 2017,"@456007 Yes, sorry for this! Our app is having..."
830506,790657,Mon Nov 27 15:57:29 +0000 2017,@ChipotleTweets should do breakfast.,ChipotleTweets,Mon Nov 27 16:15:00 +0000 2017,@790657 We are testing breakfast burritos in t...
346390,125004,Thu Oct 26 10:32:52 +0000 2017,@115722 @125004 Good morning New York ☀️,VerizonSupport,Thu Oct 26 10:37:26 +0000 2017,@125004 Good morning. \n^TXA
807747,773738,Mon Nov 20 15:36:03 +0000 2017,@DropboxSupport how do we reinstate an account...,DropboxSupport,Tue Nov 21 10:07:47 +0000 2017,"@773738 ...ID, so as to look it up internally ..."


# 3. Text Processing

In [14]:
inbOutb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 794299 entries, 315288 to 671189
Data columns (total 6 columns):
author_id_x     794299 non-null object
created_at_x    794299 non-null object
text_x          794299 non-null object
author_id_y     794299 non-null object
created_at_y    794299 non-null object
text_y          794299 non-null object
dtypes: object(6)
memory usage: 42.4+ MB


In [15]:
inbOutb.shape

(794299, 6)

### 3.1 Lower Casing

Lower casing is a common text preprocessing technique. The idea is to convert the input text into same casing format so that 'text', 'Text' and 'TEXT' are treated the same way.

This is more helpful for text featurization techniques like frequency, tfidf as it helps to combine the same words together thereby reducing the duplication and get correct counts / tfidf values.

This may not be helpful when we do tasks like Part of Speech tagging (where proper casing gives some information about Nouns and so on) and Sentiment Analysis (where upper casing refers to anger and so on)

In [16]:
def remove_uppercase(text):
    text_lowercase = ' '.join(x.lower() for x in text.split())# It will discard all uppercases
    return text_lowercase

In [17]:
inbOutb['text_x_clean'] = inbOutb['text_x'].apply(lambda x: remove_uppercase(x))
inbOutb['text_y_clean'] = inbOutb['text_y'].apply(lambda x: remove_uppercase(x))
#in modo da poter rimuovere i nomi delle compagnie
inbOutb['author_id_y'] = inbOutb['author_id_y'].apply(lambda x: remove_uppercase(x)) 

In [18]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean
315288,390987,Sat Oct 14 23:33:37 +0000 2017,@115879 how many times should the system cance...,asklyft,Sun Oct 15 00:57:16 +0000 2017,@390987 Hello! We are sorry to hear this. Can ...,@115879 how many times should the system cance...,@390987 hello! we are sorry to hear this. can ...
461149,507004,Sat Nov 04 20:52:57 +0000 2017,@ATVIAssist servers aren't working for me. Err...,atviassist,Mon Nov 06 09:34:53 +0000 2017,"@507004 Apologies for the delay, things should...",@atviassist servers aren't working for me. err...,"@507004 apologies for the delay, things should..."
510666,547353,Sun Oct 15 14:58:01 +0000 2017,"@AskTarget FYI, App was updated on 10/13 and i...",asktarget,Sun Oct 15 20:11:54 +0000 2017,@547353 Thank you for letting us know. Can you...,"@asktarget fyi, app was updated on 10/13 and i...",@547353 thank you for letting us know. can you...


### 3.2 Remove Puntuaction

Punctuation can provide grammatical context to a sentence which supports our understanding. But for our vectorizer which counts the number of words and not the context, it does not add value, so we remove all special characters. eg: How are you?->How are you.

In [19]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
#Function to remove Punctuation
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])# It will discard all punctuations
    return text_nopunct
inbOutb['text_x_clean'] = inbOutb['text_x_clean'].apply(lambda x: remove_punct(x))
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].apply(lambda x: remove_punct(x))
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean
315288,390987,Sat Oct 14 23:33:37 +0000 2017,@115879 how many times should the system cance...,asklyft,Sun Oct 15 00:57:16 +0000 2017,@390987 Hello! We are sorry to hear this. Can ...,115879 how many times should the system cancel...,390987 hello we are sorry to hear this can you...
461149,507004,Sat Nov 04 20:52:57 +0000 2017,@ATVIAssist servers aren't working for me. Err...,atviassist,Mon Nov 06 09:34:53 +0000 2017,"@507004 Apologies for the delay, things should...",atviassist servers arent working for me error ...,507004 apologies for the delay things should b...
510666,547353,Sun Oct 15 14:58:01 +0000 2017,"@AskTarget FYI, App was updated on 10/13 and i...",asktarget,Sun Oct 15 20:11:54 +0000 2017,@547353 Thank you for letting us know. Can you...,asktarget fyi app was updated on 1013 and is v...,547353 thank you for letting us know can you d...


### 3.3 Removing usernames
rimuovere nomi utenti, compagnie da text_x_clean<br>
rimuovere nomi utenti, compagnie da text_y_clean

In [21]:
companies = inbOutb['author_id_y'].unique()

inbOutb['text_x_clean'] = inbOutb['text_x_clean'].str.replace('\d+', '')
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].str.replace('\d+', '')

inbOutb['text_x_clean'] = inbOutb['text_x_clean'].str.replace('|'.join(companies), '')
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].str.replace('|'.join(companies), '')

inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean
315288,390987,Sat Oct 14 23:33:37 +0000 2017,@115879 how many times should the system cance...,asklyft,Sun Oct 15 00:57:16 +0000 2017,@390987 Hello! We are sorry to hear this. Can ...,how many times should the system cancel or ch...,hello we are sorry to hear this can you dm us...
461149,507004,Sat Nov 04 20:52:57 +0000 2017,@ATVIAssist servers aren't working for me. Err...,atviassist,Mon Nov 06 09:34:53 +0000 2017,"@507004 Apologies for the delay, things should...",servers arent working for me error code but ...,apologies for the delay things should be back...
510666,547353,Sun Oct 15 14:58:01 +0000 2017,"@AskTarget FYI, App was updated on 10/13 and i...",asktarget,Sun Oct 15 20:11:54 +0000 2017,@547353 Thank you for letting us know. Can you...,fyi app was updated on and is version,thank you for letting us know can you dm us t...


In [22]:
inbOutb.shape

(794299, 8)

### 3.4 Removal of stopwords
Stopwords are commonly occuring words in a language like 'the', 'a' and so on. They can be removed from the text most of the times, as they don't provide valuable information for downstream analysis. In cases like Part of Speech tagging, we should not remove them as provide very valuable information about the POS.

In [23]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

<b>Similarly we can also get the list for other languages as well and use them.

In [24]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

inbOutb['text_wo_stop'] = inbOutb['text_x_clean'].apply(lambda x: remove_stopwords(x))
inbOutb['text_wo_stop'] = inbOutb['text_y_clean'].apply(lambda y: remove_stopwords(y))

inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop
315288,390987,Sat Oct 14 23:33:37 +0000 2017,@115879 how many times should the system cance...,asklyft,Sun Oct 15 00:57:16 +0000 2017,@390987 Hello! We are sorry to hear this. Can ...,how many times should the system cancel or ch...,hello we are sorry to hear this can you dm us...,hello sorry hear dm us phone number ached acco...
461149,507004,Sat Nov 04 20:52:57 +0000 2017,@ATVIAssist servers aren't working for me. Err...,atviassist,Mon Nov 06 09:34:53 +0000 2017,"@507004 Apologies for the delay, things should...",servers arent working for me error code but ...,apologies for the delay things should be back...,apologies delay things back still unable conne...
510666,547353,Sun Oct 15 14:58:01 +0000 2017,"@AskTarget FYI, App was updated on 10/13 and i...",asktarget,Sun Oct 15 20:11:54 +0000 2017,@547353 Thank you for letting us know. Can you...,fyi app was updated on and is version,thank you for letting us know can you dm us t...,thank letting us know dm us date store locatio...


### 3.5  Checking Most Common Worlds
Previously, we just removed commonly occurring words in a general sense. We can also remove commonly occurring words from our text data First, let’s check the 10 most frequently occurring words in our text data then take call to remove or retain.

In [25]:
#common worlds

freqX = pd.Series(' '.join(inbOutb['text_x_clean']).split()).value_counts()[:10]
freqY = pd.Series(' '.join(inbOutb['text_y_clean']).split()).value_counts()[:10]
print('FREQ X: \n',freqX,'\nFREQ Y: \n', freqY)

FREQ X: 
 to     450771
i      402429
the    394086
my     310607
a      289704
and    250549
is     210765
for    199934
on     184365
you    172960
dtype: int64 
FREQ Y: 
 to      597010
you     562841
the     438571
your    355927
we      300935
us      281678
for     280735
can     262541
a       257072
this    245768
dtype: int64


In [26]:
#removing them
freqX = list(freqX.index)
freqY = list(freqY.index)
inbOutb['text_x_clean'] = inbOutb['text_x_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in freqX))
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in freqY))

### 3.6 Checking Most Rare Worlds

In [27]:
rareX = pd.Series(' '.join(inbOutb['text_x_clean']).split()).value_counts()[-100:]
rareY = pd.Series(' '.join(inbOutb['text_y_clean']).split()).value_counts()[-100:]
print('RARE X: \n',rareX,'\nRARE Y: \n', rareY)

RARE X: 
 httpstcoklmsha             1
inascratchika              1
httpstcowtfxbssns          1
httpstcouqzpqgfu           1
mesti                      1
httpstcoptmwmoyj           1
playlist😫                  1
httpstcoollmmtg            1
pokerfaceswimsuit          1
httpstcoubfohsxu           1
httpstcohverymivvz         1
ondia                      1
httpstcoujbvbgzp           1
l’occitane                 1
httpstcommixmbev           1
httpstcosaoetx             1
httpstcocegdxglblo         1
deviery                    1
giftsforwomen              1
gbwill                     1
“garlic”                   1
httpstcongmpgeewwy         1
swellingkeayboard          1
undisclose                 1
httpstcocdwhfamb           1
httpstcojexaaknv           1
nooooothing                1
httpstcowvagjzt            1
httpstcosenbpelim          1
🎤🎤🎤                        1
                          ..
repurchase…                1
httpstcosmyklmaaqi         1
bellextremely              1
http

In [28]:
#removing them
rareX = list(rareX.index)
rareY = list(rareY.index)
inbOutb['text_x_clean'] = inbOutb['text_x_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in rareX))
inbOutb['text_y_clean'] = inbOutb['text_y_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in rareY))

# 4. Tokenizing

Tokenizing separates text into units such as sentences or words. It gives structure to previously unstructured text.

In [29]:
import re

# Function to Tokenize words
def tokenize(text):
    tokens = re.split('\W+', text) #W+ means that either a word character (A-Za-z0-9_) or a dash (-) can go there.
    return tokens


In [30]:
inbOutb['text_x_tokenized'] = inbOutb['text_x_clean'].apply(lambda x: tokenize(x.lower())) 
inbOutb['text_y_tokenized'] = inbOutb['text_y_clean'].apply(lambda x: tokenize(x.lower()))
#We convert to lower as Python is case-sensitive. 

In [31]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop,text_x_tokenized,text_y_tokenized
315288,390987,Sat Oct 14 23:33:37 +0000 2017,@115879 how many times should the system cance...,asklyft,Sun Oct 15 00:57:16 +0000 2017,@390987 Hello! We are sorry to hear this. Can ...,how many times should system cancel or change ...,hello are sorry hear dm phone number ached acc...,hello sorry hear dm us phone number ached acco...,"[how, many, times, should, system, cancel, or,...","[hello, are, sorry, hear, dm, phone, number, a..."
461149,507004,Sat Nov 04 20:52:57 +0000 2017,@ATVIAssist servers aren't working for me. Err...,atviassist,Mon Nov 06 09:34:53 +0000 2017,"@507004 Apologies for the delay, things should...",servers arent working me error code but friend...,apologies delay things should be back now are ...,apologies delay things back still unable conne...,"[servers, arent, working, me, error, code, but...","[apologies, delay, things, should, be, back, n..."
510666,547353,Sun Oct 15 14:58:01 +0000 2017,"@AskTarget FYI, App was updated on 10/13 and i...",asktarget,Sun Oct 15 20:11:54 +0000 2017,@547353 Thank you for letting us know. Can you...,fyi app was updated version,thank letting know dm date store location and ...,thank letting us know dm us date store locatio...,"[fyi, app, was, updated, version]","[thank, letting, know, dm, date, store, locati..."


### 4.1 Remove StopWords

Stopwords are common words that will likely appear in any text. They don’t tell us much about our data so we remove them. eg: silver or lead is fine for me-> silver, lead, fine.

In [32]:
import nltk

stopword = nltk.corpus.stopwords.words('english') # All English Stopwords

# Function to remove Stopwords
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]# To remove all stopwords
    return text

inbOutb['text_x_tokenized'] = inbOutb['text_x_tokenized'].apply(lambda x: remove_stopwords(x))
inbOutb['text_y_tokenized'] = inbOutb['text_y_tokenized'].apply(lambda x: remove_stopwords(x))

In [33]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop,text_x_tokenized,text_y_tokenized
315288,390987,Sat Oct 14 23:33:37 +0000 2017,@115879 how many times should the system cance...,asklyft,Sun Oct 15 00:57:16 +0000 2017,@390987 Hello! We are sorry to hear this. Can ...,how many times should system cancel or change ...,hello are sorry hear dm phone number ached acc...,hello sorry hear dm us phone number ached acco...,"[many, times, system, cancel, change, drivers,...","[hello, sorry, hear, dm, phone, number, ached,..."
461149,507004,Sat Nov 04 20:52:57 +0000 2017,@ATVIAssist servers aren't working for me. Err...,atviassist,Mon Nov 06 09:34:53 +0000 2017,"@507004 Apologies for the delay, things should...",servers arent working me error code but friend...,apologies delay things should be back now are ...,apologies delay things back still unable conne...,"[servers, arent, working, error, code, friends...","[apologies, delay, things, back, still, unable..."
510666,547353,Sun Oct 15 14:58:01 +0000 2017,"@AskTarget FYI, App was updated on 10/13 and i...",asktarget,Sun Oct 15 20:11:54 +0000 2017,@547353 Thank you for letting us know. Can you...,fyi app was updated version,thank letting know dm date store location and ...,thank letting us know dm us date store locatio...,"[fyi, app, updated, version]","[thank, letting, know, dm, date, store, locati..."


# 5. Stemming

<b>Tokenizing separates text into units such as sentences or words. It gives structure to previously unstructured text.

Stemming helps reduce a word to its stem form. It often makes sense to treat related words in the same way. It removes suffices, like “ing”, “ly”, “s”, etc. by a simple rule-based approach.
It reduces the corpus of words but often the actual words get neglected. eg: Entitling,Entitled->Entitl

In [34]:
from nltk.stem.porter import PorterStemmer

ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [35]:
# inbOutb['text_x_stemmed'] = inbOutb['text_x_nostop'].apply(lambda x: stemming(x))
# inbOutb['text_y_stemmed'] = inbOutb['text_y_nostop'].apply(lambda x: stemming(x))

# 6. Lemmatizing

Lemmatizing derives the canonical form (‘lemma’) of a word. i.e the root form. It is better than stemming as it uses a dictionary-based approach i.e a morphological analysis to the root word.eg: Entitling, Entitled->Entitle <br>
In Short, Stemming is typically faster as it simply chops off the end of the word, without understanding the context of the word. Lemmatizing is slower and more accurate as it takes an informed analysis with the context of the word in mind.

In [36]:
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

In [37]:
inbOutb['text_x_lemmatized'] = inbOutb['text_x_tokenized'].apply(lambda x: lemmatizing(x))
inbOutb['text_y_lemmatized'] = inbOutb['text_y_tokenized'].apply(lambda x: lemmatizing(x))

In [38]:
inbOutb.head(3)

Unnamed: 0,author_id_x,created_at_x,text_x,author_id_y,created_at_y,text_y,text_x_clean,text_y_clean,text_wo_stop,text_x_tokenized,text_y_tokenized,text_x_lemmatized,text_y_lemmatized
315288,390987,Sat Oct 14 23:33:37 +0000 2017,@115879 how many times should the system cance...,asklyft,Sun Oct 15 00:57:16 +0000 2017,@390987 Hello! We are sorry to hear this. Can ...,how many times should system cancel or change ...,hello are sorry hear dm phone number ached acc...,hello sorry hear dm us phone number ached acco...,"[many, times, system, cancel, change, drivers,...","[hello, sorry, hear, dm, phone, number, ached,...","[many, time, system, cancel, change, driver, r...","[hello, sorry, hear, dm, phone, number, ached,..."
461149,507004,Sat Nov 04 20:52:57 +0000 2017,@ATVIAssist servers aren't working for me. Err...,atviassist,Mon Nov 06 09:34:53 +0000 2017,"@507004 Apologies for the delay, things should...",servers arent working me error code but friend...,apologies delay things should be back now are ...,apologies delay things back still unable conne...,"[servers, arent, working, error, code, friends...","[apologies, delay, things, back, still, unable...","[server, arent, working, error, code, friend, ...","[apology, delay, thing, back, still, unable, c..."
510666,547353,Sun Oct 15 14:58:01 +0000 2017,"@AskTarget FYI, App was updated on 10/13 and i...",asktarget,Sun Oct 15 20:11:54 +0000 2017,@547353 Thank you for letting us know. Can you...,fyi app was updated version,thank letting know dm date store location and ...,thank letting us know dm us date store locatio...,"[fyi, app, updated, version]","[thank, letting, know, dm, date, store, locati...","[fyi, app, updated, version]","[thank, letting, know, dm, date, store, locati..."


In [39]:
# inbOutb.to_csv('inbOutb.csv')

# 7. Count Vectorizer

In [40]:
## Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
questions = inbOutb['text_x_clean'].dropna()
q = np.array(questions)

In [None]:
# Initialise the count vectorizer with the English stop words
countV = CountVectorizer(stop_words='english',
                         max_features=10000)

# Fit and transform the processed titles
bagQuestions = countV.fit_transform(q)

print('BOW Questions: ',bagQuestions.shape)

Wordcloud with the most common words in customer requests

# 8. Positive Vs Negative Tweet

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

tweets = pd.read_csv('twcs.csv')
inbound_tweets = tweets[tweets.inbound == True]
inbound_tweets['timestamp'] = pd.to_datetime(inbound_tweets['created_at']).dt.date


### 8.1 Explore the dataset for inbound tweets:

In [None]:
display(inbound_tweets.head())
display(inbound_tweets.columns)

### 8.2 Collect quantities by author:

In [None]:
count_by_author_id = inbound_tweets.groupby(['author_id']) \
    .count()[['tweet_id']] \
    .sort_values(['tweet_id'], ascending = False) \
    .rename({'tweet_id': 'qty'}, axis='columns')
plt.figure()
plt.plot(np.arange(len(count_by_author_id.index.values)), count_by_author_id.qty )
plt.xlabel('Author #')
plt.ylabel('Tweets Qty.')
plt.grid(True)
plt.show()

### 8.3 Quantities by Date:

In [None]:
count_by_date = inbound_tweets.groupby(['timestamp']) \
    .count()[['tweet_id']] \
    .sort_values(['timestamp'], ascending = True) \
    .rename({'tweet_id': 'qty'}, axis='columns')

display(count_by_date.head())
plt.figure()
plt.plot(count_by_date.index.values, count_by_date.qty )
plt.xlabel('Date')
plt.ylabel('Tweets Qty.')
plt.grid(True)
plt.gcf().autofmt_xdate()
plt.show()

### 8.4 Process the sentiment analysis using NLTK

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()

inbound_tweets['score'] = inbound_tweets.text.apply(lambda x: sentiment_analyzer.polarity_scores(x)['compound'])
display(inbound_tweets.head())

### 8.5 Check the Sentiment Analysis with an histogram of scores

In [None]:
hist_bins = 30
plt.figure(figsize=[10,5])
x = inbound_tweets[['score']]
scores_hist,edges = np.histogram(x, bins=hist_bins)
plt.bar(edges[:-1], scores_hist)
plt.grid(True)
plt.xlabel('Score')
plt.ylabel('Tweets Count')
plt.show()

In [None]:
### 8.6 Define limit for Pos. vs Neg. comment and compute results

In [None]:
limit_pos = 0.5
limit_neg = -0.5
limit_neg_per_author = 3
#plt.plot(inbound_tweets.index.values, inbound_tweets['score'], 'r')
#plt.axhline(0.5, 'g.')
inbound_tweets['is_pos'] = (inbound_tweets[['score']] > limit_pos)
inbound_tweets['is_neg'] = (inbound_tweets[['score']] < limit_neg)
inbound_tweets['is_other'] = ~inbound_tweets.is_pos & ~inbound_tweets.is_neg

def count_true(x):
    return np.sum(x == True)

scores_by_date = inbound_tweets.groupby(['timestamp']) \
    .agg({'tweet_id':'count', 'is_pos': count_true, 'is_neg': count_true, 'is_other': count_true}) \
    .rename({'tweet_id':'total'}, axis='columns')
display(scores_by_date.head())
x = scores_by_date.index.values
y1 = scores_by_date['is_pos']
y2 = scores_by_date['is_neg']

scores_by_author = inbound_tweets.groupby(['author_id']) \
    .agg({'tweet_id':'count','is_neg': count_true}) \
    .rename({'tweet_id':'total'}, axis='columns')
scores_by_author = scores_by_author.loc[scores_by_author['is_neg'] >= limit_neg_per_author]
display(scores_by_author.head())

#plt.scatter(x, y, alpha=0.5)
plt.figure(figsize=[10,5])
plt.plot(x, y1,'g-', x, y2, 'r--')
plt.xlabel('Date')
plt.ylabel('Qty.')
plt.legend(['Pos.Tweet', 'Neg.Tweet'])
plt.grid(True)
plt.show()
