In [1]:
import scattertext as st
import spacy

import numpy as np
import pandas as pd
import nltk
import joblib
from nltk.corpus import stopwords
import string
import re

from nltk.tokenize import TweetTokenizer

from collections import Counter
from nltk.tokenize.treebank import TreebankWordDetokenizer

nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alejandro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load Dataset

In [2]:
df = pd.read_csv('uber_support_sent140.csv')
df.shape

(215387, 36)

# Preprocess Tweets

In [3]:

EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}

In [4]:
def preprocess(words):
    
    tokens = []
   
    # Clean urls, punct, strange characters, emojis, emoticons
    char= re.compile(r'^[a-zA-Z]$')
    punct=re.compile(r'[.,-,:,<,;,(,=]')
    ht = re.compile(r'http.')
    bar = re.compile(r'//*')
    pr = ["rt","@","http","https","'s",'...', 'english', 'translation','):',
          '. .', '..','2-5','<3',']:','“','”','’','. ...','___','__','=(','‘','—','°','¢','•','®','—','…',
          '... .','--->','–','»','«','£','-->','×','->','©','\n','™','¤']
    no_emoji = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE) 
    
    no_emoticon = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    no_html_pattern = re.compile('<.*?>')
    
    stoplist = stopwords.words('english')
    punctuation = set(string.punctuation)
    
   
    # TOKENIZER
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(words)
    tokens = [w.lower() for w in tokens] 
    
    #Remove numbers
    pattern = '[0-9]'
    tokens = [re.sub(pattern, '', i) for i in tokens] 
    
    # Punctuation
    tokens = [w for w in tokens if  w not in punctuation]
    
    #Hacer algo con los hastags
    
    # Clean emojis,urls,bars
    tokens = [w.replace('#', '') for w in tokens if not w.startswith('@') if w not in pr 
            if not bar.search(w) if not ht.search(w) if not char.search(w) if not punct.search(w)
            if not w.isdigit() if not no_emoji.search(w) if not no_emoticon.search(w) if not no_html_pattern.search(w)]
    
    # Remove non alphanumeric
    tokens = [w for w in tokens if w.isalpha()]

    
    # Remove english stopwords
    tokens = [w for w in tokens if w not in stoplist]
    
    #Remove tokens with length < 1
    tokens = [word for word in tokens if len(word)>1]  ##Posiblemente sean 2
    
    return tokens

## Simple preprocess

In [5]:
texts = [preprocess(document) for document in df['tweet']]
texts

[['follow',
  'thru',
  'promise',
  'disinfecting',
  'materials',
  'drivers',
  'cars',
  'supplies',
  'keep',
  'drivers',
  'safe',
  'tele',
  'agents',
  'hang',
  'drivers',
  'call',
  'ask',
  'sbout',
  'supplies'],
 ['uber',
  'follow',
  'thru',
  'promise',
  'disinfecting',
  'materials',
  'drivers',
  'cars',
  'supplies',
  'keep',
  'drivers',
  'safe',
  'tele',
  'agents',
  'hang',
  'drivers',
  'call',
  'ask',
  'sbout',
  'supplies',
  'explain',
  'please'],
 ['follow',
  'thru',
  'promise',
  'disinfecting',
  'materials',
  'drivers',
  'cars',
  'supplies',
  'keep',
  'drivers',
  'safe',
  'tele',
  'agents',
  'hang',
  'drivers',
  'call',
  'ask',
  'sbout',
  'supplies',
  'explain',
  'please'],
 ['song',
  'dance',
  'dead',
  'wrong',
  'handling',
  'pandemic',
  'plight',
  'workers',
  'know'],
 ['hey', 'receiving', 'ride', 'reports', 'email', 'even', 'account', 'guys'],
 ['done'],
 ['uber', 'gives', 'ride', 'request', 'go', 'app', 'gone', 'r

## Detokenize texts

In [6]:
texts = [TreebankWordDetokenizer().detokenize(document) for document in texts]
texts

['follow thru promise disinfecting materials drivers cars supplies keep drivers safe tele agents hang drivers call ask sbout supplies',
 'uber follow thru promise disinfecting materials drivers cars supplies keep drivers safe tele agents hang drivers call ask sbout supplies explain please',
 'follow thru promise disinfecting materials drivers cars supplies keep drivers safe tele agents hang drivers call ask sbout supplies explain please',
 'song dance dead wrong handling pandemic plight workers know',
 'hey receiving ride reports email even account guys',
 'done',
 'uber gives ride request go app gone right away',
 'way worried privacy dm said please continue working support team even though one emailing help deception',
 'trying file california state unemployment insurance ever since pay cut dramatically uber ca state etin tried find number call hr department find one also still waiting file tax',
 'please answer message box',
 'contact info avoid charges driver says business closed',

# Filter neutral tweets and empty tweets

In [7]:
df['text'] = texts
df = df[df.text != ""]
df = df[df.polarity140 != "marl:Neutral"]
df

Unnamed: 0,cashtags,created_at,date,day,geo,hashtags,hour,id,language,link,...,tweet,urls,user_id,user_id_str,user_rt,user_rt_id,username,video,polarity140,text
7,[],1.585697e+12,2020-03-31 23:28:49,2.0,,[],23.0,1.245131e+18,en,https://twitter.com/JohnKir74833568/status/124...,...,@Uber_Support No way. I’m not worried about my...,[],1.244829e+18,1.244829e+18,,,JohnKir74833568,0.0,marl:Negative,way worried privacy dm said please continue wo...
8,[],1.585697e+12,2020-03-31 23:28:13,2.0,,[],23.0,1.245131e+18,en,https://twitter.com/OneTruGreenberg/status/124...,...,@Uber_Support I'm trying to file for my Califo...,[],1.007368e+08,1.007368e+08,,,OneTruGreenberg,0.0,marl:Negative,trying file california state unemployment insu...
10,[],1.585697e+12,2020-03-31 23:21:43,2.0,,[],23.0,1.245129e+18,en,https://twitter.com/1Chronicler/status/1245129...,...,@Uber_Support Why is there no contact info? Ho...,[],4.138178e+07,4.138178e+07,,,1Chronicler,0.0,marl:Negative,contact info avoid charges driver says busines...
11,[],1.585697e+12,2020-03-31 23:21:19,2.0,,[],23.0,1.245129e+18,en,https://twitter.com/andrew_the_pom/status/1245...,...,@Uber_Support Read my original tweet and answe...,[],5.892575e+08,5.892575e+08,,,andrew_the_pom,0.0,marl:Positive,read original tweet answer question nothing ac...
14,[],1.585696e+12,2020-03-31 23:11:02,2.0,,[],23.0,1.245126e+18,en,https://twitter.com/cherylj22/status/124512648...,...,@Uber_Support hello ordered 70$ worth of east ...,[],3.302313e+07,3.302313e+07,,,cherylj22,0.0,marl:Negative,hello ordered worth east sides came cold chick...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215376,[],1.596241e+12,2020-08-01 00:17:31,6.0,,[],0.0,1.289355e+18,en,https://twitter.com/gailifshin/status/12893545...,...,wow @ubereats...you see $900+ of charges in on...,[],9.467706e+07,9.467706e+07,,,gailifshin,0.0,marl:Negative,wow see charges one day make difficult get sor...
215377,[],1.596241e+12,2020-08-01 00:13:43,6.0,,[],0.0,1.289354e+18,en,https://twitter.com/DeJai24/status/12893535804...,...,@UberEats @Uber_Support I will literally pay e...,[],1.041376e+18,1.041376e+18,,,DeJai24,0.0,marl:Negative,literally pay extra food grouped another order...
215378,[],1.596241e+12,2020-08-01 00:09:15,6.0,,[],0.0,1.289352e+18,en,https://twitter.com/KitOConnell/status/1289352...,...,@Uber @lyft @chadloder @Uber_Support Update: L...,['https://twitter.com/alcaprari23/status/12893...,2.094630e+07,2.094630e+07,,,KitOConnell,0.0,marl:Positive,update lyft says would still good hear
215380,[],1.596240e+12,2020-08-01 00:07:31,6.0,,[],0.0,1.289352e+18,en,https://twitter.com/kittenpocalypse/status/128...,...,@Real_CGThomas @Uber @Uber_Support @dkhos @che...,[],4.141801e+09,4.141801e+09,,,kittenpocalypse,0.0,marl:Negative,bruh doordash driver fine update lisecse turni...


In [8]:
#convention_df = st.SampleCorpora.ConventionData2012.get_data() 
df.iloc[0]

cashtags                                                       []
created_at                                        1585697329000.0
date                                          2020-03-31 23:28:49
day                                                           2.0
geo                                                           NaN
hashtags                                                       []
hour                                                         23.0
id                                          1245130962113003520.0
language                                                       en
link            https://twitter.com/JohnKir74833568/status/124...
name                                                 JohnKirkland
near                                                          NaN
nlikes                                                        2.0
nreplies                                                      1.0
nretweets                                                     0.0
photos    

# Scattertext process

In [9]:
nlp = spacy.load("en_core_web_sm")
corpus = st.CorpusFromPandas(df, 
                              category_col='polarity140', 
                              text_col='text',
                              nlp=nlp).build()

In [20]:
html = st.produce_scattertext_explorer(corpus,
          category='marl:Positive',
          category_name='Positive',
          not_category_name='Negative',
          width_in_pixels=1000,
          metadata=df['username']
        )
open("Sentiment2.html", 'wb').write(html.encode('utf-8'))

15848516