## #BuildWithAI - Armor Team

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
warnings.filterwarnings('ignore')
%matplotlib inline

# Set up the chosen style format
plt.style.use('fivethirtyeight')

# Apply the seaborn context first: set_context() writes its own font sizes
# into rcParams, so it has to run before the explicit overrides below or the
# custom font size would be overwritten.
sns.set_context('notebook')

# Increase default figure and font sizes for easier viewing.
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 14

In [2]:
# Read the tweet export (CSV file in the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer="2020-04-30_Coronavirus_Tweets.CSV")

### Cleaning and Extracting

In [3]:
# Remove links from tweets.
# Only the URL itself (http:// or https:// up to the next whitespace) is
# stripped; any text that follows a link is kept. str(x) turns missing values
# into the string 'nan' so the regex always receives text.
data['tweet'] = data['text'].apply(lambda x: re.sub(r'https?://\S+', '', str(x)))

In [4]:
# Keep the 5,000 rows with the highest `favourites_count`.
# NOTE(review): in the displayed output, tweets from the same account share an
# identical favourites_count, so this may be an account-level figure rather
# than per-tweet likes — confirm it is the intended ranking column.
data_top = data.nlargest(n=5000, columns='favourites_count')

In [5]:
# Display the selected rows (the notebook renders a truncated preview).
data_top

Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,tweet
462057,1250876607239249920,1245214176,2020-04-16T19:59:57Z,ChelseaAMusic,Ever since the #COVID19 epidemic began i think...,Twitter for Android,,,,False,...,,,,22629,22631,,2013-03-06T04:11:45Z,False,en,Ever since the #COVID19 epidemic began i think...
468237,1250879975953182721,1245214176,2020-04-16T20:13:21Z,ChelseaAMusic,Horrible what @FashionNova is doing to people ...,Twitter for Android,,,,False,...,,,,22629,22631,,2013-03-06T04:11:45Z,False,en,Horrible what @FashionNova is doing to people ...
294824,1250798677796544513,783028986205069312,2020-04-16T14:50:18Z,MiguelCalabria3,"""Las tormentas hacen apreciar\n lo “extraordin...",Twitter for Android,,,,False,...,,,,9094,243,,2016-10-03T19:40:33Z,False,es,"""Las tormentas hacen apreciar\n lo “extraordin..."
284413,1250793907987705856,94346808,2020-04-16T14:31:20Z,GokhanAkar,"Güzel Olmuş...\n 😍😍😍\n""Ruh Eşi""\nhttps://t.co/...",Twitter for Android,,,,False,...,,,,73120,20439,,2009-12-03T15:08:58Z,True,tr,"Güzel Olmuş...\n 😍😍😍\n""Ruh Eşi""\n"
178246,1250737886376398848,804366366522376193,2020-04-16T10:48:44Z,SueRMichael,This is a wonderful story of surviving #COVID1...,Twitter for Android,,,,True,...,,,,1563,1081,,2016-12-01T16:47:41Z,False,en,This is a wonderful story of surviving #COVID1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47293,1250627684222119936,38486830,2020-04-16T03:30:50Z,VividConfusion,@preferspuppies @WeTalkSportz @cnnbrk That's t...,Twitter for Android,1.250625e+18,3.230339e+07,preferspuppies,False,...,,,,725,1064,,2009-05-07T18:32:14Z,False,en,@preferspuppies @WeTalkSportz @cnnbrk That's t...
34674,1250611780814061569,29235171,2020-04-16T02:27:38Z,daahmom,@dawg_lb It was #CCP &amp; #WHO telling us tha...,Twitter for iPhone,1.250576e+18,7.381535e+17,dawg_lb,False,...,,,,46045,46060,,2009-04-06T16:34:07Z,False,en,@dawg_lb It was #CCP &amp; #WHO telling us tha...
116324,1250696890556903424,903270351064285184,2020-04-16T08:05:50Z,AapActive123,How well is Delhi prepared to tackle #covid19...,Twitter for Android,,,,False,...,,,,1476,179,,2017-08-31T14:56:48Z,False,en,How well is Delhi prepared to tackle #covid19...
55992,1250639027038031873,903270351064285184,2020-04-16T04:15:54Z,AapActive123,"Breaking: Over the past 24 hours, the U.S. rep...",Twitter for Android,,,,False,...,,,,1476,179,,2017-08-31T14:56:48Z,False,en,"Breaking: Over the past 24 hours, the U.S. rep..."


### Translating non-English Tweets

In [6]:
# Translation and returning to data_top
!pip install googletrans

# import the library
import googletrans
from googletrans import Translator



In [7]:
# Split the top tweets by language tag: rows that need translating vs. rows
# already in English. `.copy()` makes each subset an independent frame, so
# adding the translated column later does not write to a slice of data_top
# (which triggers pandas' SettingWithCopyWarning).
data_translate = data_top[data_top["lang"] != "en"].copy()
data_eng = data_top[data_top["lang"] == "en"].copy()

In [8]:
# Translate every non-English tweet to English (source language auto-detected)
# and keep only the translated text of each result.
# NOTE(review): presumably each translate() call is a remote request — this
# cell may be slow or rate-limited; confirm before re-running on more data.
translator = Translator()
data_translate['tweet_en'] = data_translate['tweet'].map(
    lambda tweet: translator.translate(tweet, src='auto', dest='en').text
)

In [9]:
# Swap the original-language `tweet` column for its English translation so the
# frame has the same column layout as data_eng.
data_translate = (
    data_translate
    .drop(columns=['tweet'])
    .rename(columns={'tweet_en': 'tweet'})
)

In [10]:
# Stack the translated tweets on top of the English ones and renumber the rows
# with a fresh 0..n-1 index.
data_merge = pd.concat(objs=[data_translate, data_eng], ignore_index=True)

In [11]:
# Display the combined frame (translated rows first, then the English rows).
data_merge

Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,tweet
0,1250798677796544513,783028986205069312,2020-04-16T14:50:18Z,MiguelCalabria3,"""Las tormentas hacen apreciar\n lo “extraordin...",Twitter for Android,,,,False,...,,,,9094,243,,2016-10-03T19:40:33Z,False,es,"""Storms do appreciate\n how “extraordinary was..."
1,1250793907987705856,94346808,2020-04-16T14:31:20Z,GokhanAkar,"Güzel Olmuş...\n 😍😍😍\n""Ruh Eşi""\nhttps://t.co/...",Twitter for Android,,,,False,...,,,,73120,20439,,2009-12-03T15:08:58Z,True,tr,"It is nice...\n 😍😍😍\n""Soul mate"""
2,1250802390946676736,1341462944,2020-04-16T15:05:03Z,paoloigna1,In un tweet precedente ho scritto che Guido Ol...,Twitter Web Client,,,,False,...,,,,87542,94859,,2013-04-10T09:29:25Z,False,it,In a previous tweet I wrote that Guido Olimpio...
3,1250878232909529094,1341462944,2020-04-16T20:06:25Z,paoloigna1,a tutta velocità #innovazione #3D #Ferrari #co...,Twitter Web App,,,,True,...,,,,87541,94863,,2013-04-10T09:29:25Z,False,it,at full speed #innovation # 3D #Ferrari # covi...
4,1250888059756908544,1341462944,2020-04-16T20:45:28Z,paoloigna1,Sento il dibattito a #Piazzapulita e sembra ch...,Twitter Web Client,,,,False,...,,,,87541,94863,,2013-04-10T09:29:25Z,False,es,I hear the debate in #Piazzapulita and it seem...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1250627684222119936,38486830,2020-04-16T03:30:50Z,VividConfusion,@preferspuppies @WeTalkSportz @cnnbrk That's t...,Twitter for Android,1.250625e+18,3.230339e+07,preferspuppies,False,...,,,,725,1064,,2009-05-07T18:32:14Z,False,en,@preferspuppies @WeTalkSportz @cnnbrk That's t...
4996,1250611780814061569,29235171,2020-04-16T02:27:38Z,daahmom,@dawg_lb It was #CCP &amp; #WHO telling us tha...,Twitter for iPhone,1.250576e+18,7.381535e+17,dawg_lb,False,...,,,,46045,46060,,2009-04-06T16:34:07Z,False,en,@dawg_lb It was #CCP &amp; #WHO telling us tha...
4997,1250696890556903424,903270351064285184,2020-04-16T08:05:50Z,AapActive123,How well is Delhi prepared to tackle #covid19...,Twitter for Android,,,,False,...,,,,1476,179,,2017-08-31T14:56:48Z,False,en,How well is Delhi prepared to tackle #covid19...
4998,1250639027038031873,903270351064285184,2020-04-16T04:15:54Z,AapActive123,"Breaking: Over the past 24 hours, the U.S. rep...",Twitter for Android,,,,False,...,,,,1476,179,,2017-08-31T14:56:48Z,False,en,"Breaking: Over the past 24 hours, the U.S. rep..."


### Sentiment Analysis

In [12]:
# Pull out the English tweet text column that will be classified
tweets30 = data_merge.loc[:, "tweet"]
tweets30.head(5)

0    "Storms do appreciate\n how “extraordinary was...
1                     It is nice...\n 😍😍😍\n"Soul mate"
2    In a previous tweet I wrote that Guido Olimpio...
3    at full speed #innovation # 3D #Ferrari # covi...
4    I hear the debate in #Piazzapulita and it seem...
Name: tweet, dtype: object

In [13]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import re, string, random

def remove_noise(tweet_tokens, stop_words = ()):
    """Normalise one tokenised tweet into a list of cleaned, lower-cased tokens.

    For every token: strip URLs and @mentions, lemmatise it using a coarse
    part of speech derived from its POS tag (noun, verb, otherwise adjective),
    then drop it if it is empty, is found in ``string.punctuation``, or its
    lower-cased form is a stop word.

    Parameters
    ----------
    tweet_tokens : list of str
        Tokens of a single tweet.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to no stop words.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    # Raw strings keep the backslashes in the pattern without relying on
    # Python passing unknown escape sequences through unchanged.
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = r"(@[A-Za-z0-9_]+)"

    # Loop-invariant setup: one lemmatizer for the whole tweet and a set for
    # constant-time stop-word lookups.
    lemmatizer = WordNetLemmatizer()
    stop_words = frozenset(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(mention_pattern, "", token)

        # Map the POS tag onto the part-of-speech codes passed to the lemmatizer.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from a list of token lists, in order."""
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens

def get_tweets_for_model(cleaned_tokens_list):
    """Lazily yield one feature dict per tweet, mapping each token to True."""
    for tokens in cleaned_tokens_list:
        yield {word: True for word in tokens}

if __name__ == "__main__":

    # Fixed seed so the shuffle, and therefore the train/test split and the
    # reported accuracy, are the same on every run.
    SHUFFLE_SEED = 42
    # Number of shuffled examples used for training; the rest form the test set.
    TRAIN_SIZE = 7000

    # Labelled sample tweets from NLTK's twitter_samples corpus.
    # NOTE(review): these four names are not used further in the visible
    # notebook; they are kept so that they stay defined for any later cell.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Clean every tokenised tweet (URLs/mentions/stop words removed, lemmatised).
    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    # Quick sanity check: the ten most frequent tokens in the positive tweets.
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    # Convert each tweet into the {token: True} feature dict used for training.
    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # Shuffle with a dedicated, seeded generator so the split is reproducible
    # without touching the global random state.
    random.Random(SHUFFLE_SEED).shuffle(dataset)

    train_data = dataset[:TRAIN_SIZE]
    test_data = dataset[TRAIN_SIZE:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    # show_most_informative_features() prints the table itself and returns
    # None, so it must not be wrapped in print().
    classifier.show_most_informative_features(10)

    # Smoke test on a hand-written example.
    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]
Accuracy is: 0.9963333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2063.6 : 1.0
                follower = True           Positi : Negati =     33.7 : 1.0
                     sad = True           Negati : Positi =     24.7 : 1.0
                     bam = True           Positi : Negati =     22.4 : 1.0
                     x15 = True           Negati : Positi =     17.0 : 1.0
               community = True           Positi : Negati =     15.0 : 1.0
                followed = True           Negati : Positi =     15.0 : 1.0
                 welcome = True           Positi : Negati =     14.5 : 1.0
                    damn = True           Negati : Positi =     14.3 : 1.0
               goodnight = True           Positi : Negati =     13.7 : 1.0
None
I ordered just once from TerribleCo, 

In [14]:
from nltk.tokenize import word_tokenize

# Classify each tweet and store one label per row.
# The labels are computed per tweet and assigned as a whole column: assigning
# a single label to data_merge["polarity"] inside a loop would broadcast that
# label to every row, leaving the whole column with the last tweet's label.
data_merge["polarity"] = tweets30.apply(
    lambda tweet: classifier.classify(
        {token: True for token in remove_noise(word_tokenize(tweet))}
    )
)

In [15]:
# Preview the first rows, now including the `polarity` column.
data_merge.head()

Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,tweet,polarity
0,1250798677796544513,783028986205069312,2020-04-16T14:50:18Z,MiguelCalabria3,"""Las tormentas hacen apreciar\n lo “extraordin...",Twitter for Android,,,,False,...,,,9094,243,,2016-10-03T19:40:33Z,False,es,"""Storms do appreciate\n how “extraordinary was...",Positive
1,1250793907987705856,94346808,2020-04-16T14:31:20Z,GokhanAkar,"Güzel Olmuş...\n 😍😍😍\n""Ruh Eşi""\nhttps://t.co/...",Twitter for Android,,,,False,...,,,73120,20439,,2009-12-03T15:08:58Z,True,tr,"It is nice...\n 😍😍😍\n""Soul mate""",Positive
2,1250802390946676736,1341462944,2020-04-16T15:05:03Z,paoloigna1,In un tweet precedente ho scritto che Guido Ol...,Twitter Web Client,,,,False,...,,,87542,94859,,2013-04-10T09:29:25Z,False,it,In a previous tweet I wrote that Guido Olimpio...,Positive
3,1250878232909529094,1341462944,2020-04-16T20:06:25Z,paoloigna1,a tutta velocità #innovazione #3D #Ferrari #co...,Twitter Web App,,,,True,...,,,87541,94863,,2013-04-10T09:29:25Z,False,it,at full speed #innovation # 3D #Ferrari # covi...,Positive
4,1250888059756908544,1341462944,2020-04-16T20:45:28Z,paoloigna1,Sento il dibattito a #Piazzapulita e sembra ch...,Twitter Web Client,,,,False,...,,,87541,94863,,2013-04-10T09:29:25Z,False,es,I hear the debate in #Piazzapulita and it seem...,Positive


In [28]:
# Keep only the columns carried forward to the emotion-detection stage.
sentiment_data = data_merge.loc[
    :,
    ["created_at", "tweet", "lang", "favourites_count", "retweet_count", "verified", "polarity"],
]

In [31]:
# Preview the reduced frame.
sentiment_data.head()

Unnamed: 0,created_at,tweet,lang,favourites_count,retweet_count,verified,polarity
0,2020-04-16T14:50:18Z,"""Storms do appreciate\n how “extraordinary was the ordinary”\nAnd they never last forever. ""\n-Laura Len #photography\n\n#coronavirus #StayHome #Q...",es,1420056,35,False,Positive
1,2020-04-16T14:31:20Z,"It is nice...\n 😍😍😍\n""Soul mate""",tr,1352326,4,True,Positive
2,2020-04-16T15:05:03Z,"In a previous tweet I wrote that Guido Olimpio endorses the hoax, it is not correct and I apologize for it neither confirming nor excluding it. an...",it,1122833,3,False,Positive
3,2020-04-16T20:06:25Z,at full speed #innovation # 3D #Ferrari # covid19 #coronavirus,it,1122833,1,False,Positive
4,2020-04-16T20:45:28Z,I hear the debate in #Piazzapulita and it seems that only in #Italy there are problems of chain of command and instead everywhere ... # covid19 Ma...,es,1122833,2,False,Positive


In [32]:
# Save the new data to another CSV file before moving to the next stage
sentiment_data.to_csv(path_or_buf="Sentiment_30.csv")

### Emotion Detection

In [19]:
from emotion_predictor import EmotionPredictor

# Pandas presentation options
pd.set_option("display.max_colwidth", 150)   # show whole tweet's content
pd.set_option("display.width", 200)          # don't break columns

Using Theano backend.


In [20]:
# Build the emotion predictor with the 'ekman' classification scheme.
# NOTE(review): 'mc' presumably selects a multiclass setting — confirm against
# the emotion_predictor module.
model = EmotionPredictor(
    classification='ekman',
    setting='mc',
    use_unison_model=True,
)

In [33]:
# Plain Python list of tweet texts to feed to the emotion model
tweets = sentiment_data.loc[:, 'tweet'].tolist()

In [34]:
# Predict an emotion class for every tweet in the list. The displayed result
# is a table with `Tweet` and `Emotion` columns.
predictions = model.predict_classes(tweets)

In [35]:
# Display the predictions.
# NOTE(review): the saved output below ends at row 198, which looks like it
# came from a run on a sample rather than on all tweets — re-run to confirm.
predictions

Unnamed: 0,Tweet,Emotion
0,"""Storms do appreciate\n how “extraordinary was the ordinary”\nAnd they never last forever. ""\n-Laura Len #photography\n\n#coronavirus #StayHome #Q...",Joy
1,"It is nice...\n 😍😍😍\n""Soul mate""",Joy
2,"In a previous tweet I wrote that Guido Olimpio endorses the hoax, it is not correct and I apologize for it neither confirming nor excluding it. an...",Joy
3,at full speed #innovation # 3D #Ferrari # covid19 #coronavirus,Fear
4,I hear the debate in #Piazzapulita and it seems that only in #Italy there are problems of chain of command and instead everywhere ... # covid19 Ma...,Joy
...,...,...
195,@KhaosodEnglish @PravitR # COVID19 #lockdown #Thailand 😷,Fear
196,@GermanyDiplo @HeikoMaas @welt Grand Hotel Sonnenbichl #Bavaria #Germany #COVID19 😷,Joy
197,@carolinecstark @littlemeanj9 #COVID19 #michiganshutdown #Michigan 😷,Fear
198,# Covid_19 Anxiety and confinement: what are you doing wrong?,Fear


In [49]:
# Merge predictions with sentiment data

# pd.concat(axis=1) aligns the two frames on their index and silently pads
# with NaN when they differ in length, so fail loudly if the predictions were
# produced from a different set of tweets (for a quick trial on a sample,
# predict on a subset and pass the matching sentiment_data.head(n) here).
if len(predictions) != len(sentiment_data):
    raise ValueError(
        "predictions has {} rows but sentiment_data has {}; "
        "re-run the prediction cell on the same tweets".format(
            len(predictions), len(sentiment_data)
        )
    )

data_final = pd.concat([sentiment_data, predictions], axis=1)
# The tweet text is already present in sentiment_data's `tweet` column.
data_final = data_final.drop(columns=['Tweet'])


# Save the new data to another CSV file before moving to the next stage
data_final.to_csv("Emotions.csv")

1. Remove links from all tweets. [Done]
2. The final dataframe: created at, clean links, polarity, emotion, favourites_count, retweet_count, verified
3. Run the model on 2-3 days
4. Visualization
5. Live stream: see how trends change over time