In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [101]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re

import seaborn as sns
import plotly.express as px

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

import nltk
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

<IPython.core.display.Javascript object>

### Data Cleaning

In [3]:
# had to open original csv from kaggle and save with UTF-8 encoding
tweets = pd.read_csv("data/Corona_NLP_train.csv", encoding="latin1")

<IPython.core.display.Javascript object>

In [4]:
tweets.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


<IPython.core.display.Javascript object>

In [5]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


<IPython.core.display.Javascript object>

In [6]:
# I want to keep location as a variable of interest
# tweets = tweets.dropna(subset=["Location"])

# there was no structure to this variable at ALL, completely useless as
# a variable to group.
# tweets["Location"].value_counts()

<IPython.core.display.Javascript object>

In [7]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


<IPython.core.display.Javascript object>

In [8]:
" ".join(tweets["OriginalTweet"])[:2000]

'@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8 advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak https://t.co/bInCA9Vp8P My food stock is not the only one which is empty...\r\r\n\r\r\nPLEASE, don\'t panic, THERE WILL BE ENOUGH FOOD FOR EVERYONE if you do not take more than you need. \r\r\nStay calm, stay safe.\r\r\n\r\r\n#COVID19france #COVID_19 #COVID19 #coronavirus #confinement #Confinementotal #ConfinementGeneral https://t.co/zrlG0Z520j Me, ready to go at supermarket during the #COVID19 outbreak.\r\r\n\r\r\nNot because I\'m paranoid, but because my food stock is litteraly empty. The #coronavirus is a serious thing, but please, 

<IPython.core.display.Javascript object>

Just scanning through the raw collection of tweets, some obvious patterns appear:
* \r\r\n pattern
* lots of tweets have links
* "Â" appears before apostrophe's for some reason
  * conveniently, can just do a .replace
* Hastags
  * could separate them out, make a count, just strip the 

In [9]:
# how I was approaching the data cleaning, but it wasn't appearing to change anything when ran.
# I cracked a kaggle notebook to get an idea and put in the function linked below

# tweets_df["OriginalTweet"] = tweets_df["OriginalTweet"].apply(
#     lambda x: str.replace(x, "Â", "")
# )
# tweets_df["OriginalTweet"] = tweets_df["OriginalTweet"].apply(
#     lambda x: str.replace(x, "\\r\\r\\n", "")
# )

# tweets_df["OriginalTweet"] = tweets_df["OriginalTweet"].replace("Â", "")
# tweets_df["OriginalTweet"].replace("\n", " ", inplace=True)
# tweets_df["OriginalTweet"].replace("\r", "", inplace=True)
# tweets_df["OriginalTweet"] = tweets_df["OriginalTweet"].apply(lambda x: "".join(x))

<IPython.core.display.Javascript object>

In [10]:
# https://www.kaggle.com/shahraizanwar/covid19-tweets-sentiment-prediction-rnn-85-acc
def data_cleaner(tweet):
    
    # remove urls
    tweet = re.sub(r'http\S+', ' ', tweet)
    
    # remove html tags
    tweet = re.sub(r'<.*?>',' ', tweet)
    
    # remove digits
    tweet = re.sub(r'\d+',' ', tweet)
    
    # remove hashtags
    tweet = re.sub(r'#\w+',' ', tweet)
    
    # remove mentions
    tweet = re.sub(r'@\w+',' ', tweet)
    
    # remove some special characters?
    tweet = tweet.replace('\x92', "")
    
    # replace Â's with apostrophe
    tweet = tweet.replace('Â', "'")
      
    # removing stop words
    # note: this step also gets rid of new lines and excess spaces
    tweet = tweet.split()
    tweet = " ".join([word for word in tweet if not word in stop_words])
    
    return tweet


stop_words = stopwords.words('english')

tweets['CleanedTweets'] = tweets['OriginalTweet'].apply(data_cleaner)


<IPython.core.display.Javascript object>

In [86]:
def preprocess(docs):
    # lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    preprocessed = []

    for doc in docs:
        tokenized = word_tokenize(doc)

        cleaned = [
            stemmer.stem(token.lower())
            for token in tokenized
            if not token.lower() in stopwords.words("english")
            if token.isalpha()
        ]

        untokenized = " ".join(cleaned)
        preprocessed.append(untokenized)

    return preprocessed

<IPython.core.display.Javascript object>

In [11]:
# " ".join(tweets["CleanedTweets"][:1000])

<IPython.core.display.Javascript object>

In [90]:
all_tweets_no_stops = preprocess(all_tweets)

<IPython.core.display.Javascript object>

After filtering for only tweets with a location, there no remaining missing values. 

In [81]:
positive_tweets = " ".join(
    tweets.loc[tweets["Sentiment"] == "Positive", "CleanedTweets"]
)
# positive_tweets = nlp(positive_tweets)
# need more memory, boooo

extremly_positive_tweets = " ".join(
    tweets.loc[tweets["Sentiment"] == "Extremely Positive", "CleanedTweets"]
)
# extremly_positive_tweets =

neutral_tweets = " ".join(tweets.loc[tweets["Sentiment"] == "Neutral", "CleanedTweets"])

extremly_negative_tweets = " ".join(
    tweets.loc[tweets["Sentiment"] == "Extremely Negative", "CleanedTweets"]
)

negative_tweets = " ".join(
    tweets.loc[tweets["Sentiment"] == "Negative", "CleanedTweets"]
)

all_tweets = [
    extremly_positive_tweets,
    positive_tweets,
    neutral_tweets,
    negative_tweets,
    extremly_negative_tweets,
]

<IPython.core.display.Javascript object>

### EDA

In [13]:
tweets["tweet_length"] = tweets["CleanedTweets"].apply(len)

<IPython.core.display.Javascript object>

In [24]:
fig = px.histogram(tweets, "tweet_length", color="Sentiment",)
fig.show()

<IPython.core.display.Javascript object>

It appears there is no relationship between tweet length and sentiment.

In [38]:
sentiment_counts = tweets["Sentiment"].value_counts()
fig = px.bar(y=sentiment_counts.values, x=sentiment_counts.index)
fig.show()

<IPython.core.display.Javascript object>

In [78]:
nlp = spacy.load("en_core_web_sm")

<IPython.core.display.Javascript object>

Extracting 

In [70]:
r_pos = Rake()
# r.
r_pos.extract_keywords_from_text(positive_tweets)
positive_key_phrases = r_pos.get_ranked_phrases_with_scores()
# positive_key_phrases[:10]

<IPython.core.display.Javascript object>

In [82]:
pos_freq = r_pos.frequency_dist.most_common(n=20)
pos_freq

[('covid', 2395),
 ('prices', 1923),
 ('store', 1848),
 ('supermarket', 1708),
 ('grocery', 1663),
 ('food', 1641),
 ('people', 1389),
 ('consumer', 1214),
 ('online', 1004),
 ('shopping', 980),
 ('like', 886),
 ('get', 816),
 ('hand', 774),
 ('need', 717),
 ('sanitizer', 710),
 ('workers', 687),
 ('help', 673),
 ('pandemic', 656),
 ('us', 646),
 ('demand', 635)]

<IPython.core.display.Javascript object>

In [72]:
r_neg = Rake()
# r.
r_neg.extract_keywords_from_text(negative_tweets)
negative_key_phrases = r_neg.get_ranked_phrases_with_scores()
# negative_key_phrases[:10]

<IPython.core.display.Javascript object>

In [83]:
neg_freq = r_neg.frequency_dist.most_common(n=20)
neg_freq

[('prices', 2239),
 ('covid', 2128),
 ('food', 1867),
 ('supermarket', 1557),
 ('store', 1435),
 ('people', 1378),
 ('grocery', 1282),
 ('consumer', 884),
 ('demand', 849),
 ('panic', 757),
 ('shopping', 640),
 ('get', 630),
 ('online', 613),
 ('pandemic', 591),
 ('need', 581),
 ('time', 563),
 ('oil', 540),
 ('due', 517),
 ('us', 516),
 ('buying', 515)]

<IPython.core.display.Javascript object>

In [74]:
r_ex_neg = Rake()
# r.
r_ex_neg.extract_keywords_from_text(extremly_negative_tweets)
ex_negative_key_phrases = r_ex_neg.get_ranked_phrases_with_scores()
# negative_key_phrases[:10]

<IPython.core.display.Javascript object>

In [84]:
ex_neg_freq = r_neg.frequency_dist.most_common(n=20)
ex_neg_freq

[('prices', 2239),
 ('covid', 2128),
 ('food', 1867),
 ('supermarket', 1557),
 ('store', 1435),
 ('people', 1378),
 ('grocery', 1282),
 ('consumer', 884),
 ('demand', 849),
 ('panic', 757),
 ('shopping', 640),
 ('get', 630),
 ('online', 613),
 ('pandemic', 591),
 ('need', 581),
 ('time', 563),
 ('oil', 540),
 ('due', 517),
 ('us', 516),
 ('buying', 515)]

<IPython.core.display.Javascript object>

In [76]:
r_ex_pos = Rake()
# r.
r_ex_pos.extract_keywords_from_text(extremly_positive_tweets)
ex_positive_key_phrases = r_pos.get_ranked_phrases_with_scores()
# positive_key_phrases[:10]

<IPython.core.display.Javascript object>

In [85]:
ex_pos_freq = r_ex_pos.frequency_dist.most_common(n=20)
ex_pos_freq

[('covid', 1348),
 ('store', 1145),
 ('supermarket', 969),
 ('grocery', 929),
 ('food', 922),
 ('help', 899),
 ('prices', 894),
 ('hand', 854),
 ('people', 826),
 ('sanitizer', 747),
 ('like', 723),
 ('consumer', 680),
 ('online', 645),
 ('workers', 624),
 ('shopping', 617),
 ('please', 600),
 ('us', 534),
 ('need', 495),
 ('get', 486),
 ('time', 476)]

<IPython.core.display.Javascript object>

Just by looking at the most frequent words for each category, there isn't much difference except for 'panic' seeming to appear in the negative sentiments and 'help' appearing in the positive sentiments. A more rigorous analysis will include TF-IDF, as follows.

In [112]:
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(tweets["CleanedTweets"])

<IPython.core.display.Javascript object>

In [113]:
type(vectorized)

scipy.sparse.csr.csr_matrix

<IPython.core.display.Javascript object>

Hopefully we find 5 groups as optimal, since there were 5 groups manually labeled. Even if a different amount of optimal clusters is found, key words can be extracted from each to see what topics the tweets differ on. 

In [116]:
model = KMeans()
visualizer = KElbowVisualizer(model, k=(10, 20), metric="silhouette")

visualizer.fit(vectorized)

KeyboardInterrupt: 

<IPython.core.display.Javascript object>

In [None]:
# num_topics from optimal number of clusters from elbow_viz above

# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.feature_extraction.text import CountVectorizer

# num_keywords = 5
# num_topics = 14

# vectorizer = CountVectorizer()
# model = LatentDirichletAllocation(n_components=num_topics, learning_method='online')

# vectorized = vectorizer.fit_transform(docs_clean)
# model.fit_transform(vectorized)

In [None]:
# get the keywords for each group from LDA


# results = [[(vectorizer.get_feature_names()[i], topic[i])
#              for i in topic.argsort()[:-num_keywords - 1:-1]]
#              for topic in model.components_]

# topics = [[x[0] for x in i] for i in results]
# topics

### Final thoughts

Just on the surface, it seems there isn't much difference in the most frequent words for the pre-assigned categories of extremely positive, positive, neutral, negative, and extremly negative. However, we do see 'panic' appearing more frequently in the negative tweets, and words related to 'helping' appearing in the positive tweets. Getting the Latent Dirichlet Allocation or similar topic modeling would allow us to possibly create more accurate groups. Since we have pre-labeled data, evaluating supervised learning models would be another way to test how meaningful the premade-labels are. Analysis is currently limited by run time. 