# Importation

In [None]:
from textblob import TextBlob
from wordcloud import WordCloud
import re
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

**We scraped tweets 4 times, on 4 different days, to make a temporal comparison between them.**

In [None]:
# Daily scrape files, in the order they are concatenated below.
csv_paths = [
    "../input/textbt/COVID19_coronavirus.csv",
    "../input/dataset/COVID19_coronavirus_20.csv",
    "../input/dataset/COVID19_coronavirus_21.csv",
    "../input/dataset/COVID19_coronavirus_22.csv",
]

# The first CSV column becomes the index and pandas tries to parse it as dates.
df1, df2, df4, df5 = [
    pd.read_csv(path, index_col=0, parse_dates=True) for path in csv_paths
]

# Stack the four scrapes into a single frame.
pdList = [df1, df2, df4, df5]
df = pd.concat(pdList)

# Cleaning tweets

In [None]:
# Display the concatenated frame.
df

In [None]:
# The tweet text appears to be stored as the repr of a bytes object
# (b'...' or b"..."); unwrap it.
def _strip_bytes_repr(value):
    """Return `value` as str without a leading b' / b" and its matching closing quote.

    Text that does not start with a bytes-literal prefix is returned unchanged
    (as str) instead of losing its first two characters.
    """
    text = str(value)
    if text[:2] in ("b'", 'b"'):
        quote = text[1]
        text = text[2:]
        # Also drop the closing quote, which the plain [2:] slice left behind.
        if text.endswith(quote):
            text = text[:-1]
    return text

df["tweet_text"] = df["tweet_text"].apply(_strip_bytes_repr)

In [None]:
def clean_text(text):
    """Strip Twitter artefacts and escape residue from a tweet.

    Steps, in order:
      1. @mentions (handles may contain underscores),
      2. '#' characters (the hashtag word itself is kept),
      3. the retweet marker "RT" when it is a standalone word,
      4. http:// and https:// URLs, up to the next whitespace or backslash,
      5. leftover "/segment" path fragments,
      6. escape sequences left over from a bytes repr: "\\xNN" hex escapes are
         deleted, "\\n" / "\\r" / "\\t" become a space so neighbouring words do
         not merge, then the two original catch-all patterns run as a fallback,
      7. every digit.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed items may
        leave double spaces.
    """
    text = re.sub(r'@\w+', '', text)  # \w covers '_' so "@user_name" is removed whole
    text = re.sub(r'#', '', text)
    text = re.sub(r'\bRT\s+', '', text)  # \b: do not match "RT" inside words such as "START"
    text = re.sub(r'https?://[^\s\\]+', '', text)  # stop at a backslash so escaped newlines end the URL
    text = re.sub(r'/[A-Za-z0-9]+', '', text)
    text = re.sub(r'\\x[0-9a-fA-F]{2}', '', text)  # e.g. \xe2\x80\xa6, including \x9f-style pairs
    text = re.sub(r'\\[nrt]', ' ', text)  # escaped newline/CR/tab -> space, next word is kept
    text = re.sub(r"\\[a-z][a-z]?[0-9]+", '', text)  # fallback for other letter+digit escapes
    text = re.sub(r"\\[a-z]+", '', text)  # fallback for any remaining backslash escapes
    text = re.sub(r"[0-9]", '', text)
    return text

In [None]:
# Apply the clean_text function to every tweet.
df["tweet_text"] = df["tweet_text"].apply(clean_text)

In [None]:
# Preview the first rows after cleaning.
df.head()

In [None]:
# Drop a single leading ":" (what remains of "RT @user:" once clean_text has
# removed the marker and the mention).
# startswith() replaces indexing [0], which raised IndexError on tweets that
# are empty after cleaning.
df["tweet_text"] = df["tweet_text"].apply(
    lambda x: str(x)[1:] if str(x).startswith(':') else x
)

In [None]:
# Preview the first rows after removing the leading colon.
df.head()

# Removal of stopwords

Stopwords are commonly occurring words in a language, like 'the', 'a' and so on. They can be removed from the text most of the time, as they don't provide valuable information for downstream analysis. In cases like Part of Speech (POS) tagging, we should not remove them, as they provide very valuable information about the POS.

In [None]:
from nltk.corpus import stopwords
# Show all the stopwords in NLTK's English list; stopwords for other languages can be accessed the same way.
", ".join(stopwords.words('english'))

In [None]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    The text is split on whitespace and the kept words are re-joined with single
    spaces. Words are compared in lowercase because the NLTK list is lowercase
    while the text may not have been lowercased yet ("The", "This", ...); kept
    words retain their original casing.
    """
    return " ".join([word for word in str(text).split() if word.lower() not in STOPWORDS])

df["tweet_text"] = df["tweet_text"].apply(remove_stopwords)

In [None]:
# Preview the first rows after stopword removal.
df.head()

# Removal of Frequent words

In [None]:
from collections import Counter

# Tally every whitespace-separated token across all tweets.
cnt = Counter(
    token
    for tweet in df["tweet_text"].values
    for token in tweet.split()
)

cnt.most_common(10)  # the ten most frequent words together with their counts

In [None]:
# The ten most frequent words according to `cnt`.
FREQWORDS = {word for word, _count in cnt.most_common(10)}

def remove_freqwords(text):
    """Drop every whitespace-separated token of `text` that is in FREQWORDS."""
    kept = [token for token in str(text).split() if token not in FREQWORDS]
    return " ".join(kept)

df["tweet_text"] = df["tweet_text"].apply(remove_freqwords)

# Removal of Rare words

In [None]:
n_rare_words = 10
# Take the last n_rare_words entries of the frequency-sorted list (lowest counts).
rarest_first = cnt.most_common()[:-n_rare_words-1:-1]
RAREWORDS = {word for word, _count in rarest_first}

def remove_rarewords(text):
    """Drop every whitespace-separated token of `text` that is in RAREWORDS."""
    kept = [token for token in str(text).split() if token not in RAREWORDS]
    return " ".join(kept)

df["tweet_text"] = df["tweet_text"].apply(remove_rarewords)
df.head()

# Stemming

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    """Stem each whitespace-separated word of `text` with the Porter stemmer and re-join with spaces."""
    stems = (stemmer.stem(token) for token in text.split())
    return " ".join(stems)

df["tweet_text"] = df["tweet_text"].apply(stem_words)

**Note:** if you want the text to produce meaningful words, use lemmatization rather than stemming. Stemming is useful when the words are not going to be read by humans; in applications like chatbots, where words must be readable and understandable to the user, avoid stemming.

# Delete duplicate tweets

In [None]:
# Rows count as duplicates only when every column matches, so the same text in
# rows that differ in another column is kept. Rows with missing values are
# dropped as well.
df = df.drop_duplicates(keep='first').dropna()
df

# Lowercasing text

In [None]:
# Lowercase every tweet with the vectorised string accessor.
df["tweet_text"] = df["tweet_text"].str.lower()

# Remove punctuation

In [None]:
# Strip every character that is neither a word character nor whitespace.
# regex=True is required: from pandas 2.0 on, str.replace treats the pattern as
# a literal string by default, which would leave all punctuation in place.
df["tweet_text"] = df["tweet_text"].str.replace(r'[^\w\s]', '', regex=True)
# Remove newline characters (plain literal replacement).
df["tweet_text"] = df["tweet_text"].str.replace('\n', '', regex=False)

In [None]:
# Display the cleaned text column.
df['tweet_text']

In [None]:
# Turn the "timestamp" index into a regular column.
df = df.reset_index()

In [None]:
# Preview the first rows after resetting the index.
df.head()

In [None]:
# Keep just the day: parse the column and truncate every value to midnight.
# This replaces cutting a fixed number of characters off the string form, which
# yields wrong dates when the rendering has fractional seconds or a UTC offset.
df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.normalize()

In [None]:
# Make sure the "timestamp" column has a datetime dtype
# (the previous step may have left it as strings).
df['timestamp'] = pd.to_datetime(df.timestamp)

In [None]:
# Preview the first rows after the timestamp conversion.
df.head()

In [None]:
# Check the dtype of every column.
df.dtypes

# Natural Language Processing

In [None]:
def Getsubjectivity(text):
    """Return TextBlob's subjectivity score for `text`.

    Subjective sentences generally refer to personal opinion, emotion or judgment.
    """
    blob = TextBlob(text)
    return blob.sentiment.subjectivity


def Getpolarity(text):
    """Return TextBlob's polarity score for `text`.

    Polarity is a float in the range [-1, 1], where 1 means a positive
    statement and -1 means a negative statement.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity


# Score every tweet.
df['subjectivity'] = df['tweet_text'].apply(Getsubjectivity)
df['polarity'] = df['tweet_text'].apply(Getpolarity)

In [None]:
# Display the frame with the new subjectivity and polarity columns.
df

In [None]:
def getanalysis(text):
    """Map a polarity score to 'Negative', 'Neutral' or 'Positive'.

    Despite its name, `text` is the numeric polarity: below zero is 'Negative',
    exactly zero is 'Neutral', and anything else is 'Positive'.
    """
    if text < 0:
        return 'Negative'
    return 'Neutral' if text == 0 else 'Positive'

# Label every tweet from its polarity score, then preview the result.
df['analysis'] = df['polarity'].apply(getanalysis)
df.head()

As we can see, this is not very precise.
For example, the first tweet in the df should be positive, but the algorithm has classified it as neutral.

In [None]:
# List the distinct sentiment labels present in the data.
df['analysis'].unique()

In [None]:
# One sub-frame per sentiment label.
label = df['analysis']
positif = df[label == 'Positive']
Negative = df[label == 'Negative']
Neutral = df[label == 'Neutral']

# Plotting and analysis

In [None]:
# Plot polarity against subjectivity.
# A single vectorised scatter call replaces the per-row loop, which rebuilt two
# full lists on every iteration (quadratic time) and issued one scatter call
# per tweet.
plt.figure(figsize=(8,6))
plt.scatter(df['polarity'], df['subjectivity'], color='Blue')
plt.title('Sentiment analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

In [None]:
# Join the tweets with a space: with an empty separator the last word of one
# tweet and the first word of the next were glued into a single bogus token.
allwords = ' '.join(df['tweet_text'])
wordCloud = WordCloud(width = 500, height = 300, random_state=21, max_font_size=119).generate(allwords)

plt.imshow(wordCloud, interpolation = "bilinear")
plt.axis('off')
plt.show()

We can notice that the most repeated words are not negative, but positive or neutral.

In [None]:
# Number of positive tweets per day.
positif_time = positif.groupby('timestamp')['analysis'].count()
positif_time

In [None]:
# Number of negative tweets per day.
Negative_time = Negative.groupby('timestamp')['analysis'].count()
Negative_time

In [None]:
# Number of neutral tweets per day.
Neutral_time = Neutral.groupby('timestamp')['analysis'].count()
Neutral_time

**We think we can't compare the changes of the amount of negative, positive or neutral sentiments in these 4 days, because we don't have the same quantity of data collected in each day.**

**So we decided to see which is the dominant sentiment in each day.**

In [None]:
# Day labels for the x axis, formatted as YYYY-MM-DD.
# - Sorted, so they line up with the per-day counts (groupby sorts its keys).
# - Formatted with strftime instead of slicing str(value): the length of that
#   string depends on the type unique() returns (numpy datetime64 vs pandas
#   Timestamp), and slicing 19 characters off a Timestamp leaves an empty label.
X = [day.strftime('%Y-%m-%d') for day in sorted(pd.to_datetime(df["timestamp"].unique()))]
X

In [None]:
X_axis = np.arange(len(X))

# Align the three per-day counts on the same sorted set of days. A day with no
# tweets of a given sentiment gets 0 instead of making that series shorter than
# X_axis, which would make plt.bar raise.
all_days = Neutral_time.index.union(Negative_time.index).union(positif_time.index)
neutral_counts = Neutral_time.reindex(all_days, fill_value=0)
negative_counts = Negative_time.reindex(all_days, fill_value=0)
positive_counts = positif_time.reindex(all_days, fill_value=0)

plt.subplots(figsize=(12, 6))
plt.bar(X_axis - 0.2, neutral_counts, 0.2, label = 'Neutral')
plt.bar(X_axis , negative_counts, 0.2, label = 'Negative')
plt.bar(X_axis + 0.2, positive_counts, 0.2, label = 'positif')

plt.xticks(X_axis, X)
plt.xlabel("Days")
plt.ylabel("Sentiments")
plt.title("Changes of sentiments for 4 days")
plt.legend()
plt.show()

In [None]:
total = df.shape[0]

# Share of positive tweets, in percent.
p = (len(positif) / total) * 100
print('Positives percentage ' + str(p))

# Share of negative tweets, in percent.
n = (len(Negative) / total) * 100
print('Negatives percentage ' + str(n))

# Share of neutral tweets, in percent.
x = (len(Neutral) / total) * 100
print('Neutrals percentage ' + str(x))

# Importing tweets from 2020

**We want to make a small comparison between tweets in 2020 and 2021. But we couldn't scrape data from 2020, so we are using this dataset that we found on Kaggle.**

In [None]:
# Load the 2020 tweets dataset.
df_2020 = pd.read_csv('../input/data-2020-1/covid19_tweets.csv')

In [None]:
# Preview the first rows of the 2020 dataset.
df_2020.head()

In [None]:
# Keep only the columns used below. .copy() makes this an independent frame, so
# the column assignments in later cells do not trigger SettingWithCopyWarning.
df_2020 = df_2020[["user_name","date","text"]].copy()

In [None]:
# Display the reduced 2020 frame.
df_2020

In [None]:
# Clean the 2020 text with the helpers defined for the scraped tweets.
df_2020["text"] = df_2020["text"].apply(clean_text)
# regex=True is required: from pandas 2.0 on, str.replace treats the pattern as
# a literal string by default, which would leave all punctuation in place.
df_2020["text"] = df_2020["text"].str.replace(r'[^\w\s]', '', regex=True)
df_2020["text"] = df_2020["text"].apply(lambda x : x.lower())
# Remove stopwords before stemming: the Porter stemmer rewrites some of them
# (e.g. "this" -> "thi"), after which they no longer match the stopword list.
df_2020["text"] = df_2020["text"].apply(remove_stopwords)
df_2020["text"] = df_2020["text"].apply(stem_words)


In [None]:
from collections import Counter

# Tally every whitespace-separated token across the 2020 tweets.
cnt = Counter(
    token
    for tweet in df_2020["text"].values
    for token in tweet.split()
)

cnt.most_common(10)  # the ten most frequent words together with their counts

In [None]:
# Rebuild FREQWORDS from the 2020 counter first: remove_freqwords reads the
# module-level FREQWORDS, which otherwise still holds the words computed from
# `df`, so the 2020 counts would never be used.
FREQWORDS = {word for word, _count in cnt.most_common(10)}
df_2020["text"] = df_2020["text"].apply(remove_freqwords)

In [None]:
# Rebuild RAREWORDS from the 2020 counter first: remove_rarewords reads the
# module-level RAREWORDS, which otherwise still holds the words computed from
# `df`.
RAREWORDS = {word for word, _count in cnt.most_common()[:-n_rare_words-1:-1]}
df_2020["text"] = df_2020["text"].apply(remove_rarewords)

In [None]:
# Drop fully identical rows (first occurrence kept), then rows with missing values.
df_2020 = df_2020.drop_duplicates(keep='first').dropna()

In [None]:
# Display the de-duplicated 2020 frame.
df_2020

In [None]:
# Keep just the day: parse the column and truncate every value to midnight.
# This replaces cutting a fixed number of characters off the string form, which
# yields wrong dates when the rendering has fractional seconds or a UTC offset.
df_2020['date'] = pd.to_datetime(df_2020['date']).dt.normalize()
df_2020.head()

In [None]:
# One sub-frame per day, for four consecutive days of July 2020.
day = df_2020["date"]
df_2020_1 = df_2020[day == '2020-07-25']
df_2020_2 = df_2020[day == '2020-07-26']
df_2020_3 = df_2020[day == '2020-07-27']
df_2020_4 = df_2020[day == '2020-07-28']
df_2020_4.head()

In [None]:
# Cap every day at its first 1830 tweets.
df_2020_1 = df_2020_1.head(1830)
df_2020_2 = df_2020_2.head(1830)
df_2020_3 = df_2020_3.head(1830)
df_2020_4 = df_2020_4.head(1830)

In [None]:
# Stack the four capped daily samples back into a single frame and display it.
pdList = [df_2020_1, df_2020_2,df_2020_3 ,df_2020_4]
df_2020 = pd.concat(pdList)
df_2020

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as a column.
df_2020 = df_2020.reset_index(drop=True)
df_2020

In [None]:
# Strip every character that is neither a word character nor whitespace.
# regex=True is required: from pandas 2.0 on, str.replace treats the pattern as
# a literal string by default, which would leave all punctuation in place.
df_2020["text"] = df_2020["text"].str.replace(r'[^\w\s]', '', regex=True)
# Remove newlines and any remaining "<" (plain literal replacements).
df_2020["text"] = df_2020["text"].str.replace('\n', '', regex=False)
df_2020["text"] = df_2020["text"].str.replace('<', '', regex=False)
df_2020

In [None]:
# Score every 2020 tweet with TextBlob, then label it from its polarity.
df_2020['subjectivity'] = df_2020["text"].apply(Getsubjectivity)
df_2020['polarity'] = df_2020["text"].apply(Getpolarity)
df_2020['analysis'] = df_2020["polarity"].apply(getanalysis)
df_2020.head()

In [None]:
# One sub-frame per sentiment label.
label_2020 = df_2020['analysis']
positif_2020 = df_2020[label_2020 == 'Positive']
Negative_2020 = df_2020[label_2020 == 'Negative']
Neutral_2020 = df_2020[label_2020 == 'Neutral']

# Plot and analysis

In [None]:
# Plot polarity against subjectivity for the 2020 tweets.
# A single vectorised scatter call replaces the per-row loop, which rebuilt two
# full lists on every iteration (quadratic time) and issued one scatter call
# per tweet.
plt.figure(figsize=(8,6))
plt.scatter(df_2020['polarity'], df_2020['subjectivity'], color='Blue')
plt.title('Sentiment analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

In [None]:
# Join the tweets with a space: with an empty separator the last word of one
# tweet and the first word of the next were glued into a single bogus token.
allwords = ' '.join(df_2020['text'])
wordCloud = WordCloud(width = 500, height = 300, random_state=21, max_font_size=119).generate(allwords)

plt.imshow(wordCloud, interpolation = "bilinear")
plt.axis('off')
plt.show()

In [None]:
# Number of positive 2020 tweets per day.
positif_time_2020 = positif_2020.groupby('date')['analysis'].count()
positif_time_2020

In [None]:
# Number of negative 2020 tweets per day.
Negative_time_2020 = Negative_2020.groupby('date')['analysis'].count()
Negative_time_2020

In [None]:
# Number of neutral 2020 tweets per day.
Neutral_time_2020 = Neutral_2020.groupby('date')['analysis'].count()
Neutral_time_2020

In [None]:
# Day labels for the x axis, formatted as YYYY-MM-DD.
# - Sorted, so they line up with the per-day counts (groupby sorts its keys).
# - Formatted with strftime instead of slicing str(value): the length of that
#   string depends on the type unique() returns (numpy datetime64 vs pandas
#   Timestamp), and slicing 19 characters off a Timestamp leaves an empty label.
X = [day.strftime('%Y-%m-%d') for day in sorted(pd.to_datetime(df_2020["date"].unique()))]
X

In [None]:
X_axis = np.arange(len(X))

# Align the three per-day counts on the same sorted set of days. A day with no
# tweets of a given sentiment gets 0 instead of making that series shorter than
# X_axis, which would make plt.bar raise.
all_days_2020 = Neutral_time_2020.index.union(Negative_time_2020.index).union(positif_time_2020.index)
neutral_counts_2020 = Neutral_time_2020.reindex(all_days_2020, fill_value=0)
negative_counts_2020 = Negative_time_2020.reindex(all_days_2020, fill_value=0)
positive_counts_2020 = positif_time_2020.reindex(all_days_2020, fill_value=0)

plt.subplots(figsize=(12, 6))
plt.bar(X_axis - 0.2, neutral_counts_2020, 0.2, label = 'Neutral')
plt.bar(X_axis , negative_counts_2020, 0.2, label = 'Negative')
plt.bar(X_axis + 0.2, positive_counts_2020, 0.2, label = 'positif')

plt.xticks(X_axis, X)
plt.xlabel("Days")
plt.ylabel("Sentiments")
plt.title("Changes of sentiments for 4 days")
plt.legend()
plt.show()

In [None]:
# Display the per-day neutral counts for 2020 again.
Neutral_time_2020

In [None]:
# Sentiment shares of the 2020 sample, in percent.
# The prints now show p1 / n1 / x1; they used to print p / n / x, i.e. the
# figures computed from `df`, so the 2020 values were never displayed.

# percentage of positive tweets
p1 = (positif_2020.shape[0]/df_2020.shape[0])*100
print('Positives percentage '+ str(p1))
# percentage of negative tweets
n1 = (Negative_2020.shape[0]/df_2020.shape[0])*100
print('Negatives percentage '+ str(n1))
# percentage of neutral tweets
x1 = (Neutral_2020.shape[0]/df_2020.shape[0])*100
print('Neutrals percentage '+ str(x1))

In [None]:
# Pair up the percentages per sentiment. In each list the first element comes
# from the scraped tweets (`df`) and the second from `df_2020`.
l1 =[p, p1]  # positive
l2= [n, n1]  # negative
l3= [x, x1]  # neutral

# 2020 vs 2021

In [None]:
# Year label for each position of l1 / l2 / l3. Those lists hold the value
# computed from the scraped tweets (`df`, the 2021 data) first and the value
# from `df_2020` second, so the labels must be in that order too; with
# ['2020', '2021'] every percentage was attributed to the wrong year.
index = ['2021', '2020']
df1 = pd.DataFrame({'Percentage': l1, 'date': index})  # positive
df2 = pd.DataFrame({'Percentage': l2, 'date': index})  # negative
df3 = pd.DataFrame({'Percentage': l3, 'date': index})  # neutral
df1

In [None]:
# Turn each frame into a Series of percentages indexed by 'date'
# (groupby sorts the keys, so the values come out in ascending label order).
df1 = df1.groupby('date').Percentage.sum()
df2 = df2.groupby('date').Percentage.sum()
df3 = df3.groupby('date').Percentage.sum()

In [None]:
# Grouped bar chart comparing the sentiment shares of the two years.
# Fixes two labels: the legend entry read 'Poisitive', and the x axis was
# labelled "Days" although each group is a year.
X = ['2020','2021']
X_axis = np.arange(len(X))

plt.subplots(figsize=(12, 6))
plt.bar(X_axis - 0.2, df1, 0.2, label = 'Positive')
plt.bar(X_axis , df2, 0.2, label = 'Negative')
plt.bar(X_axis + 0.2 , df3, 0.2, label = 'Neutral')
plt.xticks(X_axis, X)
plt.xlabel("Year")
plt.ylabel("Percentage")
plt.title("2020 vs 2021")
plt.legend()
plt.show()