# WELCOME TO THE NOTEBOOK
------------------

### Importing the Modules

In [2]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import re
import string

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
nltk.download('stopwords')
nltk.download('vader_lexicon')


from collections import Counter

from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns
import plotly.express as px

# Set seaborn's global plot style to "darkgrid" for the charts drawn below.
sns.set(style="darkgrid")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


ModuleNotFoundError: No module named 'plotly'

### Importing the Dataset

In [None]:
# Location of the raw tweets CSV that the rest of the notebook works on.
DATA_URL = 'https://raw.githubusercontent.com/gabrielpreda/covid-19-tweets/master/covid19_tweets.csv'

# Download and parse the CSV, then preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

Let's check the shape of the DataFrame.

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

Let's select only the columns needed for this project.

In [None]:
# Keep only the columns this analysis uses.
needed_columns = ['user_name', 'date', 'text']

# Take an explicit copy: columns of this subset are reassigned further down,
# and assigning into a bare selection makes pandas emit
# SettingWithCopyWarning.
df = df[needed_columns].copy()
df.head()

Change the data types of some columns.

In [None]:
# Replace each user name with its integer category code.
df['user_name'] = df['user_name'].astype('category').cat.codes

# Parse the timestamps and keep only the calendar date part.
df['date'] = pd.to_datetime(df['date']).dt.date
df.head()

### Picking out the tweet texts

In [None]:
# The "text" column of df: one entry per tweet.
texts = df.text
texts

### Removing URLs from tweets

In [None]:
def remove_url(x):
    """Return ``str(x)`` with every URL-like token replaced by one space.

    A token is treated as a URL when it starts with ``http``, so both
    ``http://`` and ``https://`` links are removed. The input is passed
    through ``str`` first, so non-string values are stringified rather
    than raising.
    """
    return re.sub(r'http\S+', ' ', str(x))
# Run remove_url on every tweet text and display the result.
texts_lr =texts.apply(remove_url)
texts_lr

### Converting all tweets to lowercase

In [None]:
def to_lower(x):
    """Return the lowercase form of ``x`` by calling ``x.lower()``."""
    return x.lower()
# Lowercase every tweet produced by the previous step and display the result.
texts_lr_lc = texts_lr.apply(to_lower)
texts_lr_lc

### Removing punctuation

In [None]:
# Built once and reused for every tweet: maps each character of
# string.punctuation to "delete".
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_puncs(x):
    """Return ``x`` with every character in ``string.punctuation`` removed."""
    return x.translate(_PUNCT_TABLE)
# Strip punctuation from every lowercased tweet and display the result.
texts_lr_lc_np = texts_lr_lc.apply(remove_puncs)
texts_lr_lc_np

### Removing stopwords

In [None]:
# Domain-specific words to drop in addition to NLTK's English stop words.
more_words = ['covid','#coronavirus', '#coronavirusoutbreak', '#coronavirusPandemic', '#covid19', '#covid_19', '#epitwitter', '#ihavecorona', 'amp', 'coronavirus', 'covid19']

# The corpus file id is lowercase; 'English' only resolves on
# case-insensitive filesystems.
stop_words = set(stopwords.words('english'))

# The tweets have already been lowercased and stripped of punctuation, so the
# extra words must be normalised the same way or entries such as
# '#coronavirusPandemic' and '#covid_19' could never match a token.
_strip_punct = str.maketrans('', '', string.punctuation)
stop_words.update(word.lower().translate(_strip_punct) for word in more_words)


def remove_words(x):
    """Return ``x`` with every whitespace-separated token in stop_words dropped.

    The remaining tokens are joined back together with single spaces.
    """
    return ' '.join(word for word in x.split() if word not in stop_words)


texts_lr_lc_np_ns = texts_lr_lc_np.apply(remove_words)
texts_lr_lc_np_ns

### Let's create one big list of words from all the tweets

In [None]:
# Flatten the cleaned tweets into one list of whitespace-separated tokens.
words_list = []
for tweet in texts_lr_lc_np_ns:
    words_list.extend(tweet.split())

# Peek at the first five tokens.
words_list[:5]

In [None]:
# The 50 most frequent tokens as (word, count) pairs.
word_counts = Counter(words_list).most_common(50)

# Name the columns in the constructor: this also works when word_counts is
# empty, where assigning .columns afterwards raises a length-mismatch error.
words_df = pd.DataFrame(word_counts, columns=['word', 'frq'])

# Bar chart of word frequency.
px.bar(words_df, x='word', y='frq', title='Most common')


### Put the cleaned text back into the main DataFrame

In [None]:
# Overwrite the raw tweet text with the cleaned version.
df['text'] = texts_lr_lc_np_ns
df.head()

# Sentiment Analysis 

Getting the polarity scores for each tweet

In [None]:
# One analyzer instance is shared by all tweets.
sid = SentimentIntensityAnalyzer()

# Shorthand for the analyzer's scoring method.
ps = sid.polarity_scores

# Score each cleaned tweet; every element is the value returned by
# polarity_scores for that tweet.
sentiment_scores = df['text'].apply(ps)
sentiment_scores

In [None]:
# Expand the per-tweet score records into one column per score.
# Reuse the index of sentiment_scores (which comes from df) instead of a fresh
# 0..n-1 index, so the index-based join with df further down still lines up
# row-for-row even if df's index is not a plain range.
sentiment_df = pd.DataFrame(list(sentiment_scores), index=sentiment_scores.index)
sentiment_df.head()

### Labeling the scores based on the compound polarity value

In [None]:
def labalize(x, threshold=0.0):
    """Map a compound polarity score to a sentiment label.

    Args:
        x: Compound polarity score.
        threshold: Non-negative half-width of the neutral band. Scores with
            ``abs(x) <= threshold`` are 'neutral'. The default of 0.0 labels
            only an exact 0 as neutral.
            NOTE(review): VADER's documentation suggests 0.05 as the usual
            cut-off; confirm before changing the default.

    Returns:
        'neutral', 'positive' or 'negative'.
    """
    if abs(x) <= threshold:
        return 'neutral'
    return 'positive' if x > 0 else 'negative'
# Add a 'label' column by applying labalize to each compound score.
sentiment_df['label'] = sentiment_df.compound.apply(labalize)
sentiment_df.head()

### Let's join the two DataFrames

In [None]:
# Attach each tweet's sentiment label to the tweet table (joined on index).
label_column = sentiment_df['label']
data = df.join(label_column)
data.head()

### Plotting the sentiment score counts

In [None]:
# Number of tweets per sentiment label.
# The output column names are pinned explicitly because a bare
# value_counts().reset_index() names them differently across pandas versions
# ('index'/'label' before 2.0, 'label'/'count' from 2.0 on). Here the 'index'
# column always holds the sentiment label and the 'label' column its count.
counts_df = (
    data.label.value_counts()
    .rename_axis('index')
    .reset_index(name='label')
)
counts_df


In [None]:
# Bar chart of counts_df: categories come from its 'index' column and bar
# heights from its 'label' column.
sns.barplot(data=counts_df, x='index', y='label')

In [None]:
# Preview the joined tweet/label table again before aggregating it.
data.head()

In [None]:
# Count tweets (non-null user_name values) for each (date, label) pair and
# flatten the result into the columns date, label, counts.
data_agg = (
    data.groupby(['date', 'label'])['user_name']
    .count()
    .reset_index(name='counts')
)
data_agg.head()

In [None]:
# One line per sentiment label showing its daily tweet count.
# y must be given explicitly: without it the 'counts' column is never plotted.
px.line(
    data_agg,
    x='date',
    y='counts',
    color='label',
    title='daily tweets sentimental Analysis',
)