In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import pandas_profiling
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# **Loading Dataset**

In [None]:
# Location of the tweet CSV inside the Kaggle input directory
DATA_PATH = '/kaggle/input/omicron-rising/omicron.csv'
df = pd.read_csv(DATA_PATH)

# **Examining Dataset**

In [None]:
# Render the pandas_profiling overview report for the whole frame.
# NOTE(review): pandas_profiling has reportedly been superseded by ydata-profiling —
# confirm the kernel's installed version still provides this accessor.
df.profile_report()

In [None]:
# Preview the first rows to check that the CSV parsed as expected
df.head()

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns
df.describe()

In [None]:
# Total number of missing cells across the whole frame
df.isna().sum().sum()

In [None]:
# Number of missing cells in each column
df.isna().sum()

**A preliminary look at the data shows that the dataset contains 78168 rows and 16 columns. About 3.6% of the values are missing, all of them in the 'user_location', 'user_description' and 'hashtags' columns. There are 6 numerical, 8 categorical and 2 boolean columns.**

# **Exploratory Data Analysis**

# Missing Values

In [None]:
# Share of missing vs. present values per column.
# sns.displot is a figure-level function: it always builds its own figure, so no
# plt.figure() call precedes it (one would only be rendered as an empty figure).
sns.displot(
    data=df.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=3,
    palette='BuGn'
)
# The pyplot calls below act on the axes that displot just created (the current axes).
plt.title('Bar plot showing Missing Values in training data', weight = 'bold', size = 20, color = 'black')
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(size = 12, weight = 'bold', color = 'black')
plt.yticks(size = 12, weight = 'bold', color = 'black');

# Heatmap of the transposed null mask: one row per column of df, one cell per record.
plt.figure(figsize=(18,10))
sns.heatmap(df.isna().transpose(),
            cmap="copper",
            cbar_kws={'label': 'Missing Data'})
plt.title('Heatmap showing Missing Values in training data', weight = 'bold', size = 20, color = 'brown')
plt.xticks(size = 12, color = 'maroon')
plt.yticks(size = 12, color = 'maroon')
plt.show()

# Correlation matrix

In [None]:
# Correlate only the numeric and boolean columns. Since pandas 2.0, DataFrame.corr()
# no longer drops text columns silently and raises on them instead; selecting the
# columns explicitly behaves the same on old and new pandas versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), square=True, cmap="YlGnBu")

**From this we can see that favourites and retweets are highly correlated, which makes sense. user_verified and user_followers are also highly correlated.**

# Some Other Plots

In [None]:
import warnings
# Silence every warning for the rest of the session (all categories, all modules).
# NOTE(review): this also hides deprecation and SettingWithCopy warnings that
# later cells may raise.
warnings.filterwarnings('ignore')

In [None]:
# Convert the date column to datetime so the .dt accessor can be used
df['date'] = pd.to_datetime(df['date'])

# Number of tweets per hour of day (zero-padded hour string), ordered by hour
tweets_per_hr = (
    df['date'].dt.strftime('%H')
    .value_counts()
    .sort_index()
    .to_frame(name='Count')
    .assign(Hour=lambda counts: counts.index)
)

# Bar chart with each bar annotated with its count
plt.figure(figsize=(12,7))
ax = sns.barplot(data=tweets_per_hr, x='Hour', y='Count', palette='inferno')
ax.bar_label(ax.containers[0])
plt.title('Tweets per hour', size='xx-large')
plt.show()

In [None]:
# Keep only the rows where user_location holds a string (i.e. a location was given)
location = [loc for loc in df['user_location'] if isinstance(loc, str)]

# Match country names as whole words so that e.g. "Niger" is not found inside
# "Nigeria", nor "India" inside "Indiana". Longer names are listed first in the
# alternation so that "Papua New Guinea" wins over "Guinea" at the same position.
country_names_by_length = sorted(
    (country.name for country in pycountry.countries), key=len, reverse=True
)
country_pattern = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(name) for name in country_names_by_length)
    + r')(?!\w)'
)

# One entry per (location, country) pair; dict.fromkeys drops repeated mentions
# of the same country within a single location while preserving order.
country_name = [
    name
    for loc in location
    for name in dict.fromkeys(country_pattern.findall(loc))
]
country_name[:5]

In [None]:
# Tally how many times each country was matched
count = {}
for country in country_name:
    if country in count:
        count[country] += 1
    else:
        count[country] = 1

# Tabulate the tally and keep the 15 countries with the most tweets
country_df = (
    pd.DataFrame(list(count.items()), columns=['Country', 'Tweets Count'])
    .sort_values(by='Tweets Count', ascending=False)
    .head(15)
)

# Bar chart of the top countries, each bar annotated with its count
plt.figure(figsize=(20,8))
plt.title('Country vs Tweets Count', size='xx-large')
ax = sns.barplot(data=country_df, x='Country', y='Tweets Count', palette='inferno', edgecolor='grey')
ax.bar_label(ax.containers[0])
plt.show()

# **Preprocessing**

In [None]:
def preprocess_tweet_text(tweet, stop_words=None, lemmatizer=None):
    """Normalise a raw tweet for text analysis.

    Steps, in order: lower-case, strip URLs, strip @mentions and '#' characters,
    strip punctuation, lemmatize every space-separated token, drop stop words.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a new ``WordNetLemmatizer``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Tokens come from splitting
        on single spaces, so empty tokens left behind by removed URLs or mentions
        are kept (they show up as consecutive spaces).
    """
    # Build the defaults once per call rather than once per word.
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # str.lower() returns a new string; the result must be assigned. The stop-word
    # test below is case-sensitive, so this has to happen before it.
    tweet = tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Lemmatize, then drop stop words
    lemmas = [lemmatizer.lemmatize(word) for word in tweet.split(' ')]
    kept = [word for word in lemmas if word not in stop_words]

    return " ".join(kept)

In [None]:
# Replace each raw tweet with its cleaned form
df['text'] = df['text'].map(preprocess_tweet_text)

In [None]:
# Inclusive word-count bounds for the tweets kept for sentiment analysis
MIN_WORDS = 3
MAX_WORDS = 16

word_count = [len(text.split()) for text in df['text']]
df['word_count'] = word_count

# Keep tweets with 3 to 16 words. .copy() makes the filtered frame independent
# of the original, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df['word_count'].between(MIN_WORDS, MAX_WORDS)].copy()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
SIA = SentimentIntensityAnalyzer()

# Score every tweet once and reuse the result for all three columns
# (previously each tweet was scored three times, once per column).
polarity = [SIA.polarity_scores(tweet) for tweet in df["text"]]
df["Positive"] = [scores["pos"] for scores in polarity]
df["Neutral"] = [scores["neu"] for scores in polarity]
df["Negative"] = [scores["neg"] for scores in polarity]

# Compact view: the text next to its three VADER component scores
df1 = df[["text", "Positive","Neutral", "Negative"]]
df1.head()

In [None]:
def _dominant_sentiment(text):
    """Return the key ('neg', 'neu' or 'pos') with the highest VADER score, ignoring 'compound'."""
    scores = SIA.polarity_scores(text)
    components = {label: value for label, value in scores.items() if label != 'compound'}
    return max(components, key=components.get)


# Label each tweet with its strongest sentiment component
sentiments_nltk = [_dominant_sentiment(tweet) for tweet in df['text']]

df['sentiment_nltk'] = sentiments_nltk
df['sentiment_nltk'].value_counts()

In [None]:
# Preview the frame, now including the sentiment columns
df.head()

In [None]:
# Show only a sample: the full list has one entry per tweet and would flood the output
sentiments_nltk[:10]