In [None]:
import re
import pandas as pd 
import numpy as np
from datetime import datetime
import dateutil.parser
from tqdm.auto import tqdm 
import os
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import seaborn as sns 
import plotly.offline as pyo 
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import scattertext as st
from IPython.display import IFrame
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import random 
import warnings
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from spellchecker import SpellChecker




# Fetch the NLTK resources used by this notebook: the VADER lexicon (needed by
# SentimentIntensityAnalyzer) and the stop-word corpus.
for _nltk_resource in ('vader_lexicon', 'stopwords'):
    nltk.download(_nltk_resource)

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

In [6]:
# Folder that holds the raw tweet CSV exports.
directory = os.path.join('.', 'data')

# Sorted so the row order of the combined frame is reproducible;
# os.listdir() returns entries in arbitrary order.
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No .csv files found in {directory!r}")

# Read each CSV exactly once. The files used to be loaded by a list
# comprehension and then appended a second time by a loop, which duplicated
# every row in the combined dataframe.
df_list = [
    pd.read_csv(os.path.join(directory, csv_file))
    for csv_file in csv_files
]

# For sentiment analysis
sia = SIA()

# To identify misspelled words
spell = SpellChecker()

# To display plotly graphs
pyo.init_notebook_mode()

# Combine all CSV datasets into a single dataframe. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each file's own row labels.
df = pd.concat(df_list, ignore_index=True)

Preprocess data

In [7]:
def clean_tweet_text(text: str) -> str:
    """Normalise one raw tweet into lower-case, space-separated word tokens.

    Steps, in order: lower-case; drop @handles; drop #hashtags; drop URLs;
    keep only ``\\w+`` runs (removes punctuation / special characters);
    drop stand-alone single ASCII letters; collapse whitespace.

    Example: ``"This is a test"`` -> ``"this is test"``.
    """
    text = text.lower()
    # Remove twitter handles
    text = re.sub(r'@[^\s]+', '', text)
    # Remove hashtags
    text = re.sub(r'\B#\S+', '', text)
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove all the special characters
    text = ' '.join(re.findall(r'\w+', text))
    # Remove stand-alone single letters. They are replaced by a space (not by
    # the empty string) so the neighbouring words are not glued together, and
    # the lookarounds leave the surrounding whitespace unconsumed so leading,
    # trailing and consecutive single letters are all caught.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)
    # Collapse runs of whitespace and trim the ends
    return re.sub(r'\s+', ' ', text).strip()


data = df.copy()
# Keep the untouched tweet next to the cleaned version.
data['original_tweet'] = df['text']

# Parse the raw 'tweet_created' strings into datetime objects.
data['datetime'] = data['tweet_created'].apply(dateutil.parser.parse)

# Retweet flag; computed before cleaning because the "RT @" match is
# case-sensitive and the cleaning step lower-cases and strips handles.
rt_mask = data['text'].apply(lambda x: "RT @" in x)

# Standard tweet preprocessing
data['text'] = data['text'].apply(clean_tweet_text)

# Convert the 'datetime' column to pandas datetime and remove the timezone
# information (tz_localize(None) keeps the local wall-clock time).
data['datetime'] = pd.to_datetime(data['datetime']).dt.tz_localize(None)


# Viewing the preprocessed data
data.head()

In [None]:
def label_sentiment(x: float, neg_threshold: float = -0.05, pos_threshold: float = 0.35) -> str:
    """Map a compound sentiment score to a categorical label.

    Parameters
    ----------
    x : float
        Compound sentiment score to classify.
    neg_threshold : float, default -0.05
        Scores strictly below this value are labelled ``'negative'``.
    pos_threshold : float, default 0.35
        Scores strictly above this value are labelled ``'positive'``.

    Returns
    -------
    str
        ``'negative'``, ``'positive'`` or ``'neutral'``.

    Notes
    -----
    Both comparisons are strict, so a score exactly equal to a threshold is
    ``'neutral'``. NaN compares False against both thresholds and therefore
    also falls through to ``'neutral'``.
    The default cut-offs are asymmetric; pass ``pos_threshold`` explicitly to
    use a different positive cut-off.
    """
    if x < neg_threshold:
        return 'negative'
    if x > pos_threshold:
        return 'positive'
    return 'neutral'

# Feature Extraction
# Token-level features derived from the cleaned tweet text.
data['words'] = data['text'].str.findall(r'\w+')
data['errors'] = data['words'].apply(spell.unknown)  # tokens the spell checker does not recognise
data['errors_count'] = data['errors'].apply(len)
data['words_count'] = data['words'].apply(len)
data['sentence_length'] = data['text'].str.len()  # length in characters, not words

# Calendar features, taken through the vectorized .dt accessor instead of a
# Python-level lambda per row.
data['hour'] = data['datetime'].dt.hour
data['date'] = data['datetime'].dt.date
data['month'] = data['datetime'].dt.month
data['year'] = data['datetime'].dt.year

# Extract Sentiment Values for each tweet
# NOTE(review): the score is computed on the cleaned 'text' column (lower-cased,
# punctuation removed). VADER also uses capitalisation and punctuation as
# intensity cues, so scoring 'original_tweet' may give different results --
# confirm which input is intended.
data['sentiment'] = [
    sia.polarity_scores(txt)['compound']
    for txt in tqdm(data['text'], desc="Sentiment pass")
]
data['overall_sentiment'] = data['sentiment'].apply(label_sentiment)
