## **NLP based model to perform sentiment analysis on the corona virus tweets**

# **Step 1: Importing Packages**

In [None]:
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Text data
import re
import unicodedata
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model

# Model
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn import metrics

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Step 2: Data Loading**

In [None]:
# Both CSV files live in the same Google Drive folder, so the folder is named once.
# NOTE(review): the folder name ends with a space ("..._PS4 ") and the recorded output of
# this cell is a FileNotFoundError - confirm the path and that Drive is mounted.
DATA_DIR = "/content/drive/MyDrive/SCAAI_Drive_Ankita Mandal/SCAAI_Drive_Ankita Mandal_PS4 "

# latin_1 maps every possible byte value onto the first 256 Unicode code points.
df_train = pd.read_csv(DATA_DIR + "/Corona_NLP_train.csv", encoding="latin_1")
df_test = pd.read_csv(DATA_DIR + "/Corona_NLP_test.csv", encoding="latin_1")

FileNotFoundError: ignored

Copy of training data to preserve the original dataset.

In [None]:
# Snapshot of the training frame as loaded, taken before any column is modified.
df_train_original = df_train.copy() 

In [None]:
# Display the training DataFrame.
df_train

In [None]:
# (rows, columns) of the training set; 41157 tweets were reported for this dataset.
df_train.shape

Copying test data to preserve the original data set


In [None]:
# Snapshot of the test frame as loaded, taken before any column is modified.
df_test_original=df_test.copy()

In [None]:
# Display the test DataFrame.
df_test

In [None]:
# (rows, columns) of the test set; 3798 tweets were reported for this dataset.
df_test.shape

**Data Overview**

In [None]:
# Font sizes applied to axes labels and titles from here on.
parameters = {'axes.labelsize': 20,
              'axes.titlesize': 30}
plt.rcParams.update(parameters)

# Single-panel figure, 8 x 4 inches.
fig, ax = plt.subplots()
fig.set_size_inches(8, 4)

# Number of tweets per sentiment label, smallest class first.
sentiment_counts = (
    df_train["Sentiment"]
    .reset_index()
    .groupby("Sentiment")
    .count()
    .rename(columns={"index": "Count"})
    .sort_values(by="Count")
)
bars = sentiment_counts.plot(kind="barh", legend=False, ax=ax)
bars.grid(axis='x')

for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=12)
ax.set_ylabel("")
ax.set_title("Tweet sentiment count", color ="#292421", fontsize= 16)
fig.tight_layout(pad=2.0)
plt.rcParams.update(parameters)


**Converting the above 5 sentiment categories into 3 categories** 

i.e., "Extremely Positive" and "Extremely Negative" are merged into "Positive" and "Negative" respectively.

In [None]:
def set_3_classes(x):
  """Collapse the five sentiment labels into three.

  "Extremely Negative" becomes "Negative" and "Extremely Positive" becomes
  "Positive"; any other value is returned unchanged.
  """
  if x == "Extremely Positive":
    return "Positive"
  if x == "Extremely Negative":
    return "Negative"
  return x

In [None]:
# Merge the "Extremely ..." labels into their base class on both splits.
for sentiment_frame in (df_train, df_test):
    sentiment_frame["Sentiment"] = sentiment_frame["Sentiment"].apply(set_3_classes)

Plotting the changes in sentiment classes

In [None]:
fig, ax = plt.subplots()
fig.suptitle("Count", fontsize=16)

# Tweets per sentiment class after the merge, smallest class first.
class_counts = df_train["Sentiment"].reset_index().groupby("Sentiment").count()
class_counts = class_counts.sort_values(by="index")
count_axes = class_counts.plot(kind="barh", legend=False, ax=ax)
count_axes.grid(axis='x')
plt.show()

In [None]:
# Share of each of the three sentiment classes in the training set.
labels = ['Negative', 'Neutral', 'Positive']
sizes = [(df_train['Sentiment'] == label).sum() for label in labels]

# `sizes` and `labels` are passed directly, so no `data=` argument is needed.
plt.pie(sizes, labels=labels, autopct='%1.2f%%', shadow=True, startangle=90)
plt.title("Sentiments percentages in train data")
plt.axis("equal")  # equal aspect ratio keeps the pie circular
plt.show()

# **Step 3: Data Preprocessing**

 **Cleaning Training dataset**

In [None]:
# Working copy of the raw text; the cleaning steps below overwrite this column
# while OriginalTweet is left as loaded.
df_train["CleanTweet"] = df_train["OriginalTweet"]
# Show 10 random rows.
df_train.sample(10)

*We only need 4 columns: Location, TweetAt, OriginalTweet and Sentiment*

In [None]:
# Drop the first two columns (selected by position) on both splits.
# Note: df_train already carries the CleanTweet column created in the previous
# cell, so it keeps one column more than df_test at this point.
df_train = df_train.iloc[:,2:]
df_test = df_test.iloc[:,2:]

In [None]:
# Preview the first rows after the column selection.
df_train.head()

**Removing end-of-line, tabulation and carriage return. Turning into lower case**

In [None]:
def clean_eol_tabs(df, label):
    """Lowercase a text column and turn line breaks and tabs into spaces.

    Newline, carriage-return and tab characters are each replaced by a single
    space. Column ``label`` of ``df`` is overwritten in place and the same
    frame is returned.
    """
    def _flatten(text):
        for control_char in ("\n", "\r", "\t"):
            text = text.replace(control_char, " ")
        return text

    df[label] = df[label].str.lower().apply(_flatten)
    return df

# Apply to the training tweets.
df_train = clean_eol_tabs(df_train, "CleanTweet")

**Removing e-mails**

In [None]:
def remove_emails(df, label):
    """Replace every e-mail address in a text column with a single space.

    The pattern's letter classes are lower-case only. Column ``label`` of
    ``df`` is overwritten in place and the same frame is returned.
    """
    email_pattern = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

    def _strip_emails(text):
        return re.sub(email_pattern, " ", text)

    df[label] = df[label].apply(_strip_emails)
    return df

# Apply to the training tweets.
df_train = remove_emails(df_train, "CleanTweet")

**Removing mentions**

In [None]:
def remove_mentions(df, label):
    """Replace Twitter mentions ("@" followed by a handle) with a single space.

    A handle is 1 to 100 characters out of letters, digits, "_", "." and "-".
    Column ``label`` of ``df`` is overwritten in place and the same frame is
    returned.
    """
    mention = re.compile(r"@([a-zA-Z0-9_.-]{1,100})")
    df[label] = df[label].map(lambda text: mention.sub(" ", text))
    return df

# Apply to the training tweets.
df_train = remove_mentions(df_train, "CleanTweet")

**Removing hyperlinks**

In [None]:
def remove_hyperlinks(df, label):
    """Replace hyperlinks with a single space.

    A hyperlink is "http" followed by any run of non-whitespace characters.
    Column ``label`` of ``df`` is overwritten in place and the same frame is
    returned.
    """
    link = re.compile(r"http\S+")
    df[label] = df[label].map(lambda text: link.sub(" ", text))
    return df

# Apply to the training tweets.
df_train = remove_hyperlinks(df_train, "CleanTweet")

**Removing hashtags**

In [None]:
def remove_hashtags(df, label):
    """Replace hashtags ("#" followed by word characters) with a single space.

    The whole hashtag, including its word, is removed. Column ``label`` of
    ``df`` is overwritten in place and the same frame is returned.
    """
    hashtag = re.compile(r"#\w+")
    df[label] = df[label].map(lambda text: hashtag.sub(" ", text))
    return df

# Apply to the training tweets.
df_train = remove_hashtags(df_train, "CleanTweet")

**Removing html tags**

In [None]:
def remove_html_tags(df, label):
    """Replace html tags with a single space.

    A tag is the shortest span from "<" to the next ">". Column ``label`` of
    ``df`` is overwritten in place and the same frame is returned.
    """
    tag = re.compile(r"<.*?>")
    df[label] = df[label].map(lambda text: tag.sub(" ", text))
    return df

# Apply to the training tweets.
df_train = remove_html_tags(df_train, "CleanTweet")

**Removing numbers**

In [None]:
def remove_numbers(df, label):
    """Replace every run of digits with a single space.

    Column ``label`` of ``df`` is overwritten in place and the same frame is
    returned.
    """
    digits = re.compile(r"\d+")
    df[label] = df[label].map(lambda text: digits.sub(" ", text))
    return df
# Apply to the training tweets.
df_train = remove_numbers(df_train, "CleanTweet")

**Encode unknown characters**

In [None]:
def encode_unknown(df, label):
    """Reduce a text column to plain ASCII.

    Each text is NFD-normalized, encoded to ASCII with unencodable characters
    ignored, and decoded back to ``str``. Column ``label`` of ``df`` is
    overwritten in place and the same frame is returned.
    """
    def _to_ascii(text):
        decomposed = unicodedata.normalize("NFD", text)
        return decomposed.encode('ascii', 'ignore').decode("utf-8")

    df[label] = df[label].apply(_to_ascii)
    return df

# Apply to the training tweets.
df_train = encode_unknown(df_train, "CleanTweet")

**Removing punctuations and special characters**

In [None]:
def clean_punctuation_no_accent(df, label):
    """Replace every character that is neither a word character nor whitespace
    with a single space.

    Intended for languages without accents, e.g. English. Column ``label`` of
    ``df`` is overwritten in place and the same frame is returned.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    df[label] = df[label].map(lambda text: not_word_or_space.sub(' ', text))
    return df

# Apply to the training tweets.
df_train = clean_punctuation_no_accent(df_train, "CleanTweet")

**Removing one- and two-letter words, removing unnecessary spaces, dropping empty lines**

In [None]:
def more_cleaning(df, label):
    """Final tidy-up of a text column.

     1) replaces one- and two-character words with a space
     2) collapses runs of spaces/tabs into one single space
     3) drops rows whose text is then empty or a single character

    Column ``label`` of the input frame is overwritten in place; the returned
    frame is a re-indexed copy without the dropped rows.
    """
    short_word = re.compile(r'\b\w{1,2}\b')
    space_run = re.compile(r"[ \t]{2,}")

    def _tidy(text):
        text = short_word.sub(" ", text)
        text = space_run.sub(" ", text)
        # nothing left, or one lone character: mark the row for removal
        return np.nan if len(text) <= 1 else text

    df[label] = df[label].apply(_tidy)
    return df.dropna(subset=[label], axis=0).reset_index(drop=True).copy()

# Apply to the training tweets; rows left without text are dropped here.
df_train = more_cleaning(df_train, "CleanTweet")

 ***Lexical Analysis***

**Tokenization**

In [None]:
# Whitespace-split each cleaned training tweet into a list of tokens.
# The result lives in its own Series; the CleanTweet column is not changed.
tokenized_tweet = df_train['CleanTweet'].apply(lambda x: x.split())
tokenized_tweet.head()

**Removing Stop Words**

In [None]:
def remove_stop_words(text, stopwords=set(stopwords.words('english'))):
    """Drop stop words from a text.

    inputs:
     - text: string whose words are separated by single spaces
     - stopwords: collection of words to drop; the default is the NLTK English
       list, built once when this function is defined

    returns the remaining words joined by single spaces.
    """
    kept_words = [token for token in text.split(" ") if token not in stopwords]
    return " ".join(kept_words)

def clean_stopwords(df, label):
    """Remove stop words from every text of column ``label``.

    Uses remove_stop_words with its default stop-word list. The column is
    overwritten in place and the same frame is returned.
    """
    without_stop_words = df[label].apply(remove_stop_words)
    df[label] = without_stop_words
    return df
# Apply to the training tweets.
df_train = clean_stopwords(df_train, "CleanTweet")

***Syntactic Analysis***

**Stemming**

In [None]:
from nltk import PorterStemmer

ps = PorterStemmer()

# Stem every token of the tokenized training tweets.
# NOTE(review): only `tokenized_tweet` is stemmed; the CleanTweet column is untouched
# and no later cell in this notebook reads `tokenized_tweet` before it is reassigned.
tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])

tokenized_tweet.head()

**Lemmatization**

In [None]:
def lemmatize_one_text(text):
    """Lemmatize every word of a text with the WordNet lemmatizer.

    The text is split on single spaces, each word is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the results
    are joined back with single spaces.

    inputs:
     - text: string to lemmatize
    returns:
     - the lemmatized string
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in text.split(" "))

def lemmatize(df, label):
    """Lemmatize every text of column ``label`` with lemmatize_one_text.

    The column is overwritten in place and the same frame is returned.
    """
    lemmatized = df[label].apply(lemmatize_one_text)
    df[label] = lemmatized
    return df
# Apply to the training tweets.
df_train = lemmatize(df_train, "CleanTweet")

In [None]:
# Show 10 random rows to compare OriginalTweet with CleanTweet.
df_train.sample(10)

***Similarly applying cleaning techniques on Test data***

In [None]:
# Clean the test tweets with the same steps, in the same order, as the training
# tweets. Stop words are removed BEFORE lemmatization, exactly as on the training
# set, so both splits go through identical preprocessing.
df_test["CleanTweet"] = df_test["OriginalTweet"]

test_cleaning_steps = (
    clean_eol_tabs,
    remove_emails,
    remove_mentions,
    remove_hyperlinks,
    remove_hashtags,
    remove_html_tags,
    remove_numbers,
    encode_unknown,
    clean_punctuation_no_accent,
    more_cleaning,
    clean_stopwords,
    lemmatize,
)
for cleaning_step in test_cleaning_steps:
    df_test = cleaning_step(df_test, "CleanTweet")


**Tokenization**

In [None]:
# Whitespace-split each cleaned test tweet into a list of tokens.
# This reassigns `tokenized_tweet`, replacing the training tokens held so far.
tokenized_tweet = df_test['CleanTweet'].apply(lambda x: x.split())
tokenized_tweet.head()

**Removing Stop Words**

In [None]:
# remove_stop_words / clean_stopwords are already defined in the training
# section of this notebook; they are reused here instead of being redefined
# with an identical body.
df_test = clean_stopwords(df_test, "CleanTweet")

**Stemming**

In [None]:
from nltk import PorterStemmer

ps = PorterStemmer()

# Stem every token of the tokenized test tweets.
# NOTE(review): only `tokenized_tweet` is stemmed; the CleanTweet column is untouched
# and no later cell in this notebook reads `tokenized_tweet`.
tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])

tokenized_tweet.head()

**Lemmatization**


In [None]:
# Show 10 random rows of the cleaned test set.
df_test.sample(10)

**Before/after cleaning on several tweets**

In [None]:
import random

# Pick a random ROW position: shape[0] is the number of rows (shape[1] would be the
# number of columns), and randrange excludes its stop value, so the position is
# always valid for iloc.
tweet_num = random.randrange(df_train.shape[0])
sampled_row = df_train.iloc[tweet_num]

print("############################# Original Tweet #############################")
print(sampled_row.at["OriginalTweet"])
print("\n")
print("############################# Clean Tweet ################################")
print(sampled_row.at["CleanTweet"])

# **Step 4: Data Visualization with the help of word cloud**

**Word cloud in each sentiment category**

In [None]:
# One long space-joined string of cleaned training tweets per sentiment class,
# used as word-cloud input.
clean_by_sentiment = {
    sentiment: " ".join(df_train.loc[df_train["Sentiment"] == sentiment, "CleanTweet"])
    for sentiment in ("Positive", "Neutral", "Negative")
}
all_words_positive = clean_by_sentiment["Positive"]
all_words_neutral = clean_by_sentiment["Neutral"]
all_words_negative = clean_by_sentiment["Negative"]

In [None]:
# Settings shared by the three clouds; only the colormap differs per sentiment.
cloud_options = dict(width=800, height=600, max_font_size=120, background_color="white")

wordcloud_positive = WordCloud(colormap="Greens", **cloud_options).generate(all_words_positive)
wordcloud_neutral = WordCloud(colormap="Blues", **cloud_options).generate(all_words_neutral)
wordcloud_negative = WordCloud(colormap="Reds", **cloud_options).generate(all_words_negative)

In [None]:
parameters = {'axes.labelsize': 12,
              'axes.titlesize': 10}

# One row of three panels, one word cloud per sentiment class.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(18.5, 7)

panels = (
    (ax1, wordcloud_positive, "WordCloud of positive tweets"),
    (ax2, wordcloud_neutral, "WordCloud of neutral tweets"),
    (ax3, wordcloud_negative, "WordCloud of negative tweets"),
)
for panel_ax, cloud, panel_title in panels:
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.axis("off")
    panel_ax.set_title(panel_title, fontsize=12)

plt.rcParams.update(parameters)
plt.show()

# **Step 5: Model Creation and training**

**Sentiment Encoding**

In [None]:
# Separate copies, so that label encoding does not touch df_train / df_test.
df_train_encoded, df_test_encoded = df_train.copy(), df_test.copy()

for split_name, split_frame in (("train", df_train_encoded), ("test", df_test_encoded)):
    print(f"{split_name} set shape: {split_frame.shape}")

In [None]:
# Integer code of each sentiment class.
map_sentiment = {"Neutral":0, "Positive":1,"Negative":2}

for encoded_frame in (df_train_encoded, df_test_encoded):
    encoded_frame['Sentiment'] = encoded_frame['Sentiment'].map(map_sentiment)

In [None]:
# Target Preparation
y_train = df_train['Sentiment'].copy()
y_test = df_test['Sentiment'].copy()

y_train_encoded = to_categorical(df_train_encoded['Sentiment'], 3)
y_test_encoded = to_categorical(df_test_encoded['Sentiment'], 3)

y_train_mapped = df_train_encoded['Sentiment'].copy()
y_test_mapped = df_test_encoded['Sentiment'].copy()

X_train = df_train_encoded[['CleanTweet']].copy()
X_test = df_test_encoded[['CleanTweet']].copy()

**Tokens, sequence and padding**

->key = word

->value = unique number

 *a. Tokens*

In [None]:
# The word index is fitted on the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train["CleanTweet"])
# Number of distinct words plus one.
# NOTE(review): the +1 presumably accounts for index 0 being reserved for padding
# (the Embedding layer below is built with mask_zero=True) - confirm.
vocab_length = len(tokenizer.word_index) + 1
vocab_length

In [None]:
# texts_to_sequences function first transforms a text into list of words
X_train = tokenizer.texts_to_sequences(X_train["CleanTweet"])
X_test = tokenizer.texts_to_sequences(X_test["CleanTweet"])

In [None]:
# texts_to_sequences function will be a list of list of numbers of varying length, since different tweets have different lengths
print("First tweet encoded:")
print(X_train[0])
print("\nSecond tweet encoded:")
print(X_train[1])
print("\nThird tweet encoded:")
print(X_train[2])
print("\nFourth tweet encoded:")
print(X_train[3])
print("\nFifth tweet encoded:")
print(X_train[4])

**Maximum no.of words in one tweet**

In [None]:
# Length of every encoded training tweet, and the longest of them (0 if there are none).
word_count = [len(encoded_tweet) for encoded_tweet in X_train]
max_word_count = max(word_count, default=0)
print("Maximum number of word in one tweet: " + str(max_word_count) + " words")

*b. Padding*

In [None]:
# pad the sequences with a maximum length of 37 since the max word count is 37
X_train = pad_sequences(X_train, maxlen=max_word_count, padding='post')
X_test = pad_sequences(X_test, maxlen=max_word_count, padding='post')
X_train.shape

In [None]:
# All encoded tweets are of the same length
print("First tweet encoded:", "Size = ", len(X_train[0]))
print(X_train[0])
print("\nSecond tweet encoded:", "Size = ", len(X_train[1]))
print(X_train[1])
print("\nThird tweet encoded:", "Size = ", len(X_train[2]))
print(X_train[2])
print("\nFourth tweet encoded:", "Size = ", len(X_train[3]))
print(X_train[3])
print("\nFifth tweet encoded:", "Size = ", len(X_train[4]))
print(X_train[4])

**LSTM**

Long Short-Term Memory (LSTM) networks are a type of recurrent neural network capable of learning order dependence in sequence prediction problems.

In [None]:
# Embedding -> LSTM -> three ReLU dense layers -> 3-way softmax (one unit per sentiment class).
model_LSTM = Sequential()
# One 32-dimensional vector per vocabulary index; inputs are the padded sequences of length max_word_count.
model_LSTM.add(layers.Embedding(vocab_length, output_dim=32, input_length=max_word_count, mask_zero=True))
model_LSTM.add(layers.LSTM(100))
model_LSTM.add(layers.Dense(64, activation="relu"))
model_LSTM.add(layers.Dense(32, activation="relu"))
model_LSTM.add(layers.Dense(16, activation="relu"))
model_LSTM.add(layers.Dense(3, activation='softmax'))
# categorical_crossentropy matches the to_categorical targets built earlier.
model_LSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model_LSTM.summary()

In [None]:
# restore_best_weights to True so that the weights of best score on monitored metric
#val_accuracy i.e accuracy on test set - are restored when training stops
es = EarlyStopping(patience=10, monitor='val_accuracy', restore_best_weights=True)
history = model_LSTM.fit(X_train,
                         y_train_encoded,
                         validation_data=(X_test, y_test_encoded),
                         epochs=5,
                         batch_size=16,
                         verbose=1,
                         callbacks=[es]
                        )

In [None]:
# Training vs. validation loss per epoch; labels and legend tell the two curves apart.
plt.plot(history.history['loss'], label='training loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.title('LOSS',fontdict={'size':'22'})
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

# **Step 6: Accuracy on test data set**

*For LSTM Algo*

In [None]:
# Prediction on the test data set: `predicted` holds the model output per tweet,
# `y_pred` the index of the highest value along the last axis, i.e. the predicted
# class code.
predicted = model_LSTM.predict(X_test)
y_pred = predicted.argmax(axis=-1)

**Accuracy and Area Under (ROC) Curve - AUC - scores**

In [None]:
# Accuracy compares the integer test labels with the predicted class codes.
acc_score = accuracy_score(y_test_mapped, y_pred)
# AUC uses the full model output (not the argmax) in one-vs-rest mode.
auc_score = roc_auc_score(y_test_mapped, predicted, multi_class="ovr")

In [None]:
# classification_report pairs target_names with the label values in the given
# order, so the display names must follow the integer codes of map_sentiment
# (0, 1, 2) and not the order in which labels happen to appear in the test set.
class_names = sorted(map_sentiment, key=map_sentiment.get)
class_codes = [map_sentiment[name] for name in class_names]
report = classification_report(y_test_mapped, y_pred, labels=class_codes,
                               target_names=class_names, output_dict=True)

# Metric rows of the report (precision, recall, f1-score, support); the overall
# scores are shown on the last row only, the other rows stay blank.
metric_names = list(report[class_names[0]].keys())
blank_cells = [""] * (len(metric_names) - 1)
accuracy_col = pd.Series(blank_cells + [round(acc_score, 2)], index=metric_names)
roc_auc_col = pd.Series(blank_cells + [round(auc_score, 2)], index=metric_names)

df_report = pd.DataFrame(report)[class_names + ["macro avg", "weighted avg"]].apply(lambda x: round(x, 2))
df_report["accuracy"] = accuracy_col
df_report["roc_auc"] = roc_auc_col
df_report

**From the report generated above we can infer that 85% accuracy is obtained on test data set.**

# **Applying Machine Learning Models**

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the cleaned training frame with a fixed random_state.
# Stratification means that the train_test_split method returns training and test
# subsets that have the same proportions of class labels as the input dataset.
train,valid = train_test_split(df_train,test_size = 0.2,random_state=0,stratify = df_train.Sentiment.values)
print("train shape : ", train.shape)
print("valid shape : ", valid.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Bag-of-words counts; the vocabulary is learned on the training part only and
# re-used unchanged for the validation part.
stop = list(stopwords.words('english'))
vectorizer = CountVectorizer(decode_error = 'replace',stop_words = stop)

# NOTE: X_train / y_train are re-bound here. From this cell on they hold the
# bag-of-words features and string labels of this split, no longer the padded
# sequences and labels used by the LSTM section.
X_train = vectorizer.fit_transform(train.CleanTweet.values)
X_valid = vectorizer.transform(valid.CleanTweet.values)

y_train = train.Sentiment.values
y_valid = valid.Sentiment.values

print("X_train.shape : ", X_train.shape)
print("X_valid.shape : ", X_valid.shape)
print("y_train.shape : ", y_train.shape)
print("y_valid.shape : ", y_valid.shape)

**a. Naive Bayes Classifier for MULTICLASS Classification**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the bag-of-words counts.
naiveByes_clf = MultinomialNB()

naiveByes_clf.fit(X_train,y_train)

NB_prediction = naiveByes_clf.predict(X_valid)
NB_accuracy = accuracy_score(y_valid,NB_prediction)
print("training accuracy Score    : ",naiveByes_clf.score(X_train,y_train))
print("Validation accuracy Score : ",NB_accuracy )
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall would be exchanged for every class.
print(classification_report(y_valid,NB_prediction))

**b. Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the bag-of-words counts. random_state is fixed (same value as
# the train/validation split) so the score in the comparison table is reproducible.
rf_clf = RandomForestClassifier(random_state=0)

rf_clf.fit(X_train,y_train)

rf_prediction = rf_clf.predict(X_valid)
rf_accuracy = accuracy_score(y_valid,rf_prediction)
print("Training accuracy Score    : ",rf_clf.score(X_train,y_train))
print("Validation accuracy Score : ",rf_accuracy )
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall would be exchanged for every class.
print(classification_report(y_valid,rf_prediction))

**c. Support vector machine**

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings on the bag-of-words counts.
svc = SVC()

svc.fit(X_train, y_train)

svc_prediction = svc.predict(X_valid)
svc_accuracy = accuracy_score(y_valid,svc_prediction)
print("Training accuracy Score    : ",svc.score(X_train,y_train))
print("Validation accuracy Score : ",svc_accuracy )
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall would be exchanged for every class.
print(classification_report(y_valid,svc_prediction))

# **ML Model Comparison**

In [None]:
# The three scores were computed on the validation split held out from the
# training data, so the column is named accordingly.
models = pd.DataFrame({
    'Model': ['Support Vector Machines',
              'Random Forest', 'Naive Bayes',],
    'Validation accuracy': [svc_accuracy, rf_accuracy, NB_accuracy,]})

# Best model first.
models.sort_values(by='Validation accuracy', ascending=False)