Import libraries

In [3]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style

style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

ModuleNotFoundError: No module named 'textblob'

Import Dataset

In [None]:
# Load the tweets CSV into a DataFrame (path is relative to the working directory)
dataset = pd.read_csv("vaccination_tweets.csv")

Preview Dataset and more information about the dataset

In [None]:
# A notebook cell only auto-displays its final expression, so the shape and
# the preview are printed explicitly; info() writes its own report.
print(dataset.shape)
print(dataset.head())
dataset.info()

In [None]:
#count the null (missing) values in each column of the dataset
dataset.isnull().sum()


In [None]:
#create a data set that consists only of the tweet text
#all column names (printed explicitly: a notebook cell only auto-displays
#its final expression)
print(dataset.columns)
#select the 'text' column directly; dropping an empty list of columns
#removes nothing. copy() keeps the new frame independent of `dataset`.
text_dataset = dataset[['text']].copy()
text_dataset.head()

In [None]:
# Print the first five raw tweets, each followed by an empty line
for row in range(5):
    print(text_dataset['text'].iloc[row], "\n")

Text Processing

In [None]:
# Print the frame summary before the text is cleaned
text_dataset.info()


def data_processing(text):
    """Return a cleaned, stop-word-free version of one tweet.

    Steps, in order: lowercase, strip URLs, strip @mentions and '#' signs,
    strip remaining punctuation, tokenize, drop English stop words.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    #process lowercase
    text = text.lower()
    #remove url in tweets: both http:// and https:// links, plus bare www. links
    text = re.sub(r"https?://\S+|www\.\S+", '', text)
    #remove whole @mentions and the '#' sign (the hashtag word itself is kept)
    text = re.sub(r'@\w+|#', '', text)
    #remove punctuation: anything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return " ".join(filtered_text)

In [None]:
#Run the preprocessing function over every tweet and store the result
text_dataset['text'] = text_dataset['text'].apply(data_processing)

In [None]:
#Drop rows whose cleaned text repeats an earlier row
text_dataset = text_dataset.drop_duplicates(subset=['text'])

Stemming

In [None]:
#Porter Stemmer: one shared instance, used by stemming()
stemmer = PorterStemmer()


def stemming(data):
    """Return *data* with every whitespace-separated word stemmed.

    Args:
        data: A string of space-separated tokens, as produced by
            ``data_processing``.

    Returns:
        The stemmed words joined by single spaces.
    """
    # Split into words first: iterating the string directly would stem
    # single characters. The stemmed result (not the input) is returned.
    # NOTE(review): polarity is later computed from this column; stemmed
    # tokens may score differently in TextBlob — confirm this is intended.
    return " ".join(stemmer.stem(word) for word in data.split())

In [None]:
#Apply stemming to the processed data
text_dataset['text'] = text_dataset['text'].apply(stemming)

In [None]:
#Display the first rows of the processed data
text_dataset.head()

In [None]:
#Print the first five tweets again to check the effect of the preprocessing
for row in range(5):
    print(text_dataset['text'].iloc[row], "\n")

In [None]:
#print the frame summary again after cleaning and de-duplication
text_dataset.info()

In [None]:
#calculate the polarity of the sentences
def polarity(text):
    """Return the TextBlob sentiment polarity of *text*."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [None]:
#add a 'polarity' column holding polarity() of each processed tweet
text_dataset['polarity'] = text_dataset['text'].apply(polarity)

In [None]:
# Preview the first ten rows, now including the polarity column
text_dataset.head(10)

Data Frame

In [None]:
#Add sentiment column to the data frame
def sentiment(label):
    """Map a numeric polarity score to a sentiment name.

    Returns "Positive" for scores above zero, "Negative" for scores below
    zero, "Neutral" for exactly zero, and None when no comparison holds
    (for example NaN).
    """
    if label > 0:
        return "Positive"
    if label < 0:
        return "Negative"
    if label == 0:
        return "Neutral"
    return None

In [None]:
#Add a 'sentiment' column by applying sentiment() to each polarity score
text_dataset['sentiment'] = text_dataset['polarity'].apply(sentiment)

In [None]:
#Preview the data frame with the new sentiment column
text_dataset.head()

In [None]:
#Visualize how many tweets fall into each sentiment label using a count plot
fig = plt.figure(figsize=(5, 5))
sns.countplot(x='sentiment', data=text_dataset)

Pie chart

In [None]:
#visualize the share of each sentiment label in a pie chart
fig = plt.figure(figsize=(7, 7))
colors = ("yellowgreen", "gold", "red")
wp = {'linewidth': 2, 'edgecolor': "black"}
# One wedge per sentiment label, sized by the number of tweets with that label
tags = text_dataset['sentiment'].value_counts()
# NOTE(review): colors and explode each hold exactly three entries and are
# paired with the wedges in value_counts() order, not by label name —
# confirm all three labels occur and that the color pairing is as intended
explode = (0.1, 0.1, 0.1)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title('Distribution of sentiments')

In [None]:
#top 5 positive tweets: keep the Positive rows, highest polarity first
pos_tweets = (
    text_dataset.loc[text_dataset['sentiment'] == 'Positive']
    .sort_values(by='polarity', ascending=False)
)
pos_tweets.head()

In [None]:
#Word cloud built from the text of every positive tweet
text = ' '.join(pos_tweets['text'])
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.figure(figsize=(20, 15), facecolor='None')
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Most frequent words in positive tweets', fontsize=19)
plt.axis("off")
plt.show()

In [None]:
#top 5 tweets in negative sentiments
neg_tweets = text_dataset[text_dataset.sentiment == 'Negative']
#sort ascending so the most negative polarity comes first; a descending
#sort would put the least negative tweets at the top of head()
neg_tweets = neg_tweets.sort_values(['polarity'], ascending=True)
neg_tweets.head()

In [None]:
#Word cloud built from the text of every negative tweet
text = ' '.join(neg_tweets['text'])
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.figure(figsize=(20, 15), facecolor='None')
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Most frequent words in Negative tweets', fontsize=19)
plt.axis("off")
plt.show()

In [None]:
#first 5 tweets with neutral sentiment
neutral_tweets = text_dataset[text_dataset.sentiment == 'Neutral']
# NOTE(review): sentiment() labels only polarity == 0 as Neutral, so this
# sort presumably has nothing to order — confirm it is still wanted
neutral_tweets = neutral_tweets.sort_values(['polarity'], ascending=False)
neutral_tweets.head()

In [None]:
#visualize the neutral tweets using a word cloud
#join the text of every neutral tweet into one string for the cloud
text = ' '.join(neutral_tweets['text'])
plt.figure(figsize=(20, 15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in Neutral tweets', fontsize=19)
plt.show()

#Vectorize the data using CountVectorizer

In [None]:
#fit a count vectorizer over unigrams and bigrams on all processed tweets
vect = CountVectorizer(ngram_range=(1,2)).fit(text_dataset['text'])

In [None]:
#Display the number of features and print the first 20
#get_feature_names() is gone from current scikit-learn releases; use its
#replacement and fall back only where the new method does not exist
try:
    feature_names = vect.get_feature_names_out()
except AttributeError:
    feature_names = vect.get_feature_names()
print("Number of features: {}\n".format(len(feature_names)))
#list() keeps the printed output a plain list whichever method was used
print("First 20 features:\n {}".format(list(feature_names[:20])))

Build the Model

In [None]:
#separate the data into features (X) and labels (Y)
X = text_dataset['text']
Y = text_dataset['sentiment']
#turn the text into a count matrix with the fitted vectorizer
X = vect.transform(X)

In [None]:
#split into training and testing data: 20% held out, fixed seed for repeatability
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each split
print("Size of x_train:", x_train.shape)
print("Size of y_train:", y_train.shape)
print("Size of x_test:", x_test.shape)
print("Size of y_test:", y_test.shape)

In [None]:
#to ignore the warnings
#NOTE(review): this silences every warning from here on, including any
#raised while the models below are fitted — confirm that is wanted
import warnings
warnings.filterwarnings('ignore')

In [None]:
#fit a logistic regression classifier and score it on the held-out split
logreg = LogisticRegression().fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
logreg_acc = accuracy_score(y_test, logreg_pred)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

Confusion matrix

In [None]:
#confusion matrix and per-class report for the logistic regression predictions
print(confusion_matrix(y_test, logreg_pred))
print("\n")
print(classification_report(y_test, logreg_pred))

In [None]:
# Switch matplotlib to the 'classic' style, then plot the confusion matrix
# with the classifier's class labels as display labels
style.use('classic')
cm = confusion_matrix(y_test, logreg_pred, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels=logreg.classes_)
disp.plot()

Hyperparameter tuning

In [None]:
#perform hyper parameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search the C parameter of LogisticRegression over five candidate values
param_grid={'C':[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid)
grid.fit(x_train, y_train)

In [None]:
# Show the parameter combination the grid search selected
print("Best parameters:", grid.best_params_)

In [None]:
#predict the test-set labels with the fitted grid search
y_pred = grid.predict(x_test)

In [None]:
#calculate the accuracy of the tuned model (overwrites the earlier logreg_acc)
logreg_acc = accuracy_score(y_pred, y_test)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

In [None]:
#updated confusion matrix and classification report for the tuned model
print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))

In [None]:
#run the model on support vector machine
#import support vector classifier
from sklearn.svm import LinearSVC

In [None]:
# Fit a linear support vector classifier on the training split
SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)

In [None]:
#predict the test-set labels with the linear SVM
svc_pred = SVCmodel.predict(x_test)
svc_acc = accuracy_score(svc_pred, y_test)
#report the accuracy as a percentage
print("test accuracy: {:.2f}%".format(svc_acc*100))

In [None]:
#confusion matrix and classification report for the linear SVM predictions
print(confusion_matrix(y_test, svc_pred))
print("\n")
print(classification_report(y_test, svc_pred))

In [None]:
#hyperparameter tuning for the svm model
#kernel, degree and gamma are parameters of SVC, not of the LinearSVC held
#in SVCmodel, so the search runs on a fresh SVC estimator
from sklearn.svm import SVC

#the grid has its own name so it is actually passed to the search instead
#of the logistic regression param_grid
svc_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ["linear", "poly", "rbf", "sigmoid"],
    'degree': [1, 3, 5, 7],
    'gamma': [0.01, 1],
}
#`grid` stays the name of the fitted search because the following cells read it
grid = GridSearchCV(SVC(), svc_param_grid)
grid.fit(x_train, y_train)

In [None]:
# Show the parameter combination the SVM grid search selected
print("Best parameter:", grid.best_params_)

In [None]:
# Predict the test-set labels with the fitted grid search
y_pred = grid.predict(x_test)

In [None]:
#calculate the accuracy of the tuned model
#NOTE(review): the result is stored in logreg_acc although `grid` was last
#fitted in the SVM tuning cell — confirm the name reuse is intended
logreg_acc = accuracy_score(y_pred, y_test)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

In [None]:
#confusion matrix and classification report for the grid-search predictions
print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))