Dataset source: https://www.kaggle.com/datasets/kashishparmar02/social-media-sentiments-analysis-dataset

In [198]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [199]:
# Load the social-media sentiment dataset; the relative path means the CSV
# must sit in the notebook's working directory.
df = pd.read_csv("sentimentAnalysis.csv")

In [None]:
# Peek at five randomly chosen rows (no random_state, so the rows differ per run).
df.sample(5)

In [None]:
# List the column names before any column is dropped.
df.columns

In [202]:
# Columns to discard in place before the text analysis.
unused_columns = [
    'Unnamed: 0.1', 'Unnamed: 0', 'Timestamp', 'User', 'Platform',
    'Hashtags', 'Retweets', 'Likes', 'Country',
    'Year', 'Month', 'Day', 'Hour',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Confirm which columns remain after the drop.
df.columns

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count rows that repeat an earlier row exactly.
df.duplicated().sum()

In [206]:
# NOTE(review): de-duplication is currently disabled (commented out), so
# duplicated rows stay in the frame for every later step.
#df = df.drop_duplicates()

In [None]:
# Re-check the number of duplicated rows.
df.duplicated().sum()

In [None]:
# Report (rows, columns) of the trimmed frame.
df.shape

In [None]:
# Fetch only the NLTK resources this notebook uses instead of the very large
# 'all' collection: tokenizer models (word_tokenize / sent_tokenize), the VADER
# lexicon (SentimentIntensityAnalyzer) and the stop-word lists.
# 'punkt_tab' is the tokenizer package newer NLTK releases look for; older
# releases simply report that the id is unknown and carry on.
for resource in ('punkt', 'punkt_tab', 'vader_lexicon', 'stopwords'):
    nltk.download(resource)

In [210]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
# Module-level VADER analyzer; classify_sentiment reads this global on every call.
analyzer = SentimentIntensityAnalyzer() 
def classify_sentiment(text, pos_threshold, neg_threshold):
    """Bucket a text into "Positive", "Negative" or "Neutral".

    The text is scored with the module-level ``analyzer`` and only the
    'compound' value of the returned scores is used.

    Args:
        text: string to score.
        pos_threshold: compound scores strictly above this are "Positive".
        neg_threshold: compound scores strictly below this are "Negative".

    Returns:
        One of the strings "Positive", "Negative" or "Neutral"; a score equal
        to either threshold falls into "Neutral".
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound > pos_threshold:
        return "Positive"
    if compound < neg_threshold:
        return "Negative"
    return "Neutral"


In [None]:
# Peek at five random rows before the labels are derived.
df.sample(5)

In [212]:
# Score every raw 'Sentiment' string with VADER and bucket it into
# Positive / Negative / Neutral; +0.05 and -0.05 are the compound-score cut-offs.
labels = [
    classify_sentiment(raw_sentiment, 0.05, -0.05)
    for raw_sentiment in df['Sentiment']
]

In [213]:
# Attach the derived three-class labels as a new 'label' column.
df['label'] = labels

In [None]:
# Peek at five random rows, now including the 'label' column.
df.sample(5)

In [None]:
# The raw 'Sentiment' strings are no longer needed once 'label' exists.
df.drop(columns=['Sentiment'], inplace=True)

# Show a few rows of the reduced frame.
df.sample(5)

In [None]:
# Replace the string classes in 'label' with integer codes; the fitted
# encoder is kept so codes can be mapped back to names later.
encoder = LabelEncoder()
encoder.fit(df['label'])
df['label'] = encoder.transform(df['label'])

# Show a few rows of the updated frame.
df.sample(5)

Label encoding produced by `LabelEncoder` (class names are numbered in sorted order):
0 = negative;
1 = neutral;
2 = positive;

In [None]:
# Print the number of texts per class, in code order 0, 1, 2.
for class_code in (0, 1, 2):
    print(df.loc[df['label'] == class_code, 'Text'].count())

In [None]:
import matplotlib.pyplot as plt

# value_counts() orders the classes by frequency, not by code, so the wedge
# names are derived from its index through the fitted encoder instead of being
# hard-coded in an assumed order.
label_counts = df['label'].value_counts()
wedge_names = encoder.inverse_transform(label_counts.index.to_numpy())
plt.pie(label_counts, labels=wedge_names, autopct='%0.2f')
plt.show()

In [219]:
# Length of each text in characters.
df['countCharacters'] = df['Text'].map(len)

# Number of tokens per text; nltk.word_tokenize turns a string into a list of tokens.
df['countWords'] = df['Text'].map(nltk.word_tokenize).map(len)

# Number of sentences per text; nltk.sent_tokenize turns a string into a list of sentences.
df['countSentences'] = df['Text'].map(nltk.sent_tokenize).map(len)

In [None]:
# Summary statistics of the three length features over all rows.
length_features = ['countCharacters', 'countWords', 'countSentences']
df[length_features].describe()

In [None]:
# Summary statistics of the length features for negative texts (label 0).
length_features = ['countCharacters', 'countWords', 'countSentences']
df.loc[df['label'] == 0, length_features].describe()

In [None]:
# Summary statistics of the length features for neutral texts (label 1).
length_features = ['countCharacters', 'countWords', 'countSentences']
df.loc[df['label'] == 1, length_features].describe()

In [None]:
# Summary statistics of the length features for positive texts (label 2).
length_features = ['countCharacters', 'countWords', 'countSentences']
df.loc[df['label'] == 2, length_features].describe()

In [None]:
# Overlay the character-count distributions of the three classes.
# Every histogram gets a legend entry and the figure a title; without them the
# colours cannot be matched to a class.
plt.figure(figsize=(10, 5))
for class_code, class_name, colour in (
    (0, "Negative", "red"),
    (1, "Neutral", "yellow"),
    (2, "Positive", "green"),
):
    sns.histplot(
        df[df['label'] == class_code]['countCharacters'],
        color=colour,
        label=class_name,
    )
plt.legend(title="label")
plt.title("Characters per text by sentiment class")
plt.show()

In [None]:
# Overlay the word-count distributions of the three classes.
# Every histogram gets a legend entry and the figure a title; without them the
# colours cannot be matched to a class.
plt.figure(figsize=(10, 5))
for class_code, class_name, colour in (
    (0, "Negative", "red"),
    (1, "Neutral", "yellow"),
    (2, "Positive", "green"),
):
    sns.histplot(
        df[df['label'] == class_code]['countWords'],
        color=colour,
        label=class_name,
    )
plt.legend(title="label")
plt.title("Words per text by sentiment class")
plt.show()

In [None]:
# Make sure the stop-word lists read by transform_text are available locally.
nltk.download('stopwords')

In [227]:
from nltk.corpus import stopwords
#stopwords = stopwords.words("English")
# Lazily filled cache so the stop-word corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def transform_text(text):
    """Normalise a raw text into a space-joined string of stemmed tokens.

    Steps: lower-case, tokenize with nltk.word_tokenize, keep only purely
    alphanumeric tokens, drop English stop words, then Porter-stem the rest.

    Args:
        text: the string to clean.

    Returns:
        The surviving stemmed tokens joined by single spaces
        ("" when no token survives).
    """
    # Looked up once per call: stopwords.words() re-reads the corpus every time
    # it is invoked, so calling it inside the token loop was needlessly slow.
    stop_words = _english_stopwords()
    stemmer = PorterStemmer()

    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects every punctuation token, so the former
    # `not in string.punctuation` test could never filter anything further.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(stemmer.stem(token) for token in kept)

In [None]:
# Quick sanity check of the cleaning function on a sample sentence.
print(transform_text("Hello world this is me typing"))

In [None]:
# Store the cleaned, stemmed version of every text in a new 'processed' column.
processed_text = df['Text'].map(transform_text)
df['processed'] = processed_text

# Show the first rows including the new column.
df.head()

In [230]:
# Shared word-cloud generator: 500x500 canvas, white background, smallest font size 10.
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')

In [None]:
# Word cloud built from all processed positive texts (label 2).
positive_corpus = df.loc[df['label'] == 2, 'processed'].str.cat(sep=" ")
pos = wc.generate(positive_corpus)

# Render the cloud on its own figure.
plt.figure(figsize=(12, 6))
plt.imshow(pos)

In [None]:
# Word cloud built from all processed neutral texts (label 1).
neutral_corpus = df.loc[df['label'] == 1, 'processed'].str.cat(sep=" ")
neutral = wc.generate(neutral_corpus)

# Render the cloud on its own figure.
plt.figure(figsize=(12, 6))
plt.imshow(neutral)

In [None]:
# Word cloud built from all processed negative texts (label 0).
negative_corpus = df.loc[df['label'] == 0, 'processed'].str.cat(sep=" ")
neg = wc.generate(negative_corpus)

# Render the cloud on its own figure.
plt.figure(figsize=(12, 6))
plt.imshow(neg)

Building the model

In [234]:
# Bag-of-words vectorizer: converts a collection of texts into a matrix of token counts.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the whole 'processed' column and build a dense count matrix.
# NOTE(review): this fit runs before the train/test split, so words that occur
# only in rows later assigned to the test set still shape the feature space.
X = cv.fit_transform(df['processed']).toarray()

# Size of the dense matrix X.
X.shape

In [None]:
# Target vector: the integer-encoded 'label' column.
y = df['label'].values
y.shape

In [237]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 49)

In [238]:
# One default-configured instance of each Naive Bayes variant to compare.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Fit GaussianNB on the training split and score it on the held-out split.
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred1))                    # overall accuracy
print(confusion_matrix(y_test, y_pred1))                  # rows: true class, columns: predicted class
print(precision_score(y_test, y_pred1, average='macro'))  # unweighted mean of per-class precision

In [None]:
# Fit MultinomialNB on the training split and score it on the held-out split.
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred2))                    # overall accuracy
print(confusion_matrix(y_test, y_pred2))                  # rows: true class, columns: predicted class
print(precision_score(y_test, y_pred2, average='macro'))  # unweighted mean of per-class precision

In [None]:
# Fit BernoulliNB on the training split and score it on the held-out split.
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred3))                    # overall accuracy
print(confusion_matrix(y_test, y_pred3))                  # rows: true class, columns: predicted class
print(precision_score(y_test, y_pred3, average='macro'))  # unweighted mean of per-class precision

In [242]:
# TF-IDF features, fitted on the training texts only.
# Fitting the vectorizer on the full corpus before splitting would let the
# held-out rows influence the vocabulary and the IDF weights (data leakage).
tf = TfidfVectorizer()

# Target vector: the integer-encoded 'label' column.
y = df['label'].values

# Split the processed texts first. Size and seed match the earlier split, so
# the same rows land in each partition.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed'], y, test_size=0.2, random_state=49
)

# Learn vocabulary and IDF from the training texts, then reuse them for the test texts.
X_train = tf.fit_transform(text_train).toarray()
X_test = tf.transform(text_test).toarray()

# Whole corpus in the train-fitted feature space, kept under the original name.
X = tf.transform(df['processed']).toarray()

In [None]:
# Refit GaussianNB on the current training split and score it on the held-out split.
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred1))                    # overall accuracy
print(confusion_matrix(y_test, y_pred1))                  # rows: true class, columns: predicted class
print(precision_score(y_test, y_pred1, average='macro'))  # unweighted mean of per-class precision

In [None]:
# Refit MultinomialNB on the current training split and score it on the held-out split.
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred2))                    # overall accuracy
print(confusion_matrix(y_test, y_pred2))                  # rows: true class, columns: predicted class
print(precision_score(y_test, y_pred2, average='macro'))  # unweighted mean of per-class precision

In [None]:
# Refit BernoulliNB on the current training split and score it on the held-out split.
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred3))                    # overall accuracy
print(confusion_matrix(y_test, y_pred3))                  # rows: true class, columns: predicted class
print(precision_score(y_test, y_pred3, average='macro'))  # unweighted mean of per-class precision

In [246]:
from sklearn import svm
# Support-vector classifier created with the library's default settings.
svm1 = svm.SVC()

In [None]:
# Fit the support-vector classifier on the training split and score it on the held-out split.
y_pred4 = svm1.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred4))                    # overall accuracy
print(confusion_matrix(y_test, y_pred4))                  # rows: true class, columns: predicted class
print(precision_score(y_test, y_pred4, average='macro'))  # unweighted mean of per-class precision

In [248]:
# Persist two artifacts: the fitted TF-IDF vectorizer and the GaussianNB model.
# Each file is opened in a `with` block so it is flushed and closed as soon as
# the dump finishes; a bare open() inside the call leaves the handle open.
# NOTE(review): GaussianNB is saved regardless of the scores printed earlier —
# confirm it is the model intended for deployment.
import pickle

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tf, vectorizer_file)

with open('model.pkl', 'wb') as model_file:
    pickle.dump(gnb, model_file)