In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [4]:
import pandas as pd

df = pd.read_csv("sentiment_analysis_dataset.csv",
                 sep=",",                 # change if your file uses ; or \t
                 engine="python",         # more tolerant parser
                 dtype=str,               # read everything as text to avoid type errors
                 on_bad_lines="skip")     # skip broken lines (or "warn" to see them)


In [5]:
df.head()

Unnamed: 0,Comment,Sentiment
0,Achieving million views in days is dangerous,Positive
1,How many people here want to participate in su...,Neutral
2,Mrbeast is slowly turning into mrjigsaw,Negative
3,genuinely can't believe how dystopian this is,Negative
4,Have of the worlds smartest people compete in ...,Neutral


In [6]:
df.isnull().sum()

Comment      5
Sentiment    0
dtype: int64

In [7]:
df.shape

(6739, 2)

In [8]:
df = df.dropna()


In [9]:
df.isnull().sum()

Comment      0
Sentiment    0
dtype: int64

In [10]:
df.shape

(6734, 2)

In [11]:
df['Sentiment'].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [12]:
df['Sentiment'].value_counts()

Sentiment
Positive    4679
Neutral     1976
Negative      79
Name: count, dtype: int64

In [13]:
unoque_sentiment = df['Sentiment'].unique()
sentiment_num = {}
i=0
for sen in unoque_sentiment:
    sentiment_num[sen]=i
    i+=1

df['Sentiment'] = df['Sentiment'].map(sentiment_num)

In [14]:
df.head()

Unnamed: 0,Comment,Sentiment
0,Achieving million views in days is dangerous,0
1,How many people here want to participate in su...,1
2,Mrbeast is slowly turning into mrjigsaw,2
3,genuinely can't believe how dystopian this is,2
4,Have of the worlds smartest people compete in ...,1


In [15]:
df['Comment'] =  df['Comment'].apply(lambda x: x.lower())

In [16]:
import string
def remove_punctuation(comm):
    return comm.translate(str.maketrans("","", string.punctuation))

In [17]:
df['Comment'].apply(remove_punctuation)

0            achieving million views in days is dangerous
1       how many people here want to participate in su...
2                 mrbeast is slowly turning into mrjigsaw
3            genuinely cant believe how dystopian this is
4       have of the worlds smartest people compete in ...
                              ...                        
6734                   congrats man youve come a long way
6735    “when i’ll get older i’ll be stronger” you pro...
6736    this has been awesome and heart felt video so ...
6737                                        it is amazing
6738                                 yeah congrats on mil
Name: Comment, Length: 6734, dtype: object

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Comment'], df['Sentiment'], test_size=0.25, random_state=42)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)


nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)


pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_bow))

0.9305225653206651


In [21]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [22]:
y_pred = nb2_model.predict(X_test_tfidf)

In [23]:
print(accuracy_score(y_test, y_pred))

0.9091448931116389


In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
logistic_model = LogisticRegression(max_iter=1000)

In [26]:
logistic_model.fit(X_train_tfidf,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [27]:
log_pred = logistic_model.predict(X_test_tfidf)

In [28]:
print(accuracy_score(y_test,log_pred ))

0.9590261282660333


In [29]:
import joblib
# Assuming lr_model and nb_model are your trained models
joblib.dump(logistic_model, 'lr_model.pkl')
joblib.dump(nb_model, 'nb_model.pkl')
# Assuming vectorizer is your fitted TfidfVectorizer or CountVectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']