In [1]:
# Import the libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# import nltk 
# nltk.download('wordnet')

In [2]:
# Read the data 
df = pd.read_csv('sentiment.csv')

# Check first 5 rows
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
#size of data 
df.shape

(31962, 3)

In [4]:
#Total no. of classes in target value
df['label'].value_counts()

label
0    29720
1     2242
Name: count, dtype: int64

In [5]:
#Understanding the sentiments in the data
df[df['label']==0].head(10) 

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [6]:
df[df['label']==1].head(10)

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...
17,18,1,retweet if you agree!
23,24,1,@user @user lumpy says i am a . prove it lumpy.
34,35,1,it's unbelievable that in the 21st century we'...
56,57,1,@user lets fight against #love #peace
68,69,1,ð©the white establishment can't have blk fol...
77,78,1,"@user hey, white people: you can call people '..."
82,83,1,how the #altright uses &amp; insecurity to lu...
111,112,1,@user i'm not interested in a #linguistics tha...


In [7]:
df.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
# Remove 'id' column
df.drop(['id'], axis=1, inplace=True)
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [9]:
# Data preprocessing

import string

# Function to remove punctuation
def clean(text):
    remv_pun = [char for char in text.lower() if char not in string.punctuation]
    remv_punc_join = ''.join(remv_pun)
    return remv_punc_join

In [10]:
clean(' @ Great beginning,,, takes! time,,,.   #run')

'  great beginning takes time   run'

In [11]:
# Apply the 'clean' function to the 'tweet' column
df['cleaned_tweet'] = df['tweet'].apply(clean)
tweets_df_clean = df['tweet'].apply(clean)

In [12]:
tweets_df_clean.head()

0     user when a father is dysfunctional and is so...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model   i love u take with u all the time in u...
4                 factsguide society now    motivation
Name: tweet, dtype: object

In [13]:
# Remove stopwords
import nltk 
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[:5])

['i', 'me', 'my', 'myself', 'we']


In [14]:
# stopwords treatment and converting the data into lower case 
def stop(text):
    remv_stop = [a for a in text.split() if a.lower() not in stopwords]
    remv_stop_join = ' '.join(remv_stop)
    return remv_stop_join

In [15]:
# Apply the 'stop' function to the 'cleaned_tweet' column
df['stopwords_removed'] = df['cleaned_tweet'].apply(stop)
tweets_df = pd.DataFrame(tweets_df_clean)
tweets_df_stopwords = tweets_df['tweet'].apply(stop)
tweets_df_stopwords = pd.DataFrame(tweets_df_stopwords)
tweets_df_stopwords

Unnamed: 0,tweet
0,user father dysfunctional selfish drags kids d...
1,user user thanks lyft credit cant use cause do...
2,bihday majesty
3,model love u take u time urð± ðððð...
4,factsguide society motivation
...,...
31957,ate user isz youuuððððððð...
31958,see nina turner airwaves trying wrap mantle ge...
31959,listening sad songs monday morning otw work sad
31960,user sikh temple vandalised calgary wso condem...


In [16]:
# Stemming
st = PorterStemmer()
def steming(text):
    ste = [st.stem(word) for word in text.split()]
    ste_join = ' '.join(ste)
    return ste_join

# Apply the 'steming' function to the 'stopwords_removed' column
df['stemmed_tweet'] = df['stopwords_removed'].apply(steming)

In [17]:
# Lemmatization
wl = WordNetLemmatizer()
def lematize(text):
    lem = [wl.lemmatize(word) for word in text.split()]
    lem_join = ' '.join(lem)
    return lem_join

# Apply the 'lematize' function to the 'stemmed_tweet' column
df['lemmatized_tweet'] = df['stemmed_tweet'].apply(lematize)

In [18]:
lematize('Dog keepss on barkings')

'Dog keep on barkings'

In [19]:
# Feature extraction using CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(df['lemmatized_tweet']).toarray()

# Target variable
y = df['label']

In [20]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=25)

# Initialize and train the Naive Bayes classifier
NaiveBclassifier = MultinomialNB()
NaiveBclassifier.fit(X_train, y_train)

In [21]:
X_train.shape
y_train.shape

(21414,)

In [22]:
# Make predictions on the training set
y_pred_train = NaiveBclassifier.predict(X_train)

In [23]:
# Accuracy Score 
acc_train = accuracy_score(y_train, y_pred_train)
print("Accuracy Score: ", acc_train)

Accuracy Score:  0.962127580087793


In [24]:
# Generate classification report for the test set
y_pred_test = NaiveBclassifier.predict(X_test)
report = classification_report(y_test, y_pred_test)

# Print the classification report
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      9806
           1       0.61      0.69      0.65       742

    accuracy                           0.95     10548
   macro avg       0.79      0.83      0.81     10548
weighted avg       0.95      0.95      0.95     10548

