In [83]:
# Importing required libraries and loading dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample

df = pd.read_csv("Sentiment dataset.csv")
print(df.shape)
df.head()

(732, 15)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [84]:
# Inspecting and cleaning dataset
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nDuplicated rows:", df.duplicated().sum())
df["Sentiment"].value_counts()

df.columns = df.columns.str.lower().str.replace(" ",  "_")
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  732 non-null    int64  
 1   Unnamed: 0    732 non-null    int64  
 2   Text          732 non-null    object 
 3   Sentiment     732 non-null    object 
 4   Timestamp     732 non-null    object 
 5   User          732 non-null    object 
 6   Platform      732 non-null    object 
 7   Hashtags      732 non-null    object 
 8   Retweets      732 non-null    float64
 9   Likes         732 non-null    float64
 10  Country       732 non-null    object 
 11  Year          732 non-null    int64  
 12  Month         732 non-null    int64  
 13  Day           732 non-null    int64  
 14  Hour          732 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 85.9+ KB

Missing values:
Unnamed: 0.1    0
Unnamed: 0      0
Text            0
Sentiment       0
Timestamp   

Unnamed: 0,unnamed:_0.1,unnamed:_0,text,sentiment,timestamp,user,platform,hashtags,retweets,likes,country,year,month,day,hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [85]:
# Text cleaning function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,unnamed:_0.1,unnamed:_0,text,sentiment,timestamp,user,platform,hashtags,retweets,likes,country,year,month,day,hour,clean_text
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12,enjoying beautiful day park
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8,traffic terrible morning
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15,finished amazing workout
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18,excited upcoming weekend getaway
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19,trying new recipe dinner tonight


In [86]:
# Simplifying emotion labels to general sentiment
positive_emotions = [
    "joy", "happiness", "contentment", "love", "hopeful", "inspiration",
    "gratitude", "excitement", "serenity", "satisfaction", "euphoria"
]

negative_emotions = [
    "sadness", "anger", "fear", "disgust", "grief", "betrayal",
    "heartbreak", "regret", "loneliness", "melancholy", "frustration"
]

def map_to_sentiment(label):
    l = label.strip().lower()
    if l in positive_emotions:
        return "positive"
    elif l in negative_emotions:
        return "negative"
    else:
        return "neutral"

df["sentiment_grouped"] = df["sentiment"].apply(map_to_sentiment)
df["sentiment_grouped"].value_counts()

sentiment_grouped
neutral     511
positive    166
negative     55
Name: count, dtype: int64

In [87]:
# Extracting features from text to numeric
y = df["sentiment_grouped"]

vectorizer = TfidfVectorizer(max_features = 2000)
X = vectorizer.fit_transform(df["clean_text"])

In [88]:
# Balancing classes
df_bal = pd.concat([pd.DataFrame(X.toarray()), y.reset_index(drop = True)], axis = 1)
df_bal.columns = [f'f{i}' for i in range(X.shape[1])] + ['sentiment']

# Balance classes by oversampling minority ones
majority = df_bal[df_bal.sentiment == 'neutral']
minority_pos = df_bal[df_bal.sentiment == 'positive']
minority_neg = df_bal[df_bal.sentiment == 'negative']

minority_pos_up = resample(minority_pos, replace = True, n_samples = len(majority), random_state = 42)
minority_neg_up = resample(minority_neg, replace = True, n_samples = len(majority), random_state = 42)

df_bal = pd.concat([majority, minority_pos_up, minority_neg_up]).sample(frac = 1, random_state = 42)

# Recreate X, y
X = df_bal.drop('sentiment', axis = 1)
y = df_bal['sentiment']

In [89]:
 # Splitting training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (1226, 2000) Test size: (307, 2000)


In [90]:
# Training models
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

lr = LogisticRegression(max_iter = 1000, random_state = 42)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [91]:
# Evalauting models
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n ==== {model_name} ==== ")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 3))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

evaluate_model(y_test, y_pred_nb, "Naive Bayes")
evaluate_model(y_test, y_pred_lr, "Logistics Regression")


 ==== Naive Bayes ==== 
Accuracy: 0.876

Classification Report:
               precision    recall  f1-score   support

    negative       0.90      1.00      0.95       111
     neutral       0.96      0.67      0.79       101
    positive       0.80      0.95      0.87        95

    accuracy                           0.88       307
   macro avg       0.89      0.87      0.87       307
weighted avg       0.89      0.88      0.87       307


 ==== Logistics Regression ==== 
Accuracy: 0.964

Classification Report:
               precision    recall  f1-score   support

    negative       0.97      1.00      0.99       111
     neutral       0.97      0.92      0.94       101
    positive       0.95      0.97      0.96        95

    accuracy                           0.96       307
   macro avg       0.96      0.96      0.96       307
weighted avg       0.96      0.96      0.96       307

