In [1]:
# 1. load and explore dataset

import pandas as pd
import numpy as np

df = pd.read_csv('Sentiment dataset.csv')
print('read doc:\n', df.head())
print('more info:\n', df.info())
print('is missing:\n', df.isnull().sum())

read doc:
    Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! 💪          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   ChefCook        Instagram    

                                     Hashtags  Retweets 

In [2]:
print(df['Sentiment'].value_counts())

Sentiment
Positive               44
Joy                    42
Excitement             32
Happy                  14
Neutral                14
                       ..
Vibrancy                1
Culinary Adventure      1
Mesmerizing             1
Thrilling Journey       1
Winter Magic            1
Name: count, Length: 279, dtype: int64


In [3]:
#2. Preprocess Text
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
#3. creating an emotion map to group the emotions

def map_sentiment(sent):
    sent = sent.lower().strip()
    
    positive = [
        'acceptance', 'admiration', 'affection', 'anticipation', 'arousal', 
        'awe', 'contentment', 'coziness', 'creativity', 'curiosity',
        'elation', 'empowerment', 'enjoyment', 'enthusiasm', 'euphoria',
        'excitement', 'grateful', 'hopeful', 'inspiration', 'inspired',
        'joy', 'joyfulreunion', 'love', 'positive', 'proud', 'satisfaction',
        'serenity', 'spark', 'thrill', 'vibrancy', 'zest'
    ]
    
    negative = [
        'anger', 'ambivalence', 'betrayal', 'bitter', 'bitterness', 
        'bittersweet', 'boredom', 'confusion', 'desolation', 'devastated',
        'disgust', 'embarrassed', 'envious', 'fear', 'fearful', 'frustrated',
        'frustration', 'grief', 'hate', 'heartbreak', 'isolation', 
        'jealousy', 'loneliness', 'lostlove', 'melancholy', 'numbness',
        'overwhelmed', 'regret', 'sadness', 'sorrow'
    ]
    
    neutral = [
        'calmness', 'contemplation', 'reflection', 'neutral', 
        'serenity', 'solitude'
    ]
    
    if sent in positive:
        return 'Positive'
    elif sent in negative:
        return 'Negative'
    else:
        return 'Neutral'


In [5]:
df['Category'] = df['Sentiment'].apply(map_sentiment)
print(df['Category'].value_counts())

Category
Neutral     318
Positive    282
Negative    132
Name: count, dtype: int64


In [6]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ''.join(tokens)

df['clean_text'] = df['Text'].apply(preprocess_text)

In [7]:
#4. Feature and Target

X = df['clean_text']
y = df['Category']

In [13]:
#5. convert text to numbers(vectorization)

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
X_tfidf = vectorizer.fit_transform(df['clean_text'])

In [14]:
#6. split into train and test sets

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_tfidf, df['Category'], test_size=0.2, 
                                                 random_state=42, stratify=df['Category'])


In [15]:
#7. Using SMOTE to balance data
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train,y_train)

In [16]:
#8. Train Logistic regression

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_res, y_train_res)

In [17]:
#9. Evaluate model

from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        26
     Neutral       0.45      1.00      0.62        64
    Positive       1.00      0.07      0.13        57

    accuracy                           0.46       147
   macro avg       0.48      0.36      0.25       147
weighted avg       0.58      0.46      0.32       147



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
