In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed sentiment dataset and preview its first rows.
sentiments_csv = "data/preprocessed/sentiments_preprocessed.csv"
df = pd.read_csv(sentiments_csv)
df.head()

Unnamed: 0,clean_comment,category
0,cant believe modi,0
1,karachi total blackout,0
2,couldnt done year modi year increasing unemplo...,0
3,modi talk world tallest statue talk world larg...,-1
4,major announcement modi everyone waiting game ...,-1


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
## Remapping outputs
# Shift the raw sentiment labels -1 / 0 / 1 to the class ids 0 / 1 / 2
# (0 = negative, 1 = neutral, 2 = positive).
#
# The raw labels are stashed in `category_raw` the first time this cell runs
# and the mapping is always applied to that copy. Mapping `category` onto
# itself is not idempotent: a second run would send 0 -> 1, 1 -> 2 and
# 2 -> NaN, silently corrupting the target column.
if 'category_raw' not in df.columns:
    df['category_raw'] = df['category']
df['category'] = df['category_raw'].map({-1: 0, 0: 1, 1: 2})

In [5]:
import json


def _read_params(json_path):
    """Return the parsed content of the JSON file at ``json_path``."""
    with open(json_path, "r") as handle:
        return json.load(handle)


# Best params for the models, as saved under best_params/
lor_params = _read_params("best_params/LoR.json")        # Logistic Regression
lgbm_params = _read_params("best_params/lightgbm.json")  # LightGBM


In [6]:
# TF-IDF settings: unigrams only, vocabulary capped at 9000 features
ngram_range = (1, 1)
max_features = 9000
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)

# Stratified 80/20 split of the raw text, done before the vectorizer is fitted
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the held-out texts.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

### Model

In [7]:
# Base learner 1: logistic regression configured from the loaded lor_params
lor = LogisticRegression(n_jobs=-1, **lor_params)

# Base learner 2: LightGBM multiclass GBDT configured from the loaded lgbm_params
lgbm = LGBMClassifier(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=3,
    n_jobs=-1,
    verbosity=-1,
    **lgbm_params,
)

# Stacking classifier: both base learners feed a logistic-regression
# final estimator, using 5-fold cross-validation (cv=5).
estimators = [('lor', lor), ('lgbm', lgbm)]
meta_learner = LogisticRegression(n_jobs=-1)
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)

# Train on the vectorized training split and predict the held-out split
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)



In [8]:
# Accuracy of the stacked model's predictions on the held-out split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8965344676592018

In [9]:
# Per-class precision / recall / F1 on the held-out split
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      8000
           1       0.88      0.96      0.92      7979
           2       0.92      0.86      0.89      8000

    accuracy                           0.90     23979
   macro avg       0.90      0.90      0.90     23979
weighted avg       0.90      0.90      0.90     23979



### Testing on Real Data

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Ensure the NLTK data needed below is available locally: the 'stopwords'
# corpus (read through `stopwords`) and 'wordnet' (presumably the data behind
# `WordNetLemmatizer` -- confirm against the installed NLTK version).
# The last call is the cell's final expression, so its return value is displayed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anmoljindal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/anmoljindal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
## Preprocessing step
# Define the preprocessing function
# Lazily-filled cache for the stop-word set and the lemmatizer, so they are
# built once per kernel session instead of on every preprocess_comment call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, building both on first use."""
    if not _PREPROCESS_RESOURCES:
        # Keep a few stopwords that carry sentiment (negation / contrast).
        stop_words = frozenset(
            set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}
        )
        lemmatizer = WordNetLemmatizer()
        # Publish both entries together so a failure above never leaves the
        # cache half-populated.
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_comment(comment):
    """Normalize a raw comment string for vectorization.

    Steps, in order:
      1. lowercase and strip leading/trailing whitespace;
      2. replace newline characters with spaces;
      3. delete every character that is not an ASCII letter, a digit,
         whitespace or one of ``! ? . ,`` (so emoji and apostrophes are
         dropped, e.g. "didn't" becomes "didnt");
      4. drop English stopwords, except 'not', 'but', 'however', 'no', 'yet';
      5. lemmatize each remaining whitespace-separated token.

    Tokens are split on whitespace only, so retained punctuation stays attached
    to its word (e.g. "fantastic!").

    Args:
        comment: The raw comment text (a ``str``).

    Returns:
        The cleaned comment as a single space-joined string.
    """
    # Convert to lowercase, then remove leading and trailing whitespace
    comment = comment.lower().strip()

    # Remove newline characters
    comment = re.sub(r'\n', ' ', comment)

    # Remove non-alphanumeric characters, except punctuation
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Filter stopwords and lemmatize in a single pass over the tokens
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

# Predict Sentiment
def predict_sentiment(comment):
    """Classify a raw comment and report the model's confidence.

    The comment is cleaned with ``preprocess_comment``, vectorized with the
    notebook-level fitted ``vectorizer`` and scored by the notebook-level
    fitted ``stack`` classifier.

    Args:
        comment: The raw comment text (a ``str``).

    Returns:
        A ``(sentiment, confidence)`` tuple where ``sentiment`` is one of
        "Negative", "Neutral" or "Positive" and ``confidence`` is the highest
        class probability returned by ``stack.predict_proba``.
    """
    # Preprocess and vectorize the comment
    cleaned = preprocess_comment(comment)
    comment_vectorized = vectorizer.transform([cleaned])

    # Score the comment once: both the predicted class and its confidence are
    # read from the same probability row, instead of running the whole stacked
    # ensemble twice (predict + predict_proba).
    probabilities = stack.predict_proba(comment_vectorized)[0]
    best = probabilities.argmax()
    class_id = stack.classes_[best]
    confidence = probabilities[best]

    # Class ids: 0 = negative, 1 = neutral; anything else is reported as
    # "Positive", as before.
    labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
    sentiment = labels.get(class_id, "Positive")

    return sentiment, confidence

In [16]:
import warnings
warnings.simplefilter("ignore")

In [48]:
# English sample comments (five per sentiment) for spot-checking the model.

# Positive Comments
pos1, pos2, pos3, pos4, pos5 = (
    "This video is fantastic! I learned so much. 👍",
    "Great job! Keep up the excellent work. 😊",
    "I love this content, very informative and well presented. 💯",
    "Amazing content! Keep it up. 👏",
    "This video was a joy to watch. Excellent work! 🌟",
)

# Negative Comments
neg1, neg2, neg3, neg4, neg5 = (
    "This video is terrible. I didn't learn anything. 👎",
    "I don't like the way this was explained. 😡",
    "Waste of time, very disappointing. 😞",
    "This video was frustrating to watch. 😠",
    "Completely useless content, didn't learn anything new. 😒",
)

# Neutral Comments
neu1, neu2, neu3, neu4, neu5 = (
    "This video is okay, nothing special. 🤷",
    "I have mixed feelings about this content. 😐",
    "It's an average video, could be better. 😕",
    "Some parts were good, some were not. 😶",
    "Didn't feel anything special watching this video. 😑",
)

In [71]:
# Classify one sample comment; `pred` is the (sentiment, confidence) tuple
# returned by predict_sentiment, printed with the confidence to 2 decimals.
pred = predict_sentiment(neg5)
print(f"Sentiment: {pred[0]}, Confidence: {pred[1]:.2f}")

Sentiment: Negative, Confidence: 0.55


In [41]:
# Romanized-Hindi / English mixed sample comments (five per sentiment).
# NOTE: these reuse the names pos1..neu5 from the English samples, so the
# values seen by later cells depend on which sample cell ran last.

# Positive Comments
pos1, pos2, pos3, pos4, pos5 = (
    "Yeh video bahut accha hai! Maine bahut kuch seekha. 👍",
    "Great job! Aise hi kaam karte raho. 😊",
    "Mujhe yeh content bahut pasand aaya, bahut informative aur achhe se present kiya gaya. 💯",
    "Amazing content! Keep it up. 👏",
    "Yeh video dekh kar maza aa gaya. Excellent work! 🌟",
)

# Negative Comments
neg1, neg2, neg3, neg4, neg5 = (
    "Yeh video bakwas hai. Mujhe kuch bhi nahi samajh aaya. 👎",
    "Mujhe yeh explanation bilkul pasand nahi aayi. 😡",
    "Time waste, bahut disappointing. 😞",
    "Yeh video dekh kar frustration ho gaya. 😠",
    "Bilkul bekaar content, kuch bhi naya nahi seekha. 😒",
)

# Neutral Comments
neu1, neu2, neu3, neu4, neu5 = (
    "Yeh video theek thaak hai, kuch khaas nahi. 🤷",
    "Mere mixed feelings hain is content ke baare mein. 😐",
    "Average video, aur better ho sakta tha. 😕",
    "Kuch parts achhe the, kuch nahi. 😶",
    "Yeh video dekh kar kuch khaas feel nahi aaya. 😑",
)

In [89]:
# Classify one sample comment; `pred` is the (sentiment, confidence) tuple
# returned by predict_sentiment, printed with the confidence to 2 decimals.
pred = predict_sentiment(pos5)
print(f"Sentiment: {pred[0]}, Confidence: {pred[1]:.2f}")

Sentiment: Positive, Confidence: 0.98


### Takeaways from the model
- The model is able to predict Positive sentiments in English and Hindi with high accuracy.
- The model is able to predict Negative sentiments in English and Hindi with good accuracy.
- However, the model struggles with neutral sentiments: it tends to misclassify them as either positive or negative.
- This could be due to the overall quality of the dataset.