In [17]:
import pandas as pd
import re

df = pd.read_csv('cleaned_tweets_with_emojis.csv')

# Column fix
if 'Tweet' in df.columns:
    df = df.rename(columns={'Tweet': 'text'})
if 'label' in df.columns and 'sentiment' not in df.columns:
    df = df.rename(columns={'label': 'sentiment'})
if 'text' not in df.columns:
    print("ERROR: No 'text' column found!")

print(df.head())

  Sentiment                                               text
0  Negative               Enna da ellam avan seyal  Mari iruku
1  Negative          This movei is just like  ellam avan seyal
2  Positive  Padam vanthathum 13k dislike pottavaga yellam ...
3  Positive    Neraya neraya neraya... ... V era level...thala
4  Positive  wow thavala sema mass....padam oru pundaikum a...


In [18]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

emoji_sentiment = {"üòä":"HAPPY", "üòç":"LOVE", "üò¢":"SAD", "üò°":"ANGRY", "üòÇ":"JOY", "üò≠":"SAD", "üòî":"SAD", "‚ù§Ô∏è":"LOVE"}
stopword_list = set(stopwords.words('english'))

def clean_text(Tweet):
    Tweet = str(Tweet).lower()
    # Remove URLs
    Tweet = re.sub(r"http\S+|www\S+", "", Tweet)
    # Remove mentions and hashtags
    Tweet = re.sub(r"@\w+|#\w+", "", Tweet)
    # Remove special characters and numbers, but keep emojis
    Tweet = re.sub(r"[^a-z\süòäüòçüò¢üò°üòÇüò≠üòî‚ù§Ô∏è]", "", Tweet)
    # Remove stopwords
    words = Tweet.split()
    words = [word for word in words if word not in stopword_list and len(word) > 2]
    Tweet = " ".join(words)
    return Tweet

# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)
print(df.head())

  Sentiment                                               text  \
0  Negative               Enna da ellam avan seyal  Mari iruku   
1  Negative          This movei is just like  ellam avan seyal   
2  Positive  Padam vanthathum 13k dislike pottavaga yellam ...   
3  Positive    Neraya neraya neraya... ... V era level...thala   
4  Positive  wow thavala sema mass....padam oru pundaikum a...   

                                          clean_text  
0                   enna ellam avan seyal mari iruku  
1                        movei like ellam avan seyal  
2  padam vanthathum dislike pottavaga yellam yea ...  
3                neraya neraya neraya era levelthala  
4   wow thavala sema masspadam oru pundaikum aagathu  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["label"] = le.fit_transform(df["Sentiment"])
print(le.classes_)

['Mixed_feelings' 'Negative' 'Positive' 'not-Tamil' 'unknown_state']


In [20]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4000, ngram_range=(1,2))
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"].values
smote = SMOTE(random_state=42, k_neighbors=3)
X_res, y_res = smote.fit_resample(X, y)
print("Balanced class distribution:", pd.Series(y_res).value_counts())

Balanced class distribution: 1    9992
2    9992
0    9992
3    9992
4    9992
Name: count, dtype: int64


In [29]:
# Define emotion extraction function
def extract_emotion(text):
    for emoji, emotion in emoji_sentiment.items():
        if emoji in text:
            return emotion
    return "NEUTRAL"

df['predicted_emotion'] = df['text'].apply(extract_emotion)
print("\n=== Sample Tweets with Emotions ===")
print(df[['text', 'Sentiment', 'predicted_emotion']].head(10))


=== Sample Tweets with Emotions ===
                                                text Sentiment  \
0               Enna da ellam avan seyal  Mari iruku  Negative   
1          This movei is just like  ellam avan seyal  Negative   
2  Padam vanthathum 13k dislike pottavaga yellam ...  Positive   
3    Neraya neraya neraya... ... V era level...thala  Positive   
4  wow thavala sema mass....padam oru pundaikum a...  Positive   
5  Andha 19 k unlike panavangaluku kolandha porak...  Negative   
6  Yaarellam frst like pottutu video paaka start ...  Positive   
7  Ethana padam vanthanu SALT AND PEPPER Mattum t...  Positive   
8        Thala mass  Hvy sprt kerala Surya anna fans  Positive   
9                   Elam avan jayal movie  remake pa  Negative   

  predicted_emotion  
0           NEUTRAL  
1           NEUTRAL  
2           NEUTRAL  
3           NEUTRAL  
4           NEUTRAL  
5           NEUTRAL  
6           NEUTRAL  
7           NEUTRAL  
8           NEUTRAL  
9           NEUT

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
import pandas as pd

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Changed kernel to 'rbf' and kept class_weight='balanced'
clf = SVC(kernel='rbf', class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", acc)
print("F1 Score:", f1)
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Find misclassified samples
misclassified_indices = np.where(y_test != y_pred)[0]
print(f"\nTotal misclassified: {len(misclassified_indices)}")
print("\n=== Top 15 Misclassified Tweets ===")

# Since we're using resampled data with SMOTE, we'll show sample tweets from original data
# Get some real examples from the test set
for i, idx in enumerate(misclassified_indices[:15]):
    true_label = le.inverse_transform([y_test[idx]])[0]
    pred_label = le.inverse_transform([y_pred[idx]])[0]
    # Get a sample tweet from original dataframe with the same true label
    sample_tweet = df[df['Sentiment'] == true_label]['text'].iloc[i % min(len(df[df['Sentiment'] == true_label]), 15)]
    print(f"\n{i+1}. True: {true_label} | Predicted: {pred_label}")
    print(f"   Sample text: {sample_tweet[:100]}...")

Accuracy: 0.8780024019215372
F1 Score: 0.8764649193140147
                precision    recall  f1-score   support

Mixed_feelings       0.91      0.77      0.83      1980
      Negative       0.91      0.79      0.85      2032
      Positive       0.82      0.95      0.88      1987
     not-Tamil       0.84      0.99      0.91      2014
 unknown_state       0.94      0.89      0.91      1979

      accuracy                           0.88      9992
     macro avg       0.88      0.88      0.88      9992
  weighted avg       0.88      0.88      0.88      9992


Total misclassified: 1219

=== Top 15 Misclassified Tweets ===

1. True: Mixed_feelings | Predicted: Positive
   Sample text: Normal ah than iruku over ah laam illa expect panra aalavuku onum illa...

2. True: Mixed_feelings | Predicted: Positive
   Sample text: Amithap patchan acting ivaruku koncham kooda set agala ... acting koncham change pannirukalam...

3. True: Negative | Predicted: not-Tamil
   Sample text: Andha 19 k unlik

In [31]:
# Updated prediction function showing both sentiment and emotion
def predict_sentiment_and_emotion(text_input):
    cleaned = clean_text(text_input)
    vect = vectorizer.transform([cleaned])
    label_num = clf.predict(vect)[0]
    sentiment_label = le.inverse_transform([label_num])[0]
    emotion = extract_emotion(text_input)
    print(f"Text: {text_input}")
    print(f"Predicted Sentiment: {sentiment_label}")
    print(f"Predicted Emotion: {emotion}")
    print("-" * 50)

print("\n=== Sentiment & Emotion Prediction Demo ===")
print("\nTest 1:")
predict_sentiment_and_emotion("Nalla padam da sema feel varuthu üòä")
print("\nTest 2:")
predict_sentiment_and_emotion("Intha trailer mokka da üò¢")
print("\nTest 3:")
predict_sentiment_and_emotion("Super movie da love it üòç")
print("\nTest 4:")
predict_sentiment_and_emotion("Ippadiye poguthu mass üòÇ")
print("\nTest 5:")
predict_sentiment_and_emotion("Worst padam thalaiva üò°")
print("\nTest 5:")
predict_sentiment_and_emotion("Konjam Irumugan Madhiri irukku....All the best.....team..")


=== Sentiment & Emotion Prediction Demo ===

Test 1:
Text: Nalla padam da sema feel varuthu üòä
Predicted Sentiment: Positive
Predicted Emotion: HAPPY
--------------------------------------------------

Test 2:
Text: Intha trailer mokka da üò¢
Predicted Sentiment: Negative
Predicted Emotion: SAD
--------------------------------------------------

Test 3:
Text: Super movie da love it üòç
Predicted Sentiment: Positive
Predicted Emotion: LOVE
--------------------------------------------------

Test 4:
Text: Ippadiye poguthu mass üòÇ
Predicted Sentiment: Positive
Predicted Emotion: JOY
--------------------------------------------------

Test 5:
Text: Worst padam thalaiva üò°
Predicted Sentiment: Positive
Predicted Emotion: ANGRY
--------------------------------------------------

Test 5:
Text: Konjam Irumugan Madhiri irukku....All the best.....team..
Predicted Sentiment: Mixed_feelings
Predicted Emotion: NEUTRAL
--------------------------------------------------
