#### 📥 Step 1: Load & Preprocess Data

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 🗂️ Step 2: Loading Datasets

### 📌 Sentiment Analysis Datasets
1️⃣ **Sentiment Dataset** → *Twitter Sentiment Dataset.csv*  
2️⃣ **Hate Speech Dataset** → *Twitter Sentiment Analysis (Hatred Speech) Training Dataset.csv*  
3️⃣ **Emotion Classification Dataset** → *Twitter Emotion Classification Training Dataset.csv*  
4️⃣ **Customer Support Dataset** → *Customer Support on Twitter Dataset.csv* (currently disabled: its entry in `dataset_files` is commented out)  


In [None]:
# Define the common directory path
# NOTE(review): the "Selected " folder name ends with a space before the slash, and the
# recorded output of the loading cell shows "No such file or directory" for every file
# under this path — confirm the directory name and location on the mounted Drive.
base_dir = "/content/drive/MyDrive/MTech/Sem 2/Mini Project/Mini Project All/Datasets/Selected /Training datasets/"

# Define dataset file names
# Keys are the display names used to index `datasets`; values are the CSV file
# names that the loading cell appends to base_dir.
dataset_files = {
    "Sentiment Dataset": "Twitter Sentiment Dataset.csv",
    "Hate Speech Dataset": "Twitter Sentiment Analysis (Hatred Speech) Training Dataset.csv",
    "Emotion Classification Dataset": "Twitter Emotion Classification Training Dataset.csv",
    # "Customer Support Dataset": "Customer_Support_Updated.csv"
}


In [None]:
# Load datasets into a dictionary
# Each CSV is read independently so that one unreadable file does not stop the
# others from loading; failures are collected and summarised at the end because
# later cells index `datasets` by name and would otherwise fail with a bare KeyError.
import os

datasets = {}
failed_datasets = []
for name, file in dataset_files.items():
    # os.path.join works whether or not base_dir ends with a path separator.
    file_path = os.path.join(base_dir, file)
    try:
        datasets[name] = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # Only file-access, decoding and parsing problems are treated as "could not
        # load"; anything else is a programming error and should surface normally.
        failed_datasets.append(name)
        print(f"❌ Error loading {name}: {e}")
    else:
        print(f"✅ Loaded: {name}")

if failed_datasets:
    print(
        f"⚠️ {len(failed_datasets)} of {len(dataset_files)} datasets failed to load: "
        f"{failed_datasets}. Cells that access datasets[...] by these names will raise KeyError."
    )

❌ Error loading Sentiment Dataset: [Errno 2] No such file or directory: '/content/drive/MyDrive/MTech/Sem 2/Mini Project/Mini Project All/Datasets/Selected /Training datasets/Twitter Sentiment Dataset.csv'
❌ Error loading Hate Speech Dataset: [Errno 2] No such file or directory: '/content/drive/MyDrive/MTech/Sem 2/Mini Project/Mini Project All/Datasets/Selected /Training datasets/Twitter Sentiment Analysis (Hatred Speech) Training Dataset.csv'
❌ Error loading Emotion Classification Dataset: [Errno 2] No such file or directory: '/content/drive/MyDrive/MTech/Sem 2/Mini Project/Mini Project All/Datasets/Selected /Training datasets/Twitter Emotion Classification Training Dataset.csv'


# Display the datasets

In [None]:
# Preview the first rows of every loaded dataset, one banner-separated section each.
for name, df in datasets.items():
    print(f"\n🔹 Dataset: {name}", df.head(), "=" * 80, sep="\n")

# Renaming columns for uniformity

In [None]:
# Rename label/category column
datasets["Sentiment Dataset"].rename(columns={"clean_text": "text", "category": "sentiment_label"}, inplace=True)
datasets["Hate Speech Dataset"].rename(columns={"tweet": "text", "label": "hate_speech_label"}, inplace=True)
datasets["Emotion Classification Dataset"].rename(columns={"label": "emotion_label"}, inplace=True)


KeyError: 'Sentiment Dataset'

In [None]:
# Report the (rows, columns) shape of every loaded dataset.
for name, df in datasets.items():
    print(f"\n🔹 Dataset: {name}", df.shape, "=" * 80, sep="\n")

In [None]:
# Preview the first rows of every dataset again, now with the renamed columns.
for name, df in datasets.items():
    print(f"\n🔹 Dataset: {name}", df.head(), "=" * 80, sep="\n")

In [None]:
# List the column names of every loaded dataset.
for name, df in datasets.items():
    print(f"\n🔹 Dataset: {name}", df.columns, "=" * 80, sep="\n")


🔹 Dataset: Sentiment Dataset
Index(['text', 'sentiment_label'], dtype='object')

🔹 Dataset: Hate Speech Dataset
Index(['id', 'hate_speech_label', 'text'], dtype='object')

🔹 Dataset: Emotion Classification Dataset
Index(['text', 'emotion_label'], dtype='object')


# Preprocessing the datasets before use

In [None]:
# Preprocessing function
from functools import lru_cache

# Patterns are compiled once here rather than being looked up on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, built on first use."""
    # Built lazily so that defining these functions does not require the
    # stopwords corpus to be downloaded yet.
    return frozenset(stopwords.words('english'))


def preprocess_text(text, stop_words=None, tokenizer=None):
    """Normalise one piece of text for bag-of-words style models.

    Steps, in order: lowercase, strip URLs (``http...`` / ``www...``), drop every
    character that is not an ASCII letter or whitespace, tokenise, remove stop
    words, and re-join the remaining tokens with single spaces.

    Args:
        text: The raw text. Missing values (``None``/NaN) yield ``""``; any
            other non-string value is converted with ``str()`` first.
        stop_words: Optional collection of tokens to remove. Defaults to
            NLTK's English stop-word list.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.tokenize.word_tokenize``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = _URL_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)

    # Previously stopwords.words('english') was evaluated once per token and
    # searched as a list; a cached frozenset gives the same result with one
    # load and constant-time membership tests.
    if stop_words is None:
        stop_words = _english_stopwords()
    tokenize = word_tokenize if tokenizer is None else tokenizer
    return " ".join(word for word in tokenize(text) if word not in stop_words)

# Apply preprocessing
def apply_preprocessing(df, label_col):
    """Return a copy of *df* with cleaned text and integer-encoded labels.

    The ``text`` column is passed through ``preprocess_text`` and ``label_col``
    is replaced by the integer codes produced by a fresh ``LabelEncoder``.

    Args:
        df: DataFrame containing a ``text`` column and ``label_col``.
        label_col: Name of the column holding the class labels.

    Returns:
        A new DataFrame; the frame passed in is left unchanged.

    Raises:
        KeyError: If ``text`` or ``label_col`` is not a column of *df*.
    """
    text_col = 'text'
    if text_col not in df.columns or label_col not in df.columns:
        raise KeyError(f"Columns {text_col} or {label_col} not found in DataFrame")

    # Work on a copy so the caller's DataFrame is not overwritten; otherwise
    # calling this twice on the same frame re-cleans already cleaned text and
    # leaves the caller's label column encoded.
    df = df.copy()

    # Missing values must become "" before any string conversion: astype(str)
    # would turn NaN into the literal token "nan", which then survives cleaning.
    df[text_col] = df[text_col].apply(
        lambda value: "" if pd.isnull(value) else preprocess_text(str(value))
    )
    df[label_col] = LabelEncoder().fit_transform(df[label_col])
    return df

In [None]:
  # Fetch the NLTK resources used by preprocess_text: the punkt_tab tokenizer
  # data and the stopwords corpus.
  import nltk
  nltk.download('punkt_tab')
  nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Clean the text and integer-encode `sentiment_label` for the Sentiment Dataset.
datasets["Sentiment Dataset"] = apply_preprocessing(
    df=datasets["Sentiment Dataset"],
    label_col='sentiment_label',
)


In [None]:
# Clean the text and integer-encode `hate_speech_label` for the Hate Speech Dataset.
datasets["Hate Speech Dataset"] = apply_preprocessing(
    df=datasets["Hate Speech Dataset"],
    label_col='hate_speech_label',
)


In [None]:
# Clean the text and integer-encode `emotion_label` for the Emotion Classification Dataset.
datasets["Emotion Classification Dataset"] = apply_preprocessing(
    df=datasets["Emotion Classification Dataset"],
    label_col='emotion_label',
)


In [None]:
# # Preprocess each dataset separately
# datasets["Customer Support Dataset"] = datasets["Customer Support Dataset"].rename(columns={"sentiment": "label"})
# datasets["Customer Support Dataset"] = apply_preprocessing(datasets["Customer Support Dataset"], text_col='text', label_col='label')


# Show the preprocessed datasets

In [None]:
# Drop the `id` column, which none of the cells shown here use.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because `id` has already been removed.
datasets["Hate Speech Dataset"] = datasets["Hate Speech Dataset"].drop(columns=["id"], errors="ignore")

In [None]:
# Display the Sentiment Dataset as it stands after the preprocessing cells above.
datasets["Sentiment Dataset"]

Unnamed: 0,text,sentiment_label
0,modi promised minimum government maximum gover...,0
1,talk nonsense continue drama vote modi,1
2,say vote modi welcome bjp told rahul main camp...,2
3,asking supporters prefix chowkidar names modi ...,2
4,answer among powerful world leader today trump...,2
...,...,...
162975,crores paid neerav modi recovered congress lea...,0
162976,dear rss terrorist payal gawar modi killing pl...,0
162977,cover interaction forum left,1
162978,big project came india modi dream project happ...,1


In [None]:
# Display the Emotion Classification Dataset as it stands after the preprocessing cells above.
datasets["Emotion Classification Dataset"]

Unnamed: 0,text,emotion_label
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,3
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,3
...,...,...
15995,brief time beanbag said anna feel like beaten,0
15996,turning feel pathetic still waiting tables sub...,0
15997,feel strong good overall,1
15998,feel like rude comment im glad,3


#### 💊 Step 3: Train Models on Each Dataset Separately


In [None]:
# Sentiment Dataset
# TF-IDF features + logistic regression on `sentiment_label`.
print("\n🔹 Training on Sentiment Dataset...")
y_sentiment = datasets["Sentiment Dataset"]['sentiment_label']

# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only. Fitting the vectorizer on every row before the
# split leaks test-set statistics into the features and inflates the scores.
text_train_s, text_test_s, y_train_s, y_test_s = train_test_split(
    datasets["Sentiment Dataset"]['text'], y_sentiment, test_size=0.2, random_state=42
)
vectorizer_sentiment = TfidfVectorizer(max_features=5000)
X_train_s = vectorizer_sentiment.fit_transform(text_train_s)
X_test_s = vectorizer_sentiment.transform(text_test_s)
# Full-corpus matrix kept under its original name in case other cells use it;
# it is produced with the train-fitted vectorizer, so it carries no leakage.
X_sentiment = vectorizer_sentiment.transform(datasets["Sentiment Dataset"]['text'])

model_s = LogisticRegression(max_iter=200)
model_s.fit(X_train_s, y_train_s)
y_pred_s = model_s.predict(X_test_s)
print("Accuracy:", accuracy_score(y_test_s, y_pred_s))
print(classification_report(y_test_s, y_pred_s))



🔹 Training on Sentiment Dataset...
Accuracy: 0.8926862191679961
              precision    recall  f1-score   support

           0       0.88      0.78      0.83      7179
           1       0.87      0.97      0.92     11034
           2       0.92      0.89      0.90     14383

    accuracy                           0.89     32596
   macro avg       0.89      0.88      0.88     32596
weighted avg       0.89      0.89      0.89     32596



In [None]:

# Hate Speech Dataset
# TF-IDF features + multinomial naive Bayes on `hate_speech_label`.
print("\n🔹 Training on Hate Speech Dataset...")
y_hate = datasets["Hate Speech Dataset"]['hate_speech_label']

# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only. Fitting the vectorizer on every row before the
# split leaks test-set statistics into the features and inflates the scores.
text_train_h, text_test_h, y_train_h, y_test_h = train_test_split(
    datasets["Hate Speech Dataset"]['text'], y_hate, test_size=0.2, random_state=42
)
vectorizer_hate = TfidfVectorizer(max_features=5000)
X_train_h = vectorizer_hate.fit_transform(text_train_h)
X_test_h = vectorizer_hate.transform(text_test_h)
# Full-corpus matrix kept under its original name in case other cells use it;
# it is produced with the train-fitted vectorizer, so it carries no leakage.
X_hate = vectorizer_hate.transform(datasets["Hate Speech Dataset"]['text'])

model_h = MultinomialNB()
model_h.fit(X_train_h, y_train_h)
y_pred_h = model_h.predict(X_test_h)
print("Accuracy:", accuracy_score(y_test_h, y_pred_h))
print(classification_report(y_test_h, y_pred_h))





🔹 Training on Hate Speech Dataset...
Accuracy: 0.9510402002189895
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.91      0.35      0.50       456

    accuracy                           0.95      6393
   macro avg       0.93      0.67      0.74      6393
weighted avg       0.95      0.95      0.94      6393



In [None]:
# Emotion Classification Dataset
# TF-IDF features + support vector classifier on `emotion_label`.
print("\n🔹 Training on Emotion Classification Dataset...")
y_emotion = datasets["Emotion Classification Dataset"]['emotion_label']

# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training rows only. Fitting the vectorizer on every row before the
# split leaks test-set statistics into the features and inflates the scores.
text_train_e, text_test_e, y_train_e, y_test_e = train_test_split(
    datasets["Emotion Classification Dataset"]['text'], y_emotion, test_size=0.2, random_state=42
)
vectorizer_emotion = TfidfVectorizer(max_features=5000)
X_train_e = vectorizer_emotion.fit_transform(text_train_e)
X_test_e = vectorizer_emotion.transform(text_test_e)
# Full-corpus matrix kept under its original name in case other cells use it;
# it is produced with the train-fitted vectorizer, so it carries no leakage.
X_emotion = vectorizer_emotion.transform(datasets["Emotion Classification Dataset"]['text'])

model_e = SVC()
model_e.fit(X_train_e, y_train_e)
y_pred_e = model_e.predict(X_test_e)
print("Accuracy:", accuracy_score(y_test_e, y_pred_e))
print(classification_report(y_test_e, y_pred_e))



🔹 Training on Emotion Classification Dataset...
Accuracy: 0.8559375
              precision    recall  f1-score   support

           0       0.90      0.93      0.92       946
           1       0.80      0.97      0.87      1021
           2       0.89      0.54      0.68       296
           3       0.91      0.81      0.86       427
           4       0.86      0.77      0.81       397
           5       0.84      0.51      0.64       113

    accuracy                           0.86      3200
   macro avg       0.87      0.76      0.80      3200
weighted avg       0.86      0.86      0.85      3200



In [None]:

# # Customer Support Dataset
# print("\n🔹 Training on Customer Support Dataset...")
# vectorizer_cs = TfidfVectorizer(max_features=5000)
# X_cs = vectorizer_cs.fit_transform(datasets["Customer Support Dataset"]['text'])
# y_cs = datasets["Customer Support Dataset"]['label']
# X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_cs, y_cs, test_size=0.2, random_state=42)
# model_c = RandomForestClassifier(n_estimators=100)
# model_c.fit(X_train_c, y_train_c)
# y_pred_c = model_c.predict(X_test_c)
# print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
# print(classification_report(y_test_c, y_pred_c))

# Modeling with the combined dataset
### 1. Overall sentiment
### 2. Fine-grained sentiment

In [None]:
# Load your combined dataset
# Adjust the path to wherever Final.csv lives on the mounted Drive. An earlier
# location was:
#   "/content/drive/MyDrive/MTech/Sem 2/Mini Project/Generated/Final.csv"
combined_csv_path = "/content/drive/MyDrive/Mini Project/Generated/Final.csv"
combined_df = pd.read_csv(combined_csv_path)


In [None]:
# Preview the first five rows of the combined dataset.
combined_df.head(n=5)

Unnamed: 0,text,sentiment_label,emotion_label,hate_speech_label,source_dataset,fine_grained_label,sentiment_label_text,predicted_sentiment_label,bert_predicted_sentiment_label,predicted_sentiment_label_numeric
0,when modi promised “minimum government maximum...,-1.0,1.0,0.0,sentiment_dataset,joyful,Negative,Positive,-1,1
1,talk all the nonsense and continue all the dra...,0.0,0.0,0.0,sentiment_dataset,melancholy,Neutral,Negative,-1,-1
2,what did just say vote for modi welcome bjp t...,1.0,1.0,0.0,sentiment_dataset,joyful,Positive,Positive,-1,1
3,asking his supporters prefix chowkidar their n...,1.0,1.0,0.0,sentiment_dataset,joyful,Positive,Negative,-1,-1
4,answer who among these the most powerful world...,1.0,1.0,0.0,sentiment_dataset,joyful,Positive,Positive,1,1


In [None]:
# (rows, columns) of the combined dataset.
combined_df.shape

(210938, 10)

In [None]:
# List the column names of the combined dataset.
combined_df.columns

Index(['text', 'sentiment_label', 'emotion_label', 'hate_speech_label',
       'source_dataset', 'fine_grained_label', 'sentiment_label_text',
       'predicted_sentiment_label', 'bert_predicted_sentiment_label',
       'predicted_sentiment_label_numeric'],
      dtype='object')

### 1. Overall sentiment

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
def train_and_evaluate(df, target_column, model_name='paraphrase-MiniLM-L6-v2', test_size=0.2, random_state=42):
    """Train and evaluate a logistic-regression classifier on sentence embeddings.

    The ``text`` column is cleaned with ``apply_preprocessing``, embedded with a
    SentenceTransformer model, split into train and test parts, and used to fit
    a ``LogisticRegression``. Accuracy, the classification report and the
    confusion matrix for the test part are printed.

    Args:
        df: DataFrame with a ``text`` column and ``target_column``. It is not
            modified.
        target_column: Name of the label column to predict.
        model_name: SentenceTransformer model identifier used for embeddings.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split.

    Returns:
        A ``(clf, embedder)`` tuple: the fitted classifier and the
        SentenceTransformer used to produce its input features.

    Raises:
        KeyError: If ``text`` or ``target_column`` is missing from *df*.
    """
    print(f"\n🚀 Starting training for: {target_column}")

    # 1. Preprocess
    # Only the two columns that are used are copied and handed over, so the
    # caller's frame can never be overwritten. This function is called several
    # times on the same DataFrame with different targets, and each call must
    # start from the untouched text and labels.
    df_clean = apply_preprocessing(df[["text", target_column]].copy(), target_column)

    # 2. Load SentenceTransformer model
    print(f"Loading SentenceTransformer model: {model_name}...")
    embedder = SentenceTransformer(model_name)

    # 3. Encode text
    print("Encoding text into embeddings...")
    X = embedder.encode(
        df_clean["text"].tolist(),
        batch_size=32,
        show_progress_bar=True,
    )

    # 4. Target
    y = df_clean[target_column]

    # 5. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # 6. Train model
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    # 7. Predict
    y_pred = clf.predict(X_test)

    # 8. Evaluate
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ Accuracy for {target_column}: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return clf, embedder

In [None]:
# Baseline run: predict the dataset's original `sentiment_label` column.
model_orig, embedder_orig = train_and_evaluate(
    df=combined_df,
    target_column="sentiment_label",
)




🚀 Starting training for: sentiment_label


NameError: name 'apply_preprocessing' is not defined

In [None]:
# Same pipeline, but the target is the VADER-predicted label column.
model_vader, embedder_vader = train_and_evaluate(
    df=combined_df,
    target_column="predicted_sentiment_label_numeric",
)


In [None]:

# Same pipeline, but the target is the BERT-predicted label column.
model_bert, embedder_bert = train_and_evaluate(
    df=combined_df,
    target_column="bert_predicted_sentiment_label",
)

### Results with SentenceTransformer model `paraphrase-MiniLM-L6-v2`
1. Accuracy for `sentiment_label`: 0.6659
2. Accuracy for `predicted_sentiment_label_numeric`: 0.7101
3. Accuracy for `bert_predicted_sentiment_label`: 0.7969

### 2. Fine-grained sentiment

In [None]:
# Fine-grained run: predict the `fine_grained_label` column.
model_fine, embedder_fine = train_and_evaluate(
    df=combined_df,
    target_column="fine_grained_label",
)

# All combinations

In [None]:
import pandas as pd
import numpy as np
import re
import time
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer


In [None]:
# 📉 Sample the dataset
def sample_dataset(df, frac=0.1, random_state=42):
    """Return a seeded random fraction of *df*, re-indexed from 0."""
    subset = df.sample(frac=frac, random_state=random_state)
    # Discard the original row labels so the result is numbered 0..n-1.
    subset = subset.reset_index(drop=True)
    return subset


In [None]:

# 🔥 Your preprocessing function
from functools import lru_cache

# Patterns are compiled once here rather than being looked up on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, built on first use."""
    # Built lazily so that defining these functions does not require the
    # stopwords corpus to be downloaded yet.
    return frozenset(stopwords.words('english'))


def preprocess_text(text, stop_words=None, tokenizer=None):
    """Normalise one piece of text for bag-of-words style models.

    Steps, in order: lowercase, strip URLs (``http...`` / ``www...``), drop every
    character that is not an ASCII letter or whitespace, tokenise, remove stop
    words, and re-join the remaining tokens with single spaces.

    Args:
        text: The raw text. Missing values (``None``/NaN) yield ``""``; any
            other non-string value is converted with ``str()`` first.
        stop_words: Optional collection of tokens to remove. Defaults to
            NLTK's English stop-word list.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.tokenize.word_tokenize``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = _URL_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)

    # Previously stopwords.words('english') was evaluated once per token and
    # searched as a list; a cached frozenset gives the same result with one
    # load and constant-time membership tests.
    if stop_words is None:
        stop_words = _english_stopwords()
    tokenize = word_tokenize if tokenizer is None else tokenizer
    return " ".join(word for word in tokenize(text) if word not in stop_words)

# Apply preprocessing
def apply_preprocessing(df, label_col):
    """Return a copy of *df* with cleaned text and integer-encoded labels.

    The ``text`` column is passed through ``preprocess_text`` and ``label_col``
    is replaced by the integer codes produced by a fresh ``LabelEncoder``.

    Args:
        df: DataFrame containing a ``text`` column and ``label_col``.
        label_col: Name of the column holding the class labels.

    Returns:
        A new DataFrame; the frame passed in is left unchanged.

    Raises:
        KeyError: If ``text`` or ``label_col`` is not a column of *df*.
    """
    text_col = 'text'
    if text_col not in df.columns or label_col not in df.columns:
        raise KeyError(f"Columns {text_col} or {label_col} not found in DataFrame")

    # Work on a copy so the caller's DataFrame is not overwritten; otherwise
    # calling this twice on the same frame re-cleans already cleaned text and
    # leaves the caller's label column encoded.
    df = df.copy()

    # Missing values must become "" before any string conversion: astype(str)
    # would turn NaN into the literal token "nan", which then survives cleaning.
    df[text_col] = df[text_col].apply(
        lambda value: "" if pd.isnull(value) else preprocess_text(str(value))
    )
    df[label_col] = LabelEncoder().fit_transform(df[label_col])
    return df

# # 🧠 Available Embedding Models
# embedding_models = {
#     "paraphrase-MiniLM-L6-v2": "paraphrase-MiniLM-L6-v2",
#     "all-MiniLM-L6-v2": "all-MiniLM-L6-v2",
#     "all-mpnet-base-v2": "all-mpnet-base-v2",
#     "distiluse-base-multilingual-cased": "distiluse-base-multilingual-cased"
# }

# 🧠 Available Classifiers
# Display name -> scikit-learn estimator built with the settings shown.
# run_experiment iterates this dict and calls fit() on these same instances,
# so after an experiment each one holds its most recent fit.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    # "Naive Bayes": GaussianNB()
}

# 📚 New Experiment Function
def run_experiment(df, label_col, embed_model_name="all-MiniLM-L6-v2"):
    """Benchmark every entry of ``classifiers`` on sentence embeddings of *df*.

    The text is cleaned, embedded once with the given SentenceTransformer
    model, split 80/20 with stratification on the label, and each classifier
    is fitted and scored on the same split.

    Args:
        df: DataFrame with a ``text`` column and ``label_col``. It is not
            modified.
        label_col: Name of the label column to predict.
        embed_model_name: SentenceTransformer model identifier.

    Returns:
        A DataFrame with one row per classifier and the columns
        ``Embedding Model``, ``Classifier``, ``Accuracy``, ``Macro F1`` and
        ``Training Time (s)``.

    Raises:
        KeyError: If ``text`` or ``label_col`` is missing from *df*.
        ValueError: If no usable rows remain after removing unlabeled rows and
            single-member classes.
    """
    df = df[['text', label_col]].copy()

    # Rows without a label cannot be used for supervised training.
    df = df.dropna(subset=[label_col])

    # A stratified split needs at least two rows of every class; a class with a
    # single member makes train_test_split raise ValueError. Such classes are
    # removed up front (and reported) so the experiment can run.
    class_counts = df[label_col].value_counts()
    rare_classes = class_counts[class_counts < 2].index
    if len(rare_classes) > 0:
        print(f"⚠️ Dropping {len(rare_classes)} class(es) with fewer than 2 samples: {list(rare_classes)}")
        df = df[~df[label_col].isin(rare_classes)].copy()
    if df.empty:
        raise ValueError(f"No usable rows left for label column {label_col!r}")

    df = apply_preprocessing(df, label_col)

    results = []

    print(f"\n🧠 Using embedding model: {embed_model_name}")
    # No device is forced: the model was previously loaded with device='cpu'
    # while encode() requested device='cuda', which contradict each other and
    # cannot work on a runtime without a GPU. Leaving both unset lets
    # sentence-transformers choose the device itself.
    model = SentenceTransformer(embed_model_name)

    print("Encoding text into embeddings...")
    embeddings = model.encode(df['text'].tolist(), batch_size=256, show_progress_bar=True)

    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, df[label_col], test_size=0.2, random_state=42, stratify=df[label_col]
    )

    for clf_name, clf in classifiers.items():
        print(f"\n🚀 Training classifier: {clf_name} with {embed_model_name}")

        # Only the fit itself is timed.
        start_time = time.time()
        clf.fit(X_train, y_train)
        training_time = time.time() - start_time

        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        f1_macro = report['macro avg']['f1-score']

        results.append({
            'Embedding Model': embed_model_name,
            'Classifier': clf_name,
            'Accuracy': acc,
            'Macro F1': f1_macro,
            'Training Time (s)': round(training_time, 2)
        })

        print(f"✅ {clf_name}: Accuracy={acc:.4f}, Macro F1={f1_macro:.4f}")

    results_df = pd.DataFrame(results)
    return results_df


In [None]:
# Column to predict — change to 'bert_predicted_sentiment_label' etc. to compare targets.
label_col = 'fine_grained_label'

# Run the classifier comparison on a 50% random sample of the combined dataset.
sampled_df = sample_dataset(df=combined_df, frac=0.5)
final_results = run_experiment(df=sampled_df, label_col=label_col, embed_model_name="all-MiniLM-L6-v2")

# Leaderboard: best accuracy first, rows renumbered from 0.
_ranked_results = final_results.sort_values(by='Accuracy', ascending=False)
_ranked_results = _ranked_results.reset_index(drop=True)
print("\n📊 FINAL RESULTS:", _ranked_results, sep="\n")



🧠 Using embedding model: all-MiniLM-L6-v2
Encoding text into embeddings...


Batches:   0%|          | 0/330 [00:00<?, ?it/s]

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.