In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load dataset
# The path is a raw string (r"...") so the Windows backslashes are taken literally.
# In a plain literal, "\G", "\S", "\D" and "\d" are invalid escape sequences, which
# Python 3.12+ reports as a SyntaxWarning, and a folder starting with e.g. "t" or "n"
# would silently turn into a tab or newline character.
csv_path = r"E:\Github\Sentiment-Analysis-Speech-Emotion-Recognition\Dataset\dataset.csv"  # Update this path
df = pd.read_csv(csv_path)

In [3]:
# Initialize sentence embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode sentences into embeddings (the feature matrix used by every classifier below)
embeddings = model.encode(df["Sentence"].tolist())

# Label encoding: human-readable label -> integer class id
type_mapping = {"Affirmation": 0, "Negation": 1}
fact_subj_mapping = {"Factual": 0, "Subjective": 1}
sentiment_mapping = {"Sadness": 0, "Anger": 1, "Neutral": 2, "Happiness": 3, "Euphoria": 4}


def _encode_labels(column, mapping):
    """Return `column` as integer class ids according to `mapping`.

    Labels that are missing or not present in `mapping` are encoded as -1.

    The function is idempotent: if `column` already has an integer dtype it is
    assumed to have been encoded by a previous run of this cell and is returned
    unchanged. Without this guard, re-running the cell would look the integer
    codes up in the string-keyed mapping, find nothing, and silently turn every
    label into -1.
    """
    if pd.api.types.is_integer_dtype(column):
        return column.astype(int)
    return column.map(mapping).fillna(-1).astype(int)


# The encoded columns are written back to df, as before, so any later cell that
# reads df["Type"], df["Factual/Subjective"] or df["Sentiment"] still sees integers.
# NOTE(review): -1 marks an unknown label; check that no -1 reaches the classifiers
# (XGBoost presumably expects class ids 0..k-1 — confirm).
df["Type"] = _encode_labels(df["Type"], type_mapping)
df["Factual/Subjective"] = _encode_labels(df["Factual/Subjective"], fact_subj_mapping)
df["Sentiment"] = _encode_labels(df["Sentiment"], sentiment_mapping)

# Convert labels to NumPy arrays
type_labels = df["Type"].values
fact_subj_labels = df["Factual/Subjective"].values
sentiment_labels = df["Sentiment"].values

In [4]:
# Hold out 20% of the rows as a test set. The embeddings and all three label
# arrays are split together so they stay row-aligned; stratification uses the
# sentiment labels only, and random_state=42 fixes the shuffle.
(
    X_train, X_test,
    y_type_train, y_type_test,
    y_fact_train, y_fact_test,
    y_sent_train, y_sent_test,
) = train_test_split(
    embeddings,
    type_labels,
    fact_subj_labels,
    sentiment_labels,
    stratify=sentiment_labels,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Search space for Logistic Regression: candidate values of C (inverse
# regularization strength), one per decade from 0.01 to 100.
logistic_param_grid = dict(C=[0.01, 0.1, 1, 10, 100])

In [6]:
def train_logistic_regression(X_train, y_train, X_test, y_test, name):
    """Tune, fit and evaluate a Logistic Regression classifier.

    Runs a 3-fold cross-validated grid search over ``logistic_param_grid``
    (scored by accuracy, n_jobs=-1) on the training data, then reports the
    accuracy of the best estimator on the held-out test data.

    Parameters:
        X_train, y_train: features and labels used for the grid search.
        X_test, y_test: features and labels used only for the final accuracy.
        name: task label inserted into the printed report.

    Returns:
        The best estimator found by the grid search.
    """
    search = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000),
        param_grid=logistic_param_grid,
        scoring="accuracy",
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    tuned_clf = search.best_estimator_
    test_accuracy = accuracy_score(y_test, tuned_clf.predict(X_test))

    print(f"Best Logistic Regression Model for {name}: {search.best_params_}")
    print(f"Accuracy for {name}: {test_accuracy:.4f}\n")

    return tuned_clf

In [7]:
# Affirmation vs. Negation: tune and evaluate a Logistic Regression classifier
type_model = train_logistic_regression(
    X_train=X_train,
    y_train=y_type_train,
    X_test=X_test,
    y_test=y_type_test,
    name="Type Classification",
)


Best Logistic Regression Model for Type Classification: {'C': 10}
Accuracy for Type Classification: 0.9490



In [8]:
# Factual vs. Subjective: tune and evaluate a Logistic Regression classifier
fact_model = train_logistic_regression(
    X_train=X_train,
    y_train=y_fact_train,
    X_test=X_test,
    y_test=y_fact_test,
    name="Factual/Subjective Classification",
)

Best Logistic Regression Model for Factual/Subjective Classification: {'C': 100}
Accuracy for Factual/Subjective Classification: 0.9388



In [9]:
# Search space for XGBoost: 3 x 3 x 3 = 27 combinations of number of trees,
# tree depth and learning rate.
xgb_param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
)

In [10]:
def train_xgboost(X_train, y_train, X_test, y_test, name):
    """Tune, fit and evaluate an XGBoost classifier.

    Runs a 3-fold cross-validated grid search over ``xgb_param_grid``
    (scored by accuracy, n_jobs=-1) on the training data, using an
    ``XGBClassifier`` with ``eval_metric="mlogloss"``, then reports the
    accuracy of the best estimator on the held-out test data.

    Parameters:
        X_train, y_train: features and labels used for the grid search.
        X_test, y_test: features and labels used only for the final accuracy.
        name: task label inserted into the printed report.

    Returns:
        The best estimator found by the grid search.
    """
    search = GridSearchCV(
        estimator=XGBClassifier(eval_metric="mlogloss"),
        param_grid=xgb_param_grid,
        scoring="accuracy",
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    tuned_clf = search.best_estimator_
    test_accuracy = accuracy_score(y_test, tuned_clf.predict(X_test))

    print(f"Best XGBoost Model for {name}: {search.best_params_}")
    print(f"Accuracy for {name}: {test_accuracy:.4f}\n")

    return tuned_clf

In [11]:
# Five-way sentiment: tune and evaluate an XGBoost classifier
sentiment_model = train_xgboost(
    X_train=X_train,
    y_train=y_sent_train,
    X_test=X_test,
    y_test=y_sent_test,
    name="Sentiment Classification",
)


Best XGBoost Model for Sentiment Classification: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Accuracy for Sentiment Classification: 0.8878



In [12]:
import pickle

# Write each fitted estimator to its own pickle file in the current working
# directory: the two Logistic Regression models first, then the XGBoost model.
for pkl_name, fitted in (
    ("type_model.pkl", type_model),
    ("fact_model.pkl", fact_model),
    ("sentiment_model.pkl", sentiment_model),
):
    with open(pkl_name, "wb") as f:
        pickle.dump(fitted, f)
