# Model Training for Sentiment Analysis

This notebook trains several sentiment analysis models on the cleaned dataset of Malay and English tweets. Three kinds of model are trained: a Naive Bayes classifier, an embedding-based model, and a traditional sentiment analysis model. The data is split 80/20 into training and test sets, and the effectiveness of each model is compared on the test set using confusion matrices and other performance metrics.

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from src.data_preprocessing.data_cleaner import clean_data
from src.models.naive_bayes import train_naive_bayes
from src.models.embedding_models import train_embedding_model
from src.models.traditional_models import train_traditional_model
from src.evaluation.metrics import calculate_metrics
from src.evaluation.confusion_matrix import plot_confusion_matrix

# --- Data preparation -------------------------------------------------------
# Read the raw CSV, then pass it through the project's cleaning helper.
data_path = '../data/raw/semisupervised-bert-xlnet.csv'
df = pd.read_csv(data_path)
df_cleaned = clean_data(df)

# Inputs come from the 'text' column, targets from the 'label' column.
X, y = df_cleaned['text'], df_cleaned['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Training ---------------------------------------------------------------
# Each trainer receives the same training split. They run in this order:
# Naive Bayes, embedding-based, traditional.
nb_model, embedding_model, traditional_model = (
    fit(X_train, y_train)
    for fit in (train_naive_bayes, train_embedding_model, train_traditional_model)
)

# --- Evaluation -------------------------------------------------------------
# Predict on the held-out split with every fitted model.
nb_predictions, embedding_predictions, traditional_predictions = (
    fitted.predict(X_test)
    for fitted in (nb_model, embedding_model, traditional_model)
)

# Score each set of predictions against the true test labels.
nb_metrics, embedding_metrics, traditional_metrics = (
    calculate_metrics(y_test, predicted)
    for predicted in (nb_predictions, embedding_predictions, traditional_predictions)
)

# Draw one confusion matrix per model.
for _title, _predicted in (
    ('Naive Bayes Confusion Matrix', nb_predictions),
    ('Embedding Model Confusion Matrix', embedding_predictions),
    ('Traditional Model Confusion Matrix', traditional_predictions),
):
    plot_confusion_matrix(y_test, _predicted, title=_title)

# Print the metrics for each model after all plots have been requested.
for _heading, _scores in (
    ('Naive Bayes Metrics:', nb_metrics),
    ('Embedding Model Metrics:', embedding_metrics),
    ('Traditional Model Metrics:', traditional_metrics),
):
    print(_heading, _scores)