## Import necessary libraries

In [1]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Load and split the data


In [4]:
# Location of the input CSV. Set the TRANSLATED_TEXTS_CSV environment variable
# to run this notebook on another machine; when it is unset, the original
# local path is used so existing setups keep working.
DATA_PATH = os.environ.get(
    'TRANSLATED_TEXTS_CSV',
    'C:/Users/windows 10/MyProject/DeepLearning/Data/Translated_texts.csv',
)
data = pd.read_csv(DATA_PATH)

# Clean the dataset: drop the columns the models do not use
# (errors='ignore' tolerates any of them being absent from the file)
columns_to_remove = ['name', 'date', 'likes', 'source']
data = data.drop(columns=columns_to_remove, errors='ignore')
# Rows missing either the text or its label cannot be used for training
data = data.dropna(subset=['comment', 'sentiment'])

# Define features and target
X = data['comment']
y = data['sentiment']

# Split the data (80% for train, 20% for test); the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



## Set hyperparameters manually

In [7]:
# Seed handed to every estimator below that accepts one. It is the same value
# the train/test split uses, so repeated runs report the same metrics.
RANDOM_STATE = 42

# Hand-picked settings per model, keyed by display name:
#   "tfidf_params" -> keyword arguments for TfidfVectorizer
#   "model_params" -> keyword arguments for the classifier constructor
manual_hyperparams = {
    "Naive Bayes": {
        "tfidf_params": {"max_df": 0.9, "min_df": 2, "ngram_range": (1, 2)},
        "model_params": {"alpha": 0.5}
    },
    "Decision Tree": {
        "tfidf_params": {"max_df": 0.95, "min_df": 2, "ngram_range": (1, 2)},
        # random_state pins how ties between equally good splits are broken;
        # without it the fitted tree can change from run to run.
        "model_params": {
            "max_depth": None,
            "min_samples_split": 2,
            "random_state": RANDOM_STATE,
        }
    },
    "Support Vector Machine": {
        "tfidf_params": {"max_df": 0.95, "min_df": 2, "ngram_range": (1, 1)},
        "model_params": {"C": 1, "gamma": 1}
    },
    "Logistic Regression": {
        "tfidf_params": {"max_df": 0.9, "min_df": 2, "ngram_range": (1, 1)},
        # The liblinear solver shuffles the data internally, so it is seeded too.
        "model_params": {
            "C": 1,
            "solver": "liblinear",
            "random_state": RANDOM_STATE,
        }
    }
}

# Estimator classes (not instances), keyed by the same display names used in
# manual_hyperparams. Each class is instantiated with its own parameters when
# the pipelines are built.
models = dict([
    ("Naive Bayes", MultinomialNB),
    ("Decision Tree", DecisionTreeClassifier),
    ("Support Vector Machine", SVC),
    ("Logistic Regression", LogisticRegression),
])



## Train, test, and evaluate the models

In [10]:
# Fit and score one TF-IDF + classifier pipeline per entry in `models`.
# Every model is trained on the training split and then scored on both splits,
# so the gap between training and testing accuracy is visible in the report.
for model_name, model_class in models.items():
    print(f"Training {model_name}...")

    # Settings chosen by hand for this model (vectorizer and classifier)
    tfidf_params = manual_hyperparams[model_name]["tfidf_params"]
    model_params = manual_hyperparams[model_name]["model_params"]

    # Vectorization lives inside the pipeline, so TF-IDF is fitted on the
    # training texts only and merely applied to the test texts.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(**tfidf_params)),
        ('model', model_class(**model_params))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Calculate training accuracy
    y_train_pred = pipeline.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)

    # Calculate testing accuracy
    y_test_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Evaluate testing performance (class-frequency-weighted averages).
    # zero_division=0 scores a class that is never predicted as 0, which is
    # what the default does as well, but without emitting
    # UndefinedMetricWarning for every affected model.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_test_pred, average='weighted', zero_division=0
    )

    # Report
    print(f"Model: {model_name}")
    print(f"=======>>>Training : {train_accuracy:.2f}")

    print(f"Testing Accuracy: {test_accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")
    print("=" * 60)


Training Naive Bayes...
Model: Naive Bayes
Testing Accuracy: 0.77
Precision: 0.77
Recall: 0.77
F1-score: 0.76
Training Decision Tree...
Model: Decision Tree
Testing Accuracy: 0.73
Precision: 0.73
Recall: 0.73
F1-score: 0.72
Training Support Vector Machine...
Model: Support Vector Machine
Testing Accuracy: 0.77
Precision: 0.77
Recall: 0.77
F1-score: 0.76
Training Logistic Regression...
Model: Logistic Regression
Testing Accuracy: 0.76
Precision: 0.78
Recall: 0.76
F1-score: 0.75
