In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the train / validation splits, discarding rows that have no message text
train_df = pd.read_csv("train.csv").dropna(subset=["message"])
val_df = pd.read_csv("validation.csv").dropna(subset=["message"])

# Labels for each split (taken after the rows without a message are removed,
# so they stay aligned with the feature matrices built below)
y_train, y_val = train_df["label"], val_df["label"]

# TF-IDF features: English stop words removed, at most 5000 terms.
# The vectorizer is fit on the training messages only and then applied
# unchanged to the validation messages.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train = vectorizer.fit_transform(train_df["message"].astype(str))
X_val = vectorizer.transform(val_df["message"].astype(str))

In [2]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Fit a linear-kernel support vector classifier on the training features
# (fit() returns the estimator itself, so construction and training chain)
svm = SVC(kernel="linear").fit(X_train, y_train)

# Evaluate Function
def evaluate_model(model, X, y, dataset_name):
    """Print accuracy and a classification report for one data split.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X: Features passed to ``model.predict``.
        y: True labels the predictions are scored against.
        dataset_name: Text shown in the header line (e.g. "Train").

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X)
    print(f"Performance on {dataset_name}:")
    accuracy = accuracy_score(y, predictions)
    print("Accuracy:", accuracy)
    report = classification_report(y, predictions)
    print(report)

# Report SVM metrics on the data it was fit on, then on the validation split
print("SVM Results:")
evaluate_model(model=svm, X=X_train, y=y_train, dataset_name="Train")
evaluate_model(model=svm, X=X_val, y=y_val, dataset_name="Validation")


SVM Results:
Performance on Train:
Accuracy: 0.9959605026929982
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3858
           1       1.00      0.97      0.98       598

    accuracy                           1.00      4456
   macro avg       1.00      0.99      0.99      4456
weighted avg       1.00      1.00      1.00      4456

Performance on Validation:
Accuracy: 0.9802513464991023
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       483
           1       1.00      0.85      0.92        74

    accuracy                           0.98       557
   macro avg       0.99      0.93      0.95       557
weighted avg       0.98      0.98      0.98       557



In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic regression classifier (library defaults) on the training
# features; fit() returns the estimator, so it is chained onto the constructor
logreg = LogisticRegression().fit(X_train, y_train)

# Evaluate Function
def evaluate_model(model, X, y, dataset_name):
    """Print accuracy and a classification report for one data split.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X: Features passed to ``model.predict``.
        y: True labels the predictions are scored against.
        dataset_name: Text shown in the header line (e.g. "Train").

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X)
    print(f"Performance on {dataset_name}:")
    accuracy = accuracy_score(y, predictions)
    print("Accuracy:", accuracy)
    report = classification_report(y, predictions)
    print(report)

# Report logistic regression metrics on the data it was fit on, then on the
# validation split
print("Logistic Regression Results:")
evaluate_model(model=logreg, X=X_train, y=y_train, dataset_name="Train")
evaluate_model(model=logreg, X=X_val, y=y_val, dataset_name="Validation")


Logistic Regression Results:
Performance on Train:
Accuracy: 0.9717235188509874
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3858
           1       0.99      0.80      0.88       598

    accuracy                           0.97      4456
   macro avg       0.98      0.90      0.93      4456
weighted avg       0.97      0.97      0.97      4456

Performance on Validation:
Accuracy: 0.9658886894075404
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       483
           1       1.00      0.74      0.85        74

    accuracy                           0.97       557
   macro avg       0.98      0.87      0.92       557
weighted avg       0.97      0.97      0.96       557



In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier (library defaults) on the training
# features; fit() returns the estimator, so it is chained onto the constructor
nb = MultinomialNB().fit(X_train, y_train)

# Evaluate Function
def evaluate_model(model, X, y, dataset_name):
    """Print accuracy and a classification report for one data split.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X: Features passed to ``model.predict``.
        y: True labels the predictions are scored against.
        dataset_name: Text shown in the header line (e.g. "Train").

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X)
    print(f"Performance on {dataset_name}:")
    accuracy = accuracy_score(y, predictions)
    print("Accuracy:", accuracy)
    report = classification_report(y, predictions)
    print(report)

# Report Naive Bayes metrics on the data it was fit on, then on the
# validation split
print("Naïve Bayes Results:")
evaluate_model(model=nb, X=X_train, y=y_train, dataset_name="Train")
evaluate_model(model=nb, X=X_val, y=y_val, dataset_name="Validation")

Naïve Bayes Results:
Performance on Train:
Accuracy: 0.9849640933572711
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3858
           1       1.00      0.89      0.94       598

    accuracy                           0.98      4456
   macro avg       0.99      0.94      0.97      4456
weighted avg       0.99      0.98      0.98      4456

Performance on Validation:
Accuracy: 0.9694793536804309
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       483
           1       1.00      0.77      0.87        74

    accuracy                           0.97       557
   macro avg       0.98      0.89      0.93       557
weighted avg       0.97      0.97      0.97       557



In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Re-read the training split and read the test split from disk, discarding
# rows that have no message text
train_df = pd.read_csv("train.csv").dropna(subset=["message"])
test_df = pd.read_csv("test.csv").dropna(subset=["message"])

# Labels for each split (taken after the rows without a message are removed)
y_train, y_test = train_df["label"], test_df["label"]

# TF-IDF features: English stop words removed, at most 5000 terms.
# A new vectorizer is fit on the training messages only and then applied
# unchanged to the test messages.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train = vectorizer.fit_transform(train_df["message"].astype(str))
X_test = vectorizer.transform(test_df["message"].astype(str))

# Build and fit the three classifiers on the training features
# (fit() returns the estimator, so each is constructed and trained in one step)
logreg = LogisticRegression().fit(X_train, y_train)
nb = MultinomialNB().fit(X_train, y_train)
svm = SVC(kernel="linear").fit(X_train, y_train)

# Evaluate Function
def evaluate_model(model, X, y, model_name):
    """Score a fitted model on one data split and return the results.

    Unlike a printing helper, this version writes nothing to stdout; the
    caller decides how to display the metrics.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X: Features passed to ``model.predict``.
        y: True labels the predictions are scored against.
        model_name: Accepted for the caller's convenience; not used here.

    Returns:
        A ``(accuracy, report)`` tuple holding the results of
        ``accuracy_score`` and ``classification_report`` respectively.
    """
    predictions = model.predict(X)
    return accuracy_score(y, predictions), classification_report(y, predictions)

# Evaluate all models on test data
# NOTE(review): the "best" model is chosen by its accuracy on the test split,
# so the reported best score is likely optimistic. Consider selecting on the
# validation split and keeping the test split for the final report only.
models = [logreg, nb, svm]
model_names = ["Logistic Regression", "Naïve Bayes", "SVM"]

# Initialise every tracker before the loop. Previously best_model_name was
# only bound inside the `if`, so the final print raised NameError whenever no
# model scored above the initial 0. Starting from -inf lets any real accuracy
# (including 0.0) become the provisional best.
best_model = None
best_model_name = None
best_accuracy = float("-inf")

for model, name in zip(models, model_names):
    accuracy, report = evaluate_model(model, X_test, y_test, name)
    print(f"{name} Performance on Test Data:")
    print("Accuracy:", accuracy)
    print(report)
    print("-" * 50)

    # Strict '>' keeps the earliest model in the list when accuracies tie
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = name

if best_model is None:
    # Only reachable if the model list above is emptied
    print("No models were evaluated.")
else:
    print(f"Best Model: {best_model_name} with Accuracy: {best_accuracy}")

Logistic Regression Performance on Test Data:
Accuracy: 0.9730700179533214
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       482
           1       1.00      0.80      0.89        75

    accuracy                           0.97       557
   macro avg       0.98      0.90      0.94       557
weighted avg       0.97      0.97      0.97       557

--------------------------------------------------
Naïve Bayes Performance on Test Data:
Accuracy: 0.9730700179533214
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       482
           1       1.00      0.80      0.89        75

    accuracy                           0.97       557
   macro avg       0.98      0.90      0.94       557
weighted avg       0.97      0.97      0.97       557

--------------------------------------------------
SVM Performance on Test Data:
Accuracy: 0.9838420107719928
              precision    recall  f1-score