In [25]:
import pandas as pd
import numpy as np
import os
import sys
from dataclasses import dataclass, field
sys.path.append('..')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from utils import *
from logger import logging
from exception import CustomException

# appending a path 

import warnings
warnings.filterwarnings("ignore")

In [None]:
# !pip install imblearn

In [8]:
df = pd.read_csv('../data/toots_labeled.csv')
df.head()

Unnamed: 0,_id,user,content,y
0,111753937734745893,nowifeinthelaopobing,冷静下来仍然觉得增进自己贡献价值是正路，但我今年真的要学会为自己争取才可以，我不会再让了。,0
1,111753937703902969,marthadear,war neulich auf ner veranstaltung und (wieder ...,1
2,111753937695537419,IHasWisdom,Q:\tHow many marketing people does it take to ...,0
3,111753937678847359,Steffell,The everlasting romance of Education: from Yes...,0
4,111753937675453376,Deiru,I mean I called it right around when I barely ...,0


In [9]:
preprocessor = load_object('../artifacts/preprocessor.pkl')

In [23]:
X, y = df['content'], df['y']
X_emb = preprocessor.fit_transform(X)

# Initialize the RandomOverSampler
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Fit and transform your dataset
X_resampled, y_resampled = oversampler.fit_resample(X_emb, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([299, 304], dtype=int64))

In [24]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
}

params = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],
    },
    "Random Forest": {
        'n_estimators': [10, 50, 100],
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
    },
    "Naive Bayes": {},
}

In [28]:
model_report = {}
fitted_models = {}
for name, model in models.items():
    logging.info(f"Training {name}")
    if name in params:
        model = GridSearchCV(model, params[name], cv=5, scoring='f1')
        model.fit(X_train, y_train)
        best_model = model.best_estimator_  # Use the best estimator from GridSearchCV
        fitted_models[name] = model
    else:
        model.fit(X_train, y_train)
        best_model = model  # Use the directly fitted model
        fitted_models[name] = model

    predictions = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, zero_division=0)
    recall = recall_score(y_test, predictions, zero_division=0)
    f1 = f1_score(y_test, predictions, zero_division=0)
    model_report[name] = f1
    logging.info(f"Model {name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

best_model_score = max(model_report.values())
best_model_name = max(model_report, key=model_report.get)
# best_model = models[best_model_name]
best_model = fitted_models[best_model_name]

logging.info(f"Best model found on train and test dataset: {best_model_name}")

In [29]:
# Save the best model and the vectorizer
trained_model_file_path = "./artifacts/model.pkl"
preprocessor_file_path = "./artifacts/preprocessor.pkl"

save_object(
    file_path= trained_model_file_path,
    obj=best_model
)
save_object(
    file_path= preprocessor_file_path,
    obj= preprocessor
)

logging.info("Model and preprocessor saved to disk successfully")

In [30]:
print("best_model_name", best_model_name)
print("best_model_score", best_model_score)
print("accuracy", accuracy_score(y_test, best_model.predict(X_test)))
print("precision", precision_score(y_test, best_model.predict(X_test)))
print("recall", recall_score(y_test, best_model.predict(X_test)))
print("f1_score", f1_score(y_test, best_model.predict(X_test)))

best_model_name Logistic Regression
best_model_score 1.0
accuracy 1.0
precision 1.0
recall 1.0
f1_score 1.0
