# Basic Model: Naive Bayes

### Import necessary packages

In [None]:
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, f1_score
import nltk

import mlflow
import logging 
import modeling.config 

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from modeling.basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)

### Setup

In [None]:
MODEL_NAME = "Naive_Bayes_oversampling"

# Read the MLflow tracking URI from a local file. The context manager makes
# sure the file handle is closed instead of being left to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

EXPERIMENT_NAME = modeling.config.EXPERIMENT_NAME

# Prefix every log message with a timestamp.
logging.basicConfig(format="%(asctime)s: %(message)s")

logger = logging.getLogger()
# Emit INFO and more severe records (WARNING, ERROR, ...); DEBUG is ignored.
logger.setLevel(logging.INFO)

In [3]:
# Relative path of the CSV dataset that is loaded in the next step.
DATA_PATH = "../data/data_dropped_duplicates_small.csv"

### Get and process data

In [4]:
# Load the dataset at DATA_PATH through the project helper.
# NOTE(review): the preprocessing itself lives in modeling.basic_functions
# and is not visible here.
df = get_preprocess_data(DATA_PATH)

### Lemmatize text

In [None]:
# Fetch the NLTK 'wordnet' resource.
# Presumably required by get_lemmatized_data below — TODO confirm.
nltk.download('wordnet')

In [6]:
# Replace df with the output of the project's lemmatization helper.
df = get_lemmatized_data(df)

### Train-test split

In [7]:
# Features come from the "text" column, targets from "logical_fallacies".
X = df["text"]
y = df["logical_fallacies"]

# Hold out 30% of the rows for testing. The split is stratified on the labels
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

### Naive Bayes

In [8]:
# Point MLflow at the tracking server and select the experiment to log into.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
# Open a run; it stays active until mlflow.end_run() is called at the end.
mlflow.start_run()
run = mlflow.active_run()
# Tag the run with the model name and a readable run name.
mlflow.set_tag("model_name", MODEL_NAME)
mlflow.set_tag("mlflow.runName", "naive bayes oversampling")
# mlflow.log_params(params)

### For Grid Search

In [9]:
# # Use TF-IDF Vectorizer to transform text into numerical data
# tfidf_vectorizer = TfidfVectorizer()
# X_vectorized = tfidf_vectorizer.fit_transform(X_train)
# X_vectorized_test = tfidf_vectorizer.transform(X_test)

In [10]:
# # Oversampling only on train dataset
# ros = RandomOverSampler(random_state=0)
# X_resampled_train, y_resampled_train = ros.fit_resample(X_vectorized, y_train)

In [11]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.naive_bayes import MultinomialNB


# # Define the parameter grid for Naive Bayes
# param_grid = {
#     'alpha': [0.1, 0.5, 1.0, 2.0],  # Smoothing parameter
#     'fit_prior': [True, False]       # Whether to learn class priors from data
# }

# # # Create a custom scoring function for macro F1
# # macro_f1_scorer = make_scorer(f1_score, average='macro')

# # Initialize Naive Bayes model
# nb = MultinomialNB()

# # Set up GridSearchCV with weighted F1 scoring
# grid_search = GridSearchCV(nb, param_grid, cv=5, scoring="f1_weighted")
# grid_search.fit(X_resampled_train, y_resampled_train)

# # Get best parameters and model
# best_params = grid_search.best_params_
# print("Best parameters:", best_params)

# best_model = grid_search.best_estimator_

# # Predict on train and test data
# y_train_pred = best_model.predict(X_resampled_train)
# y_test_pred = best_model.predict(X_vectorized_test)

# # Log parameters to MLflow
# mlflow.log_params(best_params)

### Without Gridsearch in Pipeline

In [None]:
# Pipeline: TF-IDF vectorization -> random oversampling -> Multinomial Naive Bayes
pipeline_bayes = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("ros", RandomOverSampler(random_state=10)),
        ("nb", MultinomialNB()),
    ]
)

# Fit all steps on the training split.
pipeline_bayes.fit(X_train, y_train)

# Rebuild the oversampled training set from the fitted steps so that train
# predictions can be compared against the resampled labels.
# NOTE(review): the oversampler has a fixed random_state, so this should
# reproduce the resampling used during fit — confirm.
fitted_steps = pipeline_bayes.named_steps
X_train_tfidf = fitted_steps["tfidf"].transform(X_train)
X_res, y_res = fitted_steps["ros"].fit_resample(X_train_tfidf, y_train)
y_res_pred = fitted_steps["nb"].predict(X_res)

# Predictions for the held-out test split go through the whole pipeline.
y_test_pred = pipeline_bayes.predict(X_test)

### Evaluation


In [14]:
def log_metrics(cr, split):
    """Log every entry of a classification report to the active MLflow run.

    Args:
        cr: Report dictionary. The ``"accuracy"`` entry is logged as a single
            value named ``{split}_accuracy``; every other entry is treated as
            a mapping of metric name to value and logged as
            ``{split}_{key}_{metric}``.
        split: Prefix for the metric names (the callers pass ``"train"`` and
            ``"test"``).
    """
    for key, value in cr.items():
        if key == "accuracy":
            # Accuracy is stored as a single number, not a nested mapping.
            mlflow.log_metric(f"{split}_{key}", value)
            continue
        # Iterate key/value pairs directly instead of re-looking each one up.
        for metric, score in value.items():
            mlflow.log_metric(f"{split}_{key}_{metric}", score)

In [None]:
# Train metrics are computed against the oversampled training labels (y_res)
# and logged to MLflow with the "train" prefix.
logger.info('get_train_metrics')
classification_report_train = get_metrics(y_res, y_res_pred)
log_metrics(classification_report_train, "train")

In [None]:
# Test metrics are computed on the held-out split and logged to MLflow with
# the "test" prefix.
logger.info('get_test_metrics')
classification_report_test = get_metrics(y_test, y_test_pred)
log_metrics(classification_report_test, "test")

In [None]:
# Close the MLflow run that was opened with mlflow.start_run() above.
mlflow.end_run()