# Basic Models: SVM and Naive Bayes

### Import necessary packages

In [1]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

import mlflow
import logging 
import config 

from basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)


### Setup

In [2]:
MODEL_NAME = "SVM" 
TRACKING_URI = open("../.mlflow_uri").read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

logging.basicConfig(format="%(asctime)s: %(message)s") # Configure logging format to show timestamp before every message

logger = logging.getLogger()
logger.setLevel(logging.INFO) # Only show logs that are INFO or more important (e.g., WARNING, ERROR) — but ignore DEBUG.

In [3]:
DATA_PATH = "../data/data_dropped_duplicates_small.csv"

### Get and process data

In [4]:
df = get_preprocess_data(DATA_PATH)

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/maren/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Lemmatize text

In [6]:
df.head()

Unnamed: 0,dataset,text,logical_fallacies,source
18418,8,yes anything to help people live loinger,ad_hominem,
7891,3,"yes, i can see why some people would find this...",none,
19326,9,a sorry little post office that can't do anyth...,appeal_to_emotion,
19159,9,"american lives that have been lost, families t...",appeal_to_emotion,
8695,3,i kind of understand part of this problem. i h...,faulty_generalization,


In [7]:
df = get_lemmatized_data(df)

### Train-test split

In [8]:
y = df["logical_fallacies"]
X = df["text"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)

### SVM

In [9]:
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.start_run()
run = mlflow.active_run()
mlflow.set_tag("model_name", MODEL_NAME)
mlflow.set_tag("mlflow.runName", "svm baseline")
# mlflow.log_params(params)

In [10]:
# Use TF-IDF Vecorizer to transform text into numerical data
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_vectorized_test = tfidf_vectorizer.transform(X_test)

In [11]:

# Grid search
param_grid = {
'C': [0.1, 1, 10, 100],
'gamma': ['scale', 'auto'],
'kernel': ['linear', 'rbf']
}

# svm = SVC(probability=True)
svm = SVC()


grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_vectorized, y_train)

best_params = grid_search.best_params_
print(best_params)

# Predict on train and test data
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_vectorized)
y_test_pred = best_model.predict(X_vectorized_test)

mlflow.log_params(best_params)


    


{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


FileNotFoundError: [Errno 2] No such file or directory: '../models/svm/svm_model.pkl'

In [12]:
# Save model to pickle file:
with open('../models/svm/svm_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

### Evaluation


In [13]:
def log_metrics(cr, split):
    for key, value in cr.items():
        if (key == "accuracy"):
                # print(f"{split}_{key}", round(value,2))
                mlflow.log_metric(f"{split}_{key}", value)
        else:
            for metric in value:
                mlflow.log_metric(f"{split}_{key}_{metric}", value.get(metric))
                # print(f"{split}_{key}_{metric}", round(value.get(metri

In [None]:
logger.info('get_train_metrics')
classification_report_train = get_metrics(y_train, y_train_pred)
log_metrics(classification_report_train, "train")

2025-04-08 09:46:48,591: classification_report
2025-04-08 09:46:48,700: confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.93      0.86      0.89       331
  appeal_to_authority       0.99      0.72      0.83       227
    appeal_to_emotion       0.94      0.90      0.92       504
        false_dilemma       0.99      0.75      0.85       319
faulty_generalization       0.98      0.74      0.85       449
                 none       0.85      1.00      0.92      1670

             accuracy                           0.90      3500
            macro avg       0.95      0.83      0.88      3500
         weighted avg       0.91      0.90      0.89      3500

[[ 286    1    8    0    1   35]
 [   5  164    8    0    0   50]
 [   8    0  452    1    2   41]
 [   5    0    4  238    2   70]
 [   4    1    8    0  334  102]
 [   1    0    3    1    1 1664]]


In [16]:
logger.info('get_test_metrics')
classification_report_test = get_metrics(y_test, y_test_pred)
log_metrics(classification_report_test, "test")

2025-04-08 09:46:57,071: classification_report
2025-04-08 09:46:57,126: confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.54      0.41      0.46       142
  appeal_to_authority       0.67      0.16      0.26        97
    appeal_to_emotion       0.55      0.43      0.48       216
        false_dilemma       0.81      0.42      0.55       137
faulty_generalization       0.67      0.29      0.41       192
                 none       0.64      0.93      0.76       716

             accuracy                           0.63      1500
            macro avg       0.65      0.44      0.49      1500
         weighted avg       0.64      0.63      0.59      1500

[[ 58   0  28   1   2  53]
 [  6  16  14   0   3  58]
 [ 19   4  93   2   6  92]
 [  6   0   1  57   3  70]
 [ 10   1  14   4  56 107]
 [  9   3  18   6  13 667]]


In [17]:
mlflow.end_run()

🏃 View run svm baseline at: http://127.0.0.1:5001/#/experiments/823412171152425451/runs/17e21bfb91d642a6982283fa0fb62b6c
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/823412171152425451


### Naive Bayes

In [None]:
# Pipeline for TF-IFD and Naive Bayes
# pipeline_bayes = Pipeline([
#     ('tfidf', TfidfVectorizer()),
#     ('nb', MultinomialNB()),
# ])

# # Train the model
# pipeline_bayes.fit(X_train, y_train)

# # Predict on train and test data
# y_train_pred_bayes = pipeline_bayes.predict(X_train)
# y_test_pred_bayes = pipeline_bayes.predict(X_test)

# __compute_and_log_metrics(y_train, y_train_pred_bayes, "train")
# __compute_and_log_metrics(y_test, y_test_pred_bayes, "test")