# Basic Model: SVM

### Import necessary packages

In [1]:
import logging
import os
import pickle

import mlflow
import nltk
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import config
from basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)


  from .autonotebook import tqdm as notebook_tqdm


### Setup

In [2]:
MODEL_NAME = "SVM"

# Read the tracking URI through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

# Show a timestamp before every message. force=True replaces any handlers that
# were already attached to the root logger; without it basicConfig() silently
# does nothing when a handler exists, and the timestamp format is never applied
# (the recorded outputs of this notebook show the default "INFO:root:..." form).
logging.basicConfig(format="%(asctime)s: %(message)s", force=True)

logger = logging.getLogger()
# Only show logs that are INFO or more important (e.g. WARNING, ERROR); ignore DEBUG.
logger.setLevel(logging.INFO)

In [3]:
# CSV file that is loaded and preprocessed below (path is relative to this notebook).
DATA_PATH = "../data/data_dropped_duplicates_small.csv"

### Get and process data

In [4]:
# Load the CSV at DATA_PATH through the project helper.
# NOTE(review): the exact preprocessing steps live in basic_functions — see there for details.
df = get_preprocess_data(DATA_PATH)

### Lemmatize text

In [5]:
# Fetch the WordNet corpus (a no-op if it is already up to date locally).
# Presumably required by get_lemmatized_data below — confirm in basic_functions.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aylinhanne/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Preview the first rows of the preprocessed frame before lemmatization.
df.head()

Unnamed: 0,dataset,text,logical_fallacies,source
6159,3,it s of little shock that pakistani government...,faulty_generalization,
8590,3,addiction is never a good thing and maradona w...,faulty_generalization,
10725,3,outrageous this is just another example of the...,none,
9451,3,the us not being accessible to every country i...,none,
17962,8,global temperature have not risen in the past ...,none,


In [7]:
# Replace df with its lemmatized version. Note that this rebinds `df`, so the
# preview shown above no longer reflects the current contents after this cell.
df = get_lemmatized_data(df)

### Train-test split

In [8]:
# Inputs are the text column, targets are the fallacy labels.
X, y = df["text"], df["logical_fallacies"]

# Hold out 30% of the rows for testing; stratifying on y keeps the label
# proportions the same in both splits. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

### SVM

In [9]:
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Make the cell re-runnable: if an earlier execution (or a failed cell further
# down) left a run open, start_run() would raise because a run is already
# active. Close the leftover run before opening a fresh one.
if mlflow.active_run() is not None:
    mlflow.end_run()

mlflow.start_run()
run = mlflow.active_run()
mlflow.set_tag("model_name", MODEL_NAME)
mlflow.set_tag("mlflow.runName", "svm baseline")

In [10]:
# Use a TF-IDF vectorizer to transform the text into numerical features.
# The vocabulary and IDF weights are fitted on the training split only and then
# re-used to transform the test split, so no test information leaks into them.
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_vectorized_test = tfidf_vectorizer.transform(X_test)

In [11]:
# Oversampling only on train dataset: minority classes are randomly duplicated
# until all classes are equally frequent. The test split is left untouched.
ros = RandomOverSampler(random_state=0)
X_resampled_train, y_resampled_train = ros.fit_resample(X_vectorized, y_train)

In [12]:
# Grid search
#
# Oversampling is done *inside* the cross-validated pipeline, so it is re-fitted
# on the training part of every fold. Searching directly on the already
# oversampled matrix would put copies of the same document into both the
# training and the validation part of a fold, which inflates the CV score and
# biases the choice of hyperparameters.
c_values = [0.1, 1, 10, 100]
param_grid = [
    # gamma is ignored by the linear kernel, so it is only searched for rbf.
    {"svc__kernel": ["linear"], "svc__C": c_values},
    {"svc__kernel": ["rbf"], "svc__C": c_values, "svc__gamma": ["scale", "auto"]},
]

svm = SVC()
svm_pipeline = Pipeline([
    # Same seed as the standalone oversampling step on the training data.
    ("oversample", RandomOverSampler(random_state=0)),
    ("svc", svm),
])

grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring="f1_weighted")
grid_search.fit(X_vectorized, y_train)

# Drop the "svc__" step prefix so the reported/logged names are plain SVC parameters.
best_params = {
    name.split("__", 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print(best_params)

# Predict on train and test data.
# The best pipeline has been refitted on the full (oversampled) training split;
# keep only its fitted SVC step so that best_model remains a plain classifier.
best_model = grid_search.best_estimator_.named_steps["svc"]
y_train_pred = best_model.predict(X_resampled_train)
y_test_pred = best_model.predict(X_vectorized_test)

mlflow.log_params(best_params)


{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [13]:
# Save model to pickle file:
model_path = '../models/svm/svm_model.pkl'
# Create the target directory if needed so open() does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)
# NOTE(review): only the classifier is persisted here. Predicting on new text
# also needs the fitted tfidf_vectorizer — confirm it is saved elsewhere.

### Evaluation


In [14]:
def log_metrics(cr, split):
    """Log every value of a classification-report dict to the active MLflow run.

    Args:
        cr: Mapping whose values are either a scalar (e.g. the "accuracy"
            entry) or a nested dict mapping metric name to value (one per
            class / average row).
        split: Prefix naming the data split, e.g. "train" or "test".

    Scalar entries are logged as ``{split}_{key}``, nested entries as
    ``{split}_{key}_{metric}``. All values are sent in one batched
    ``mlflow.log_metrics`` call instead of one request per metric.
    """
    flat_metrics = {}
    for key, value in cr.items():
        if isinstance(value, dict):
            for metric, score in value.items():
                flat_metrics[f"{split}_{key}_{metric}"] = score
        else:
            # Any scalar entry (not only "accuracy") is logged directly.
            flat_metrics[f"{split}_{key}"] = value

    # Skip the tracking call entirely when there is nothing to log.
    if flat_metrics:
        mlflow.log_metrics(flat_metrics)

In [15]:
# Evaluate on the training data and log the result under the "train_" prefix.
# Note: the labels and predictions here are those of the *oversampled* training
# set, so class supports are balanced and do not reflect the raw distribution.
logger.info('get_train_metrics')
classification_report_train = get_metrics(y_resampled_train, y_train_pred)
log_metrics(classification_report_train, "train")

INFO:root:get_train_metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       1.00      1.00      1.00      1715
  appeal_to_authority       1.00      1.00      1.00      1715
    appeal_to_emotion       1.00      1.00      1.00      1715
        false_dilemma       1.00      1.00      1.00      1715
faulty_generalization       1.00      1.00      1.00      1715
                 none       1.00      1.00      1.00      1715

             accuracy                           1.00     10290
            macro avg       1.00      1.00      1.00     10290
         weighted avg       1.00      1.00      1.00     10290

[[1715    0    0    0    0    0]
 [   0 1715    0    0    0    0]
 [   0    0 1715    0    0    0]
 [   0    0    0 1715    0    0]
 [   0    0    0    0 1715    0]
 [   0    0    0    0    0 1715]]


In [16]:
# Evaluate on the untouched (not oversampled) test split and log the result
# under the "test_" prefix.
logger.info('get_test_metrics')
classification_report_test = get_metrics(y_test, y_test_pred)
log_metrics(classification_report_test, "test")

INFO:root:get_test_metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.69      0.44      0.53       140
  appeal_to_authority       0.62      0.16      0.26        98
    appeal_to_emotion       0.63      0.40      0.49       208
        false_dilemma       0.83      0.45      0.59       128
faulty_generalization       0.70      0.20      0.32       191
                 none       0.62      0.95      0.75       735

             accuracy                           0.64      1500
            macro avg       0.68      0.44      0.49      1500
         weighted avg       0.66      0.64      0.59      1500

[[ 61   5  12   3   2  57]
 [  3  16   9   0   1  69]
 [  6   3  84   0   7 108]
 [  2   1   7  58   0  60]
 [  9   1   9   0  39 133]
 [  8   0  12   9   7 699]]


In [17]:
# Close the MLflow run opened earlier so that its tags, parameters and metrics are finalized.
mlflow.end_run()

🏃 View run svm baseline at: http://127.0.0.1:5001/#/experiments/861649519414732270/runs/dc3c512d69834f6f85eabda94035ee40
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/861649519414732270
