# Basic Model: SVM

### Import necessary packages

In [2]:
import logging
import pickle
from pathlib import Path

import mlflow
import nltk
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import config
from basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)


### Setup

In [3]:
MODEL_NAME = "SVM"

# The MLflow tracking URI is read from ../.mlflow_uri. The context manager
# closes the file handle deterministically instead of leaving it to the GC.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

# Show a timestamp before every log message.
# force=True is required: when the root logger already has handlers by the
# time this cell runs, a plain basicConfig() call is a silent no-op and
# messages keep the default "LEVEL:name:message" format.
logging.basicConfig(format="%(asctime)s: %(message)s", force=True)

logger = logging.getLogger()
# Only show records that are INFO or more severe (WARNING, ERROR, ...); DEBUG is dropped.
logger.setLevel(logging.INFO)

In [4]:
# CSV file handed to get_preprocess_data() in the next section; the path is
# relative to the notebook's working directory.
DATA_PATH = "../data/data_dropped_duplicates_small.csv"

### Get and process data

In [5]:
# Load the dataset through the project helper.
# NOTE(review): the preprocessing steps live in basic_functions and are not
# visible in this notebook — check there for what is applied.
df = get_preprocess_data(DATA_PATH)

In [6]:
# Fetch the WordNet corpus into the local nltk_data directory.
# Presumably required by get_lemmatized_data() below — confirm in basic_functions.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aylinhanne/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Lemmatize text

In [7]:
# Preview the first rows of the frame before it is lemmatized in the next cell.
df.head()

Unnamed: 0,dataset,text,logical_fallacies,source
18418,8,yes anything to help people live loinger,ad_hominem,
7891,3,"yes, i can see why some people would find this...",none,
19326,9,a sorry little post office that can't do anyth...,appeal_to_emotion,
19159,9,"american lives that have been lost, families t...",appeal_to_emotion,
8695,3,i kind of understand part of this problem. i h...,faulty_generalization,


In [8]:
# Replace df with the output of the project's lemmatization helper.
# NOTE(review): df is rebound in place, so re-running this cell feeds the
# already-processed frame back into the helper; restart from the loading cell
# to get a clean state.
df = get_lemmatized_data(df)

### Train-test split

In [9]:
# Features: the "text" column; targets: the fallacy label of each row.
X = df["text"]
y = df["logical_fallacies"]

# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

### SVM

In [10]:
# Point the MLflow client at the tracking server and select the experiment.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Open a run and keep a handle to it; the run stays active until
# mlflow.end_run() is called at the end of the notebook.
mlflow.start_run()
run = mlflow.active_run()

# Tag the run so it can be identified in the MLflow UI.
mlflow.set_tag(key="mlflow.runName", value="svm baseline")
mlflow.set_tag(key="model_name", value=MODEL_NAME)

# Hyper-parameters are logged after the grid search, once the best
# combination is known.

In [11]:
# Use a TF-IDF vectorizer to transform the text into numerical features.
# The vocabulary and IDF weights are fitted on the training texts only; the
# test texts are merely transformed with the fitted vectorizer, so no
# information from the test split enters the features.
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_vectorized_test = tfidf_vectorizer.transform(X_test)

In [12]:
# Oversample only the training data; the test split keeps its original
# class distribution. The fixed random_state makes the resampling reproducible.
ros = RandomOverSampler(random_state=0)
X_resampled_train, y_resampled_train = ros.fit_resample(X_vectorized, y_train)

In [13]:
# Grid search over the SVC hyper-parameters.
# `gamma` is a kernel coefficient that the linear kernel does not use, so the
# grid is declared per kernel. A single Cartesian grid would fit every linear
# candidate once per gamma value (16 candidates instead of 12) without
# producing any new model. Consequence: when a linear kernel wins,
# best_params_ contains no 'gamma' entry.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
]

# Probability estimates are not enabled; only hard predictions are used below.
svm = SVC()

# NOTE(review): the CV folds are drawn from the already oversampled training
# data, so duplicated rows can end up in both the fitting and the validation
# part of a fold. The fold scores are therefore likely optimistic; consider
# resampling inside the CV loop (e.g. an imblearn pipeline) — TODO confirm.
grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_resampled_train, y_resampled_train)

best_params = grid_search.best_params_
print(best_params)

# Predict on train and test data with the estimator refitted on the full
# (oversampled) training set using the best parameter combination.
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_resampled_train)
y_test_pred = best_model.predict(X_vectorized_test)

# Record the selected hyper-parameters on the active MLflow run.
mlflow.log_params(best_params)


    


{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [15]:
# Save the fitted model to a pickle file.
# The target directory is created first so the cell does not fail with
# FileNotFoundError on a checkout where ../models/svm does not exist yet.
MODEL_PATH = Path('../models/svm/svm_model.pkl')
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
with MODEL_PATH.open('wb') as f:
    pickle.dump(best_model, f)

### Evaluation


In [16]:
def log_metrics(cr, split):
    """Log every entry of a classification-report dict to the active MLflow run.

    Args:
        cr: Mapping as returned by ``get_metrics``. Each value is either a
            scalar (e.g. the ``"accuracy"`` entry) or a nested mapping of
            statistic name to value for one report row.
        split: Prefix for the metric names, e.g. ``"train"`` or ``"test"``.

    Scalar entries are logged as ``{split}_{key}``; nested entries are logged
    as ``{split}_{key}_{statistic}``.
    """
    for key, value in cr.items():
        if hasattr(value, "items"):
            # Report row (a class or an average): one metric per statistic.
            for metric, score in value.items():
                mlflow.log_metric(f"{split}_{key}_{metric}", score)
        else:
            # Scalar entry. Dispatching on the value's shape instead of the
            # literal key "accuracy" keeps any other scalar entry from
            # crashing the nested loop with a TypeError.
            mlflow.log_metric(f"{split}_{key}", value)

In [18]:
# Train metrics are computed against the oversampled training labels
# (y_resampled_train), i.e. on balanced classes rather than the original
# class distribution.
logger.info('get_train_metrics')
# get_metrics presumably prints the report and confusion matrix and returns
# the report as a dict — confirm in basic_functions.
classification_report_train = get_metrics(y_resampled_train, y_train_pred)
log_metrics(classification_report_train, "train")

INFO:root:get_train_metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       1.00      1.00      1.00      1670
  appeal_to_authority       1.00      1.00      1.00      1670
    appeal_to_emotion       1.00      1.00      1.00      1670
        false_dilemma       1.00      1.00      1.00      1670
faulty_generalization       1.00      1.00      1.00      1670
                 none       1.00      1.00      1.00      1670

             accuracy                           1.00     10020
            macro avg       1.00      1.00      1.00     10020
         weighted avg       1.00      1.00      1.00     10020

[[1670    0    0    0    0    0]
 [   0 1670    0    0    0    0]
 [   0    0 1670    0    0    0]
 [   0    0    0 1670    0    0]
 [   0    0    0    0 1670    0]
 [   0    0    0    0    0 1670]]


In [19]:
# Test metrics are computed on the held-out split, which was neither
# oversampled nor used to fit the vectorizer or the model.
logger.info('get_test_metrics')
classification_report_test = get_metrics(y_test, y_test_pred)
log_metrics(classification_report_test, "test")

INFO:root:get_test_metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.58      0.37      0.45       142
  appeal_to_authority       0.54      0.07      0.13        97
    appeal_to_emotion       0.58      0.42      0.49       216
        false_dilemma       0.83      0.42      0.56       137
faulty_generalization       0.74      0.25      0.37       192
                 none       0.61      0.95      0.74       716

             accuracy                           0.62      1500
            macro avg       0.65      0.41      0.46      1500
         weighted avg       0.64      0.62      0.58      1500

[[ 53   0  23   1   2  63]
 [  3   7  11   0   1  75]
 [ 16   2  90   2   5 101]
 [  3   0   2  58   3  71]
 [ 10   1  12   2  48 119]
 [  6   3  16   7   6 678]]


In [20]:
# End the MLflow run that was opened with mlflow.start_run() in the SVM section.
mlflow.end_run()

🏃 View run svm baseline at: http://127.0.0.1:5001/#/experiments/861649519414732270/runs/98c91e791c054ea1a6821558c75398ad
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/861649519414732270
