# Basic Model: SVM

### Import necessary packages

In [16]:
import logging
import os
import pickle

import mlflow
import nltk
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import config

from basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)


### Setup

In [17]:
MODEL_NAME = "SVM"

# Read the MLflow tracking URI from the local dot-file. The context manager
# closes the file handle immediately instead of leaving it to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

# Show a timestamp before every log message.
# basicConfig() silently does nothing when the root logger already has handlers
# (e.g. installed by an imported module); force=True replaces them so the
# format below actually takes effect.
logging.basicConfig(format="%(asctime)s: %(message)s", force=True)

logger = logging.getLogger()
logger.setLevel(logging.INFO)  # emit INFO and above (WARNING, ERROR, ...); DEBUG is suppressed

In [18]:
# Location of the input CSV, relative to the notebook's directory.
DATA_PATH = "../data/data_dropped_duplicates_small.csv"

### Get and process data

In [19]:
# Load the CSV at DATA_PATH through the project helper; the preprocessing
# steps themselves live in basic_functions.get_preprocess_data.
df = get_preprocess_data(DATA_PATH)

### Lemmatize text

In [20]:
# Fetch the WordNet corpus (skipped by NLTK when already up to date).
# NOTE(review): presumably required by get_lemmatized_data — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aylinhanne/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
# Preview the first rows of the preprocessed frame (before lemmatization).
df.head()

Unnamed: 0,dataset,text,logical_fallacies,source
6159,3,it s of little shock that pakistani government...,faulty_generalization,
8590,3,addiction is never a good thing and maradona w...,faulty_generalization,
10725,3,outrageous this is just another example of the...,none,
9451,3,the us not being accessible to every country i...,none,
17962,8,global temperature have not risen in the past ...,none,


In [22]:
# Lemmatize via the project helper. Note that `df` is rebound here, so
# re-running this cell feeds already-lemmatized data back into the helper.
df = get_lemmatized_data(df)

### Train-test split

In [23]:
# Features are the text column, targets are the fallacy labels.
X, y = df["text"], df["logical_fallacies"]

# Hold out 30% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

### SVM

In [24]:
# Point the MLflow client at the tracking server and experiment, then open a run.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.start_run()
run = mlflow.active_run()

# Tag the run; the reserved "mlflow.runName" tag sets the name shown in the UI.
mlflow.set_tags({
    "model_name": MODEL_NAME,
    "mlflow.runName": "svm baseline",
})

In [25]:
# Use a TF-IDF vectorizer to transform the text into numerical features.
# The vectorizer is fit on the training split only; the test split is
# transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_vectorized_test = tfidf_vectorizer.transform(X_test)

In [26]:
# Oversample the training data only; the test split is left untouched.
# random_state=0 makes the resampling reproducible.
ros = RandomOverSampler(random_state=0)
X_resampled_train, y_resampled_train = ros.fit_resample(X_vectorized, y_train)

In [27]:
# Grid search over the SVM hyper-parameters.
#
# The oversampler runs *inside* the cross-validated pipeline, so each CV split
# is resampled on its training folds only. Fitting GridSearchCV directly on a
# pre-oversampled matrix puts copies of the same minority-class rows into both
# the training and the validation fold, which inflates the CV score and biases
# the parameter choice.
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto'],
    'svm__kernel': ['linear', 'rbf']
}

svm = SVC()
svm_pipeline = Pipeline([
    ('oversample', RandomOverSampler(random_state=0)),
    ('svm', svm)
])

grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring="f1_weighted")
grid_search.fit(X_vectorized, y_train)

# Strip the pipeline step prefix so the printed/logged names stay 'C', 'gamma', 'kernel'.
best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print(best_params)

# GridSearchCV refits the winning pipeline on all of (X_vectorized, y_train),
# which oversamples with the same seed as `ros`; the fitted SVC step is
# therefore trained on the oversampled training set.
best_model = grid_search.best_estimator_.named_steps['svm']

# Predict on train and test data
y_train_pred = best_model.predict(X_resampled_train)
y_test_pred = best_model.predict(X_vectorized_test)

mlflow.log_params(best_params)


{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}




MlflowException: API request to http://127.0.0.1:5001/api/2.0/mlflow/runs/log-batch failed with exception HTTPConnectionPool(host='127.0.0.1', port=5001): Max retries exceeded with url: /api/2.0/mlflow/runs/log-batch (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x2a7264090>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [56]:
# Save the fitted model to a pickle file.
# NOTE(review): only the classifier is persisted here; the fitted
# `tfidf_vectorizer` is also needed to transform raw text before predicting —
# confirm it is saved elsewhere.
model_path = '../models/svm/svm_model.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure
# the target folder exists (no-op when it already does).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

### Evaluation


In [1]:
def log_metrics(cr, split):
    """Log every entry of a classification-report dict to the active MLflow run.

    Args:
        cr: Mapping of report entries. Scalar entries (e.g. ``"accuracy"``)
            are logged as ``"{split}_{key}"``; dict entries are flattened and
            each inner value is logged as ``"{split}_{key}_{metric}"``.
            NOTE(review): presumably the dict returned by ``get_metrics`` has
            the layout of sklearn's ``classification_report(output_dict=True)``
            — confirm.
        split: Prefix identifying the data split, e.g. ``"train"`` or ``"test"``.
    """
    flat_metrics = {}
    for key, value in cr.items():
        if isinstance(value, dict):
            # Per-class / averaged rows: one metric per inner entry.
            for metric, score in value.items():
                flat_metrics[f"{split}_{key}_{metric}"] = score
        else:
            # Scalar rows such as "accuracy".
            flat_metrics[f"{split}_{key}"] = value

    # Send everything in one batch instead of one request per metric.
    if flat_metrics:
        mlflow.log_metrics(flat_metrics)

In [None]:
# Compute metrics on the oversampled training data and log them to MLflow
# under the "train" prefix.
logger.info('get_train_metrics')
classification_report_train = get_metrics(y_resampled_train, y_train_pred)
log_metrics(classification_report_train, "train")

NameError: name 'logger' is not defined

In [61]:
# Compute metrics on the held-out test split and log them to MLflow under
# the "test" prefix.
logger.info('get_test_metrics')
classification_report_test = get_metrics(y_test, y_test_pred)
log_metrics(classification_report_test, "test")

INFO:root:get_test_metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.46      0.56      0.51       142
  appeal_to_authority       0.32      0.44      0.37        97
    appeal_to_emotion       0.41      0.55      0.47       216
        false_dilemma       0.63      0.58      0.61       137
faulty_generalization       0.46      0.45      0.46       192
                 none       0.75      0.62      0.68       716

             accuracy                           0.57      1500
            macro avg       0.51      0.53      0.51      1500
         weighted avg       0.60      0.57      0.58      1500

[[ 79   5  35   3   8  12]
 [  5  43  15   2   6  26]
 [ 31  17 118   5  19  26]
 [  3   1  10  80   4  39]
 [ 13  10  34   5  86  44]
 [ 39  59  79  31  62 446]]


In [None]:
# Close the MLflow run that was opened before training.
mlflow.end_run()

🏃 View run svm baseline at: http://127.0.0.1:5001/#/experiments/861649519414732270/runs/73743cfed4d148ff8f9d654664d4d131
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/861649519414732270
