In [71]:
from tqdm.notebook import tqdm 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import ast
import csv


from sklearn import preprocessing

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

pd.options.mode.chained_assignment = None
%matplotlib inline

In [62]:
def replace_chars(vector):
    to_replace = "[] \n"
    for char in to_replace:
        vector = vector.replace(char, " ")
    return vector.split()

In [63]:
def convert_list_to_series(vector):
    return pd.to_numeric(pd.Series(vec))

## Data preporation

In [2]:
data = pd.read_csv("labeled_dataset.csv")

In [69]:
data.head()

Unnamed: 0.1,Unnamed: 0,_id,abstract,vector,labels
0,2500640,573696806e3b12023e58bded,consider distributed dson architecture interac...,[[ 1.92877650e-02]\n [ 2.52012283e-01]\n [-4.0...,4
1,1939546,5550432e45ce0a409eb46401,web applications prevalent platforms informati...,[[ 0.24369365]\n [ 0.5338475 ]\n [-0.06631115]...,1
2,2451165,573695d36e3b12023e4eb9ff,analysis fuzzyoverlapping community structure ...,[[-1.80163700e-02]\n [ 2.04452261e-01]\n [ 3.2...,2
3,919218,53e9ab78b7602d970351a4a3,approach clustering timeseries approach discov...,[[-0.01064459]\n [ 0.13473687]\n [ 0.1160049 ]...,2
4,254711,53e99cdfb7602d970259505a,symbolic checking successful technique verifyi...,[[-0.03889668]\n [ 0.30423757]\n [-0.02531016]...,4


In [61]:
vectors = data["vector"]
VECTOR_SIZE = len(replace_chars(vectors[0]))

In [64]:
X = pd.DataFrame(columns=list(range(VECTOR_SIZE)))
for i, vec in enumerate(tqdm(vectors)):
    vec = replace_chars(vec)
    X.loc[i] = convert_list_to_series(vec)

Y = data["labels"]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [67]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.019288,0.252012,-0.004028,0.10544,0.192244,-0.420156,0.122775,0.804742,0.239669,-0.107765,...,0.123394,0.411098,0.196681,0.108734,0.316778,0.402116,0.063839,-0.3591,0.336358,0.047863
1,0.243694,0.533848,-0.066311,-0.092245,0.501947,-0.039216,-0.014495,0.658079,0.294956,-0.007597,...,0.264527,0.214714,0.232022,-0.030819,-0.080473,0.314592,-0.288054,-0.491084,-0.026807,0.5798
2,-0.018016,0.204452,0.032094,0.1587,0.195748,-0.519167,0.288753,0.951722,0.426394,-0.083423,...,0.087602,0.503279,0.328429,0.087483,0.448525,0.503638,0.115408,-0.470179,0.515303,0.060216
3,-0.010645,0.134737,0.116005,0.262162,0.061192,-0.53051,0.33068,0.833992,0.354102,-0.030746,...,0.018479,0.386978,0.317865,0.076633,0.39436,0.452293,0.167613,-0.546751,0.592078,0.02391
4,-0.038897,0.304238,-0.02531,0.148247,0.107871,-0.448753,0.193785,0.784141,0.251807,-0.115472,...,0.086166,0.361546,0.202321,0.080887,0.364324,0.494056,0.052042,-0.453458,0.381452,0.056816


In [68]:
Y.head()

0    4
1    1
2    2
3    2
4    4
Name: labels, dtype: int64

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

## Model training

In [88]:
def fit_predict_CV(X_train, X_test, y_train, y_test, model_type, params_grid=dict()):
    model = model_type()
    
    model_cv = GridSearchCV(model, params_grid, cv=KFold(), refit=True, verbose=1)
    model_cv.fit(X_train, y_train) 
    
    best_model = model_cv.best_estimator_

    prediction_train = best_model.predict(X_train)
    prediction_test = best_model.predict(X_test)
    acc_train = accuracy_score(y_train, prediction_train)
    acc_test = accuracy_score(y_test, prediction_test)
    
    
    print("Best hyperparameters: ", model_cv.best_params_)
    print("Train accuracy:       ", round(acc_train, 5))
    print("Test accuracy:        ", round(acc_test, 5))

    
    return best_model

In [89]:
grid = {
    "class_weight": ["balanced"],
    "kernel": ["linear", "poly", "rbf"],
    "degree": np.arange(1, 10),
    "C": np.linspace(1, 10, 10)
}
svm = fit_predict_CV(X_train, X_test, y_train, y_test, SVC, grid)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best hyperparameters:  {'C': 10.0, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}
Train accuracy:        0.99675
Test accuracy:         0.987


In [92]:
import pickle
with open('svm_classifier.pkl', 'wb') as fid:
    pickle.dump(svm, fid)    

In [None]:
    scikit-learn==1.1.0
    pickleshare==0.7.5           
    numpy==1.23.4                   
    matplotlib==3.5.2            
    pandas==1.4.4            
tqdm==4.64.1           