# SVM06 - Normalizer + SMOTE + RFE

In [1]:
import numpy as np
import pandas as pd
import sklearn

#Model selection
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

#Pipelines
from imblearn.pipeline import Pipeline as imbpipeline

#Balanceo de clases
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE

#Preprocessing
from sklearn.preprocessing import StandardScaler, Normalizer

#Feature selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, KernelPCA

#Classifiers
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#Others
from collections import Counter
import datetime
import warnings

warnings.filterwarnings('ignore')


# Cargar datos

In [2]:
ds = pd.read_csv('train.csv')
#ds.head()

In [3]:
#ds.info()

In [4]:
X = ds.drop('clase', axis=1)
X.shape

(1387, 59)

In [5]:
y = ds['clase']
#y.value_counts()

In [6]:
Counter(y)

Counter({'High': 231, 'Medium': 334, 'Low': 822})

# SVM

In [7]:
clf = LinearSVC()

Preprocesamiento

In [8]:
#prep1 = StandardScaler() 
prep2 = Normalizer() 

Balanceo de clases

In [9]:
smt1 = SMOTE(sampling_strategy='auto',n_jobs=-1,random_state=42)
smt2 = BorderlineSMOTE(sampling_strategy='auto',n_jobs=-1,random_state=42)

Feature selection

In [10]:
#fs1 = SelectKBest()
fs2 = RFE(clf)
#fs3 = SelectFromModel(clf)

Pipelines

In [11]:
#smt + clf
pipeline01 = imbpipeline([('prep', prep2), ('smote', smt1), ('fs', fs2), ('classifier', clf)])
pipeline02 = imbpipeline([('prep', prep2), ('smote', smt2), ('fs', fs2), ('classifier', clf)])


Parameters

In [12]:

#SMOTE
params01_01 = {}
params01_01['smote__k_neighbors'] =  [3]#, 5, 7]
params01_01['fs__n_features_to_select'] = range(55, 59) 
params01_01['classifier_c'] =  [0.001]#, 0.01, 0.1, 1.0, 10, 100, 1000]
params01_01['classifier_gamma'] =  [1]#, 0.1, 0.01, 0.001]
params01_01['classifier_kernel'] =  ['rbf']#, 'poly', 'sigmoid', 'linear']
params01_01['classifier'] = [clf]

params01_02 = {}
params01_02['smote__k_neighbors'] =  [9, 11, 13]
params01_02['fs__n_features_to_select'] = range(55, 59) 
params01_02['classifier_c'] =  [0.001]#, 0.01, 0.1, 1.0, 10, 100, 1000]
params01_02['classifier_gamma'] =  [1]#, 0.1, 0.01, 0.001]
params01_02['classifier_kernel'] =  ['rbf']#, 'poly', 'sigmoid', 'linear']
params01_02['classifier'] = [clf]

#BordelineSMOTE
params02_01 = {}
params02_01['smote__k_neighbors'] =  [3, 5, 7]
params02_01['smote__m_neighbors'] = [3, 5, 7]
params02_01['smote__kind'] = ['borderline-1','borderline-2']
params02_01['fs__n_features_to_select'] = range(55, 59) 
params02_01['classifier_c'] =  [0.001]#, 0.01, 0.1, 1.0, 10, 100, 1000]
params02_01['classifier_gamma'] =  [1]#, 0.1, 0.01, 0.001]
params02_01['classifier_kernel'] =  ['rbf']#, 'poly', 'sigmoid', 'linear']
params02_01['classifier'] = [clf]

params02_02 = {}
params02_02['smote__k_neighbors'] =  [9, 11, 13]
params02_02['smote__m_neighbors'] = [3, 5, 7]
params02_02['smote__kind'] = ['borderline-1','borderline-2']
params02_02['fs__n_features_to_select'] = range(55, 59) 
params02_02['classifier_c'] =  [0.001]#, 0.01, 0.1, 1.0, 10, 100, 1000]
params02_02['classifier_gamma'] =  [1]#, 0.1, 0.01, 0.001]
params02_02['classifier_kernel'] =  ['rbf']#, 'poly', 'sigmoid', 'linear']
params02_02['classifier'] = [clf]

params02_03 = {}
params02_03['smote__k_neighbors'] =  [3, 5, 7]
params02_03['smote__m_neighbors'] = [9, 10, 11]
params02_03['smote__kind'] = ['borderline-1','borderline-2']
params02_03['fs__n_features_to_select'] = range(55, 59) 
params02_03['classifier_c'] =  [0.001]#, 0.01, 0.1, 1.0, 10, 100, 1000]
params02_03['classifier_gamma'] =  [1]#, 0.1, 0.01, 0.001]
params02_03['classifier_kernel'] =  ['rbf']#, 'poly', 'sigmoid', 'linear']
params02_03['classifier'] = [clf]

params02_04 = {}
params02_04['smote__k_neighbors'] =  [9, 11, 13]
params02_04['smote__m_neighbors'] = [9, 10, 11]
params02_04['smote__kind'] = ['borderline-1','borderline-2']
params02_04['fs__n_features_to_select'] = range(55, 59) 
params02_04['classifier_c'] =  [0.001]#, 0.01, 0.1, 1.0, 10, 100, 1000]
params02_04['classifier_gamma'] =  [1]#, 0.1, 0.01, 0.001]
params02_04['classifier_kernel'] =  ['rbf']#, 'poly', 'sigmoid', 'linear']
params02_04['classifier'] = [clf]




In [13]:
#Aqui es donde hay que cambiarle manualmente los datos 
# pipeline01, luego pipeline02
#al igual en params y file2

pipeline = pipeline01
params = params01_01
file1='SVM06-test-'
file2='params01_01.csv'

CV

In [14]:
cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10,random_state=42)
scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# GridSearchCV

In [15]:
ct = datetime.datetime.now()
print("Ini: ", ct, " ")
grid=GridSearchCV(pipeline, params, cv=cv, scoring=scoring, n_jobs=-1, refit=False, verbose=3)
grid.fit(X, y)
ct = datetime.datetime.now()
print("Fin: ", ct, " ")

Ini:  2024-07-01 15:55:44.669719  
Fitting 100 folds for each of 1 candidates, totalling 100 fits


ValueError: Invalid parameter 'classifier_c' for estimator Pipeline(steps=[('prep', Normalizer()),
                ('smote', SMOTE(n_jobs=-1, random_state=42)),
                ('fs', RFE(estimator=LinearSVC())),
                ('classifier', LinearSVC())]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [None]:
results=pd.DataFrame(grid.cv_results_)
results.shape

In [None]:
results.to_csv(file1+file2, index=False, encoding='utf-8-sig')

In [None]:
# Este es el resultado que hay que anotar en excel
results['mean_test_f1_weighted'].max()

In [None]:
print("Finished!")