# Importing modules and input data

In [14]:
# manipulacja danymi
import numpy as np
import pandas as pd

# wizualizacja
import matplotlib.pyplot as plt

# podział danych na zbiory treningowe/walidacyjne/testowe
from sklearn.model_selection import train_test_split, GridSearchCV

# budowa Pipeline
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, PowerTransformer

# redukcja wymiarowości
from sklearn.decomposition import PCA

# model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# ewaluacja
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, f1_score, roc_auc_score
diabetes = pd.read_csv('diabetes.csv')

# Training, validation and test data set

In [15]:
num_features = ['Pregnancies','Age']
target = 'Diabetic'
X, y = diabetes[num_features], diabetes[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0) 

# Pipeline and evaluation

In [16]:
# przygotowanie wartości numerycznych
num_preparation = Pipeline(steps=[
    ('fill_missings', SimpleImputer(strategy='mean')),
    ('polynomial_features', PolynomialFeatures(degree=3)),
    ('scaler_1', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('scaler_2', StandardScaler())
])

# transformer = wartości numeryczne 
data_preparation = ColumnTransformer(transformers=[
    ('numeric_preprocessing', num_preparation, num_features)
    
])

model_pipeline_v1 = Pipeline(steps=[('preprocessor', data_preparation),
                                    ('model', SVC(kernel='rbf',probability=True))])
model_pipeline_v1.fit(X_train, y_train)


params = {
    'preprocessor__numeric_preprocessing__fill_missings__strategy': ['mean', 'median'],
    'preprocessor__numeric_preprocessing__polynomial_features__degree': [1,2],
    'preprocessor__numeric_preprocessing__pca__n_components': [0.85,0.90],
    'model__C': np.logspace(-2, 2, 10)
}

grid_search = GridSearchCV(model_pipeline_v1, params, cv=10, n_jobs=-1, verbose=10, scoring='f1_macro')
grid_search.fit(X_val, y_val)
print('Wybrane hiperparametry: ', grid_search.best_params_)
model_v3 = grid_search.best_estimator_
def metric(model,X_test):
    predictions_test = model.predict(X_test)
    f1_score_test=f1_score(y_test, predictions_test)
    print(f"F1_score_test: {f1_score_test}")
metric(model_v3, X_test)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Wybrane hiperparametry:  {'model__C': 100.0, 'preprocessor__numeric_preprocessing__fill_missings__strategy': 'mean', 'preprocessor__numeric_preprocessing__pca__n_components': 0.85, 'preprocessor__numeric_preprocessing__polynomial_features__degree': 1}
F1_score_test: 0.7928818586258033


### The grid search results in a similar quality.