# Importing modules and input data

In [10]:
# manipulacja danymi
import numpy as np
import pandas as pd

# wizualizacja
import matplotlib.pyplot as plt

# podział danych na zbiory treningowe/walidacyjne/testowe
from sklearn.model_selection import train_test_split, GridSearchCV

# budowa Pipeline
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, PowerTransformer

# redukcja wymiarowości
from sklearn.decomposition import PCA

# model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# ewaluacja
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, f1_score, roc_auc_score
# Load the diabetes data set; the relative path is resolved against the working directory.
diabetes = pd.read_csv(filepath_or_buffer='diabetes.csv')

# Training and test data set

In [11]:
# Columns used as model inputs and the label column to predict.
num_features = ['Pregnancies', 'Age']
target = 'Diabetic'

X = diabetes[num_features]
y = diabetes[target]

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=0,
)

# Pipeline and evaluation

In [12]:
# Numeric feature preparation: impute -> expand -> scale -> reduce -> rescale
num_preparation = Pipeline(steps=[
    # replace missing values with the column mean
    ('fill_missings', SimpleImputer(strategy='mean')),
    # add polynomial and interaction terms up to degree 3
    ('polynomial_features', PolynomialFeatures(degree=3)),
    # put the expanded features on a common scale before PCA
    ('scaler_1', StandardScaler()),
    # keep as many principal components as needed to explain 95% of the variance
    ('pca', PCA(n_components=0.95)),
    # rescale the retained components before they are passed to the model
    ('scaler_2', StandardScaler())
])

# Column transformer: apply the numeric preparation to the numeric columns
data_preparation = ColumnTransformer(transformers=[
    ('numeric_preprocessing', num_preparation, num_features)
])

# probability=True makes SVC fit an internal, shuffled cross-validation to
# calibrate probabilities; random_state pins that shuffling so probability
# outputs are reproducible across runs (the seed matches the train/test split).
model_pipeline_v1 = Pipeline(steps=[('preprocessor', data_preparation),
                                    ('model', SVC(kernel='rbf', probability=True, random_state=0))])
model_pipeline_v1.fit(X_train, y_train)

def metric(model, X_train, X_test, y_train=None, y_test=None):
    """Print the F1 score of a fitted classifier on the training and test sets.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature sets to predict on.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.
        When omitted, the notebook-level ``y_train`` / ``y_test`` variables are
        used, so existing three-argument calls keep working. Pass them
        explicitly whenever the features do not come from that global split.

    Returns
    -------
    None. The scores are only printed.
    """
    # The parameters shadow the notebook globals of the same name, so the
    # fallback has to go through globals() explicitly.
    labels_train = globals()["y_train"] if y_train is None else y_train
    labels_test = globals()["y_test"] if y_test is None else y_test

    f1_score_train = f1_score(labels_train, model.predict(X_train))
    f1_score_test = f1_score(labels_test, model.predict(X_test))
    print(f"F1_score_train: {f1_score_train}, F1_score_test: {f1_score_test}")


# Pass the labels explicitly so the evaluation does not depend on hidden state.
metric(model_pipeline_v1, X_train, X_test, y_train, y_test)

F1_score_train: 0.8279666070363746, F1_score_test: 0.7954467057606072


### The F1 scores are significantly better than those of the logistic regression model. The SVM model performs much better on this data set.