In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, roc_curve,auc,confusion_matrix, precision_score
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the headerless 'magic04.data' file and name its eleven columns
# (ten numeric features followed by the class label).
df = pd.read_csv('magic04.data', header=None)
df.columns = ['Length' , 'Width', 'Size', 'Conc', 'Conc1', 'Asym', 'M3Long', 'M3Trans', 'Alpha', 'Dist', 'Class']

# NOTE(review): the slice 0:9 keeps only the first nine features, so 'Dist'
# (column 9) is never fed to any model -- confirm this is intentional.
x = df.iloc[:, :9].values
# Column 10 is 'Class'; encode its labels as integers.
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, 10].values)

# Scorers that treat the class encoded as 0 as the positive class.
f1_scorer = make_scorer(f1_score, pos_label=0)
precision_scorer = make_scorer(precision_score, pos_label=0)

# Stratified 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=1)
df

In [None]:
# Pairwise scatter matrix of every column, coloured by class.
# Fix: seaborn renamed the `size` argument of pairplot to `height`; the old
# name is deprecated and only triggers a warning on current releases.
sns.pairplot(df, height=2.5, hue="Class")
plt.tight_layout()
plt.show()

## Nessuna Riduzione Dimensionale

### Random Forest

In [None]:
# Random forest on the raw features (single-step pipeline, no scaling).
pipe_forest = make_pipeline(RandomForestClassifier(random_state=1))
n_estimators = [400, 800, 1000]
max_depth = [10, 100, 200]
# One sub-grid per split criterion; each crosses every tree count with every
# depth limit.
param_grid_forest = [
    {
        'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__criterion': [criterion],
    }
    for criterion in ('gini', 'entropy')
]

OTTIMIZZAZIONE F1

In [None]:
# 10-fold grid search over pipe_forest / param_grid_forest, ranked by
# f1_scorer (F1 with label 0 as the positive class). With the default refit,
# best_estimator_ is retrained on all of x_train.
gs_forest_f1 = GridSearchCV(pipe_forest,
                            param_grid_forest,
                            scoring=f1_scorer,
                            cv=10,
                            n_jobs=-1,
                            verbose=10)
gs_forest_f1.fit(x_train, y_train)
best_gs_forest_f1 = gs_forest_f1.best_estimator_
print('Migliore F1_score: ', gs_forest_f1.best_score_)
print('Parametri: ', gs_forest_f1.best_params_)

In [None]:
# Per-fold ROC curves for the F1-tuned random forest, followed by its
# confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_forest_f1.fit(x_train[train], y_train[train])
    probas = best_gs_forest_f1.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_forest_f1.fit(x_train, y_train)
y_pred = best_gs_forest_f1.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

OTTIMIZZAZIONE PRECISION

In [None]:
# 10-fold grid search over pipe_forest / param_grid_forest, ranked by
# precision_scorer (precision with label 0 as the positive class). With the
# default refit, best_estimator_ is retrained on all of x_train.
gs_forest_precision = GridSearchCV(pipe_forest,
                                   param_grid_forest,
                                   scoring=precision_scorer,
                                   cv=10,
                                   n_jobs=-1,
                                   verbose=10)
gs_forest_precision.fit(x_train, y_train)
best_gs_forest_precision = gs_forest_precision.best_estimator_
print('Migliore precision_score: ', gs_forest_precision.best_score_)
print('Parametri: ', gs_forest_precision.best_params_)

In [None]:
# Per-fold ROC curves for the precision-tuned random forest, followed by its
# confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_forest_precision.fit(x_train[train], y_train[train])
    probas = best_gs_forest_precision.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_forest_precision.fit(x_train, y_train)
y_pred = best_gs_forest_precision.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

### SVM

In [None]:
# Candidate values for the RBF-kernel SVM: C (regularisation) and gamma
# (kernel width).
c_range = [10.0, 100.0]
g_range = [0.01, 1.0, 10.0, 100.0]
# Fix: gamma was searched over c_range, leaving g_range defined but unused;
# gamma now uses its own range.
param_grid_svm = [{'svc__C': c_range, 'svc__gamma': g_range, 'svc__kernel': ['rbf']}]

# Standardise the features, then fit an SVC with probability estimates
# enabled (the ROC cells call predict_proba).
pipe_svm = make_pipeline(StandardScaler(),
                         SVC(random_state=1,
                             probability=True))

OTTIMIZZAZIONE F1

In [None]:
# 10-fold grid search over pipe_svm / param_grid_svm, ranked by f1_scorer
# (F1 with label 0 as the positive class). refit=True retrains the best
# candidate on all of x_train.
gs_svm_f1 = GridSearchCV(pipe_svm,
                         param_grid_svm,
                         scoring=f1_scorer,
                         refit=True,
                         cv=10,
                         n_jobs=-1,
                         verbose=10)
gs_svm_f1.fit(x_train, y_train)
best_gs_svm_f1 = gs_svm_f1.best_estimator_
print('Migliore F1_score: ', gs_svm_f1.best_score_)
print('Parametri: ', gs_svm_f1.best_params_)

In [None]:
# Per-fold ROC curves for the F1-tuned SVM, followed by its confusion matrix
# on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_svm_f1.fit(x_train[train], y_train[train])
    probas = best_gs_svm_f1.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_svm_f1.fit(x_train, y_train)
y_pred = best_gs_svm_f1.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

OTTIMIZZAZIONE PRECISION

In [None]:
# 10-fold grid search over pipe_svm / param_grid_svm, ranked by
# precision_scorer (precision with label 0 as the positive class).
# refit=True retrains the best candidate on all of x_train.
gs_svm_precision = GridSearchCV(pipe_svm,
                                param_grid_svm,
                                scoring=precision_scorer,
                                refit=True,
                                cv=10,
                                n_jobs=-1,
                                verbose=10)
gs_svm_precision.fit(x_train, y_train)
best_gs_svm_precision = gs_svm_precision.best_estimator_
print('Migliore precision_score: ', gs_svm_precision.best_score_)
print('Parametri: ', gs_svm_precision.best_params_)

In [None]:
# Per-fold ROC curves for the precision-tuned SVM, followed by its confusion
# matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_svm_precision.fit(x_train[train], y_train[train])
    probas = best_gs_svm_precision.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_svm_precision.fit(x_train, y_train)
y_pred = best_gs_svm_precision.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

### Logistic Regression

In [None]:
# Standardise the features, then fit a logistic regression with default
# settings.
pipe_lr = make_pipeline(StandardScaler(), LogisticRegression())

# Grid: inverse regularisation strength C crossed with the solver's
# iteration cap.
c_values = [0.1, 1, 10, 100]
max_iter = [400, 800, 1000]
param_grid_lr = [dict(logisticregression__C=c_values,
                      logisticregression__max_iter=max_iter)]

OTTIMIZZAZIONE F1

In [None]:
# 10-fold grid search over pipe_lr / param_grid_lr, ranked by f1_scorer
# (F1 with label 0 as the positive class). With the default refit,
# best_estimator_ is retrained on all of x_train.
gs_lr = GridSearchCV(pipe_lr,
                     param_grid_lr,
                     scoring=f1_scorer,
                     cv=10,
                     n_jobs=-1,
                     verbose=10)
gs_lr.fit(x_train, y_train)
best_gs_lr_f1 = gs_lr.best_estimator_
print('Migliore F1_score: ', gs_lr.best_score_)
print('Parametri: ', gs_lr.best_params_)

In [None]:
# Per-fold ROC curves for the F1-tuned logistic regression, followed by its
# confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_lr_f1.fit(x_train[train], y_train[train])
    probas = best_gs_lr_f1.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_lr_f1.fit(x_train, y_train)
y_pred = best_gs_lr_f1.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

OTTIMIZZAZIONE PRECISION

In [None]:
# 10-fold grid search over pipe_lr / param_grid_lr, ranked by
# precision_scorer (precision with label 0 as the positive class). With the
# default refit, best_estimator_ is retrained on all of x_train.
gs_lr = GridSearchCV(estimator=pipe_lr,
                        param_grid=param_grid_lr,
                        scoring=precision_scorer,
                        n_jobs=-1,
                        cv=10,
                        verbose=10)
gs_lr=gs_lr.fit(x_train, y_train)
# Fix: the label said "F1_score" although this search is scored on precision.
print('Migliore precision_score: ', gs_lr.best_score_)
print('Parametri: ', gs_lr.best_params_)
best_gs_lr_precision=gs_lr.best_estimator_

In [None]:
# Per-fold ROC curves for the precision-tuned logistic regression, followed
# by its confusion matrix on the hold-out test set.
# Fixes:
#  * the original loop ignored the fold indices (it refitted on all of x_train
#    and scored x_test on every pass), so the ten "fold" curves were identical;
#  * the confusion matrix called the undefined name `precision` (NameError)
#    instead of best_gs_lr_precision.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_lr_precision.fit(x_train[train], y_train[train])
    probas = best_gs_lr_precision.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_lr_precision.fit(x_train, y_train)
y_pred = best_gs_lr_precision.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

## Feature Extraction

### PCA

In [None]:
# Explained-variance profile of a full PCA fitted on the training features.
# Fix: the estimator was bound to `ca` but used as `pca` (NameError).
# NOTE(review): PCA is fitted here on the unscaled x_train, whereas the
# pipelines below standardise before PCA -- confirm which profile is wanted.
pca = PCA(n_components=None)
pca.fit(x_train)
varExps = pca.explained_variance_ratio_
cumVarExps = np.cumsum(varExps)

# Derive the x positions from the number of components instead of a
# hard-coded 9, so the plot follows the feature count.
component_idx = range(len(varExps))
plt.bar(component_idx, varExps, alpha=0.5, align='center', label='individual explained variance')
plt.step(component_idx, cumVarExps, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

#### Random Forest

In [None]:
# Standardise -> PCA -> random forest.
pipe_forest = make_pipeline(StandardScaler(),
                            PCA(),
                            RandomForestClassifier(random_state=1))
n_estimators = [100, 500]
max_depth = [10, 100, 200]
n_comps = [6, 7]
# One sub-grid per split criterion; each crosses the PCA component counts
# with every tree count and depth limit.
param_grid_forest = [
    {
        'pca__n_components': n_comps,
        'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__criterion': [criterion],
    }
    for criterion in ('gini', 'entropy')
]

OTTIMIZZAZIONE F1

In [None]:
# 10-fold grid search over the PCA + random-forest pipeline (pipe_forest /
# param_grid_forest), ranked by f1_scorer (F1 with label 0 as the positive
# class). With the default refit, best_estimator_ is retrained on all of
# x_train.
gs_forest_f1_pca = GridSearchCV(pipe_forest,
                                param_grid_forest,
                                scoring=f1_scorer,
                                cv=10,
                                n_jobs=-1,
                                verbose=10)
gs_forest_f1_pca.fit(x_train, y_train)
best_gs_forest_f1_pca = gs_forest_f1_pca.best_estimator_
print('Migliore F1_score: ', gs_forest_f1_pca.best_score_)
print('Parametri: ', gs_forest_f1_pca.best_params_)

In [None]:
# Per-fold ROC curves for the F1-tuned PCA + random-forest pipeline, followed
# by its confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_forest_f1_pca.fit(x_train[train], y_train[train])
    probas = best_gs_forest_f1_pca.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_forest_f1_pca.fit(x_train, y_train)
y_pred = best_gs_forest_f1_pca.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

OTTIMIZZAZIONE PRECISION

In [None]:
# 10-fold grid search over the PCA + random-forest pipeline (pipe_forest /
# param_grid_forest), ranked by precision_scorer (precision with label 0 as
# the positive class). With the default refit, best_estimator_ is retrained
# on all of x_train.
gs_forest_precision_pca = GridSearchCV(pipe_forest,
                                       param_grid_forest,
                                       scoring=precision_scorer,
                                       cv=10,
                                       n_jobs=-1,
                                       verbose=10)
gs_forest_precision_pca.fit(x_train, y_train)
best_gs_forest_precision_pca = gs_forest_precision_pca.best_estimator_
print('Migliore precision_score: ', gs_forest_precision_pca.best_score_)
print('Parametri: ', gs_forest_precision_pca.best_params_)

In [None]:
# Per-fold ROC curves for the precision-tuned PCA + random-forest pipeline,
# followed by its confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_forest_precision_pca.fit(x_train[train], y_train[train])
    probas = best_gs_forest_precision_pca.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)

plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()
#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_forest_precision_pca.fit(x_train, y_train)
y_pred = best_gs_forest_precision_pca.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

#### SVM

In [None]:
# Standardise -> PCA -> SVC with probability estimates enabled (the ROC
# cells call predict_proba).
pipe_svm = make_pipeline(StandardScaler(),
                         PCA(),
                         SVC(random_state=1, probability=True))

degree = [4, 5, 6, 7]
c_range = [0.01, 1.0, 10.0, 100.0]
n_comps = [6, 7, 8]
# Two sub-grids: an RBF kernel searched over C and gamma (gamma reuses the
# values in c_range), and a polynomial kernel searched over C and degree.
param_grid_svm = [
    dict(pca__n_components=n_comps, svc__C=c_range, svc__gamma=c_range, svc__kernel=['rbf']),
    dict(pca__n_components=n_comps, svc__C=c_range, svc__degree=degree, svc__kernel=['poly']),
]

OTTIMIZZAZIONE F1

In [None]:
# 10-fold grid search over the PCA + SVM pipeline (pipe_svm /
# param_grid_svm), ranked by f1_scorer (F1 with label 0 as the positive
# class). refit=True retrains the best candidate on all of x_train.
# Note: this rebinds gs_svm_f1 / best_gs_svm_f1 from the no-PCA section.
gs_svm_f1 = GridSearchCV(pipe_svm,
                         param_grid_svm,
                         scoring=f1_scorer,
                         refit=True,
                         cv=10,
                         n_jobs=-1,
                         verbose=10)
gs_svm_f1.fit(x_train, y_train)
best_gs_svm_f1 = gs_svm_f1.best_estimator_
print('Migliore F1_score: ', gs_svm_f1.best_score_)
print('Parametri: ', gs_svm_f1.best_params_)

In [None]:
# Per-fold ROC curves for the F1-tuned PCA + SVM pipeline, followed by its
# confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_svm_f1.fit(x_train[train], y_train[train])
    probas = best_gs_svm_f1.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()
#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_svm_f1.fit(x_train, y_train)
y_pred = best_gs_svm_f1.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

OTTIMIZZAZIONE PRECISION

In [None]:
# 10-fold grid search over the PCA + SVM pipeline (pipe_svm /
# param_grid_svm), ranked by precision_scorer (precision with label 0 as the
# positive class). refit=True retrains the best candidate on all of x_train.
# Note: this rebinds gs_svm_precision / best_gs_svm_precision from the
# no-PCA section.
gs_svm_precision = GridSearchCV(pipe_svm,
                                param_grid_svm,
                                scoring=precision_scorer,
                                refit=True,
                                cv=10,
                                n_jobs=-1,
                                verbose=10)
gs_svm_precision.fit(x_train, y_train)
best_gs_svm_precision = gs_svm_precision.best_estimator_
print('Migliore precision_score: ', gs_svm_precision.best_score_)
print('Parametri: ', gs_svm_precision.best_params_)

In [None]:
# Per-fold ROC curves for the precision-tuned PCA + SVM pipeline, followed by
# its confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_svm_precision.fit(x_train[train], y_train[train])
    probas = best_gs_svm_precision.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_svm_precision.fit(x_train, y_train)
y_pred = best_gs_svm_precision.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

### Kernel PCA

#### Random Forest

In [None]:
# Standardise -> KernelPCA -> random forest.
# NOTE(review): KernelPCA() is left at its default kernel, which the
# scikit-learn docs give as 'linear'; if so this section repeats ordinary PCA.
# Confirm whether a non-linear kernel (e.g. 'rbf') was intended.
pipe_forest = make_pipeline(StandardScaler(),
                            KernelPCA(),
                            RandomForestClassifier(random_state=1))
n_estimators = [100, 500, 800]
max_depth = [10, 100, 150]
n_comps = [6, 7]
# One sub-grid per split criterion; each crosses the component counts with
# every tree count and depth limit.
param_grid_forest = [
    {
        'kernelpca__n_components': n_comps,
        'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__criterion': [criterion],
    }
    for criterion in ('gini', 'entropy')
]

OTTIMIZZAZIONE F1

In [None]:
# 10-fold grid search over the KernelPCA + random-forest pipeline
# (pipe_forest / param_grid_forest), ranked by f1_scorer (F1 with label 0 as
# the positive class). With the default refit, best_estimator_ is retrained
# on all of x_train.
# Note: this rebinds gs_forest_f1_pca / best_gs_forest_f1_pca from the PCA
# section.
gs_forest_f1_pca = GridSearchCV(pipe_forest,
                                param_grid_forest,
                                scoring=f1_scorer,
                                cv=10,
                                n_jobs=-1,
                                verbose=10)
gs_forest_f1_pca.fit(x_train, y_train)
best_gs_forest_f1_pca = gs_forest_f1_pca.best_estimator_
print('Migliore F1_score: ', gs_forest_f1_pca.best_score_)
print('Parametri: ', gs_forest_f1_pca.best_params_)

In [None]:
# Per-fold ROC curves for the F1-tuned KernelPCA + random-forest pipeline,
# followed by its confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_forest_f1_pca.fit(x_train[train], y_train[train])
    probas = best_gs_forest_f1_pca.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()
#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_forest_f1_pca.fit(x_train, y_train)
y_pred = best_gs_forest_f1_pca.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

OTTIMIZZAZIONE PRECISION

In [None]:
# 10-fold grid search over the KernelPCA + random-forest pipeline
# (pipe_forest / param_grid_forest), ranked by precision_scorer (precision
# with label 0 as the positive class). With the default refit,
# best_estimator_ is retrained on all of x_train.
# Note: this rebinds gs_forest_precision_pca / best_gs_forest_precision_pca
# from the PCA section.
gs_forest_precision_pca = GridSearchCV(pipe_forest,
                                       param_grid_forest,
                                       scoring=precision_scorer,
                                       cv=10,
                                       n_jobs=-1,
                                       verbose=10)
gs_forest_precision_pca.fit(x_train, y_train)
best_gs_forest_precision_pca = gs_forest_precision_pca.best_estimator_
print('Migliore precision_score: ', gs_forest_precision_pca.best_score_)
print('Parametri: ', gs_forest_precision_pca.best_params_)

In [None]:
# Per-fold ROC curves for the precision-tuned KernelPCA + random-forest
# pipeline, followed by its confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_forest_precision_pca.fit(x_train[train], y_train[train])
    probas = best_gs_forest_precision_pca.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()
#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_forest_precision_pca.fit(x_train, y_train)
y_pred = best_gs_forest_precision_pca.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

#### SVM

In [None]:
# Standardise -> KernelPCA -> SVC with probability estimates enabled (the
# ROC cells call predict_proba).
# NOTE(review): KernelPCA() is left at its default kernel, which the
# scikit-learn docs give as 'linear' -- confirm a non-linear kernel was not
# intended.
pipe_svm = make_pipeline(StandardScaler(),
                         KernelPCA(),
                         SVC(random_state=1, probability=True))

degree = [4, 5]
c_range = [0.01, 1.0, 10.0, 100.0]
n_comps = [6, 7]
# Two sub-grids: an RBF kernel searched over C and gamma (gamma reuses the
# values in c_range), and a polynomial kernel searched over C and degree.
param_grid_svm = [
    dict(kernelpca__n_components=n_comps, svc__C=c_range, svc__gamma=c_range, svc__kernel=['rbf']),
    dict(kernelpca__n_components=n_comps, svc__C=c_range, svc__degree=degree, svc__kernel=['poly']),
]

OTTIMIZZAZIONE F1

In [None]:
# 10-fold grid search over the KernelPCA + SVM pipeline (pipe_svm /
# param_grid_svm), ranked by f1_scorer (F1 with label 0 as the positive
# class). refit=True retrains the best candidate on all of x_train.
# Note: this rebinds gs_svm_f1 / best_gs_svm_f1 from the earlier sections.
gs_svm_f1 = GridSearchCV(pipe_svm,
                         param_grid_svm,
                         scoring=f1_scorer,
                         refit=True,
                         cv=10,
                         n_jobs=-1,
                         verbose=10)
gs_svm_f1.fit(x_train, y_train)
best_gs_svm_f1 = gs_svm_f1.best_estimator_
print('Migliore F1_score: ', gs_svm_f1.best_score_)
print('Parametri: ', gs_svm_f1.best_params_)

In [None]:
# Per-fold ROC curves for the F1-tuned KernelPCA + SVM pipeline, followed by
# its confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_svm_f1.fit(x_train[train], y_train[train])
    probas = best_gs_svm_f1.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_svm_f1.fit(x_train, y_train)
y_pred = best_gs_svm_f1.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

OTTIMIZZAZIONE PRECISION

In [None]:
# 10-fold grid search over the KernelPCA + SVM pipeline (pipe_svm /
# param_grid_svm), ranked by precision_scorer (precision with label 0 as the
# positive class). refit=True retrains the best candidate on all of x_train.
# Fixes: the search was stored in gs_svm_f1, clobbering the F1 search result,
# and its score was printed under the "F1_score" label. It now uses
# gs_svm_precision, matching the other precision searches, and the right label.
gs_svm_precision = GridSearchCV(estimator=pipe_svm,
                        param_grid=param_grid_svm,
                        scoring=precision_scorer,
                        n_jobs=-1,
                        refit=True,
                        cv=10,
                        verbose=10)
gs_svm_precision=gs_svm_precision.fit(x_train, y_train)
print('Migliore precision_score: ', gs_svm_precision.best_score_)
print('Parametri: ', gs_svm_precision.best_params_)
best_gs_svm_precision = gs_svm_precision.best_estimator_

In [None]:
# Per-fold ROC curves for the precision-tuned KernelPCA + SVM pipeline,
# followed by its confusion matrix on the hold-out test set.
# Fix: the original loop ignored the fold indices (it refitted on all of
# x_train and scored x_test on every pass), so the ten "fold" curves and their
# mean were identical. Each fold is now fitted and scored on its own split.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    best_gs_svm_precision.fit(x_train[train], y_train[train])
    probas = best_gs_svm_precision.predict_proba(x_train[test])
    # Column 1 holds the predicted probability of the class encoded as 1.
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the shared FPR grid so the fold curves can be averaged.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()
#--------------------------------------------------
# Confusion matrix on the hold-out test set. Refit on the whole training set
# first: the fold loop leaves the estimator fitted on the last fold only.
best_gs_svm_precision.fit(x_train, y_train)
y_pred = best_gs_svm_precision.predict(x_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell (rows = true class, columns = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

## Tecniche Ensemble

### AdaBoost

Classificatore debole (poco performante) scelto come stimatore base per AdaBoost

In [None]:
# Small, shallow random forest used as the weak learner for AdaBoost.
frst = RandomForestClassifier(n_estimators=15, max_depth=5, random_state=1)
frst.fit(x_train, y_train)
y_pred = frst.predict(x_train)
# Training-set F1 with class 0 as the positive class, matching the f1_scorer
# (pos_label=0) used for model selection in the grid searches.
f1_score(y_train, y_pred, pos_label=0)

In [None]:
# Boost the weak forest. The base learner is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# (and the old name removed in 1.4); the first positional slot is the same
# in both, so this runs on old and new releases alike.
ada = AdaBoostClassifier(frst,
                         n_estimators=100,
                         random_state=1)
ada = ada.fit(x_train, y_train)
y_train_pred = ada.predict(x_train)

### Bagging

Poiché questo passaggio è stato eseguito dopo tutti gli altri, non vengono utilizzati tutti i modelli, ma solo i migliori selezionati dalle GridSearch.

#### Forest

In [None]:
# Random forest that is bagged below.
forest = RandomForestClassifier(n_estimators=800,
                                max_depth=100,
                                random_state=1,
                                criterion='gini')

# Bag 25 copies of the forest, each on a bootstrap sample of the rows with all
# features. The base learner is passed positionally because the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (old name
# removed in 1.4); the positional form works on both.
bag_forest = BaggingClassifier(forest,
                               n_estimators=25,
                               max_features=1.0,
                               max_samples=1.0,
                               bootstrap=True,
                               bootstrap_features=False,
                               n_jobs=1,
                               random_state=1)

bag_forest = bag_forest.fit(x_train, y_train)

In [None]:
# Local import: clone() gives every fold a fresh, unfitted copy of the bagged
# forest, so the instance already fitted on the full training set is kept
# as-is for the confusion matrix and for later cells.
from sklearn.base import clone

# Per-fold ROC curves of the bagged forest, estimated with a 10-fold
# stratified cross-validation on the training set.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    # Fit on this fold's training indices and score its held-out indices;
    # fitting on all of x_train and scoring x_test would draw the same curve
    # ten times.
    fold_model = clone(bag_forest).fit(x_train[train], y_train[train])
    probas = fold_model.predict_proba(x_train[test])
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the common FPR grid so the folds can be averaged
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------

# Confusion matrix on the held-out test set, using the bagged forest fitted
# on the full training set.
y_pred = bag_forest.predict(x_test)
confmat = confusion_matrix(y_true=y_test,
                           y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat,
           cmap=plt.cm.Blues,
           alpha=0.3)
# Write each count in the centre of its cell (rows = true, columns = predicted)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j,
                y=i,
                s=confmat[i, j],
                va='center',
                ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

#### SVM

In [None]:
# RBF-kernel SVM that is bagged below.
SVM = SVC(kernel='rbf', C=10.0, gamma=10.0)

# Bag 25 copies of the SVM, each on a bootstrap sample of the rows with all
# features. The base learner is passed positionally because the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (old name
# removed in 1.4); the positional form works on both.
bag_svm = BaggingClassifier(SVM,
                            n_estimators=25,
                            max_features=1.0,
                            max_samples=1.0,
                            bootstrap=True,
                            bootstrap_features=False,
                            n_jobs=1,
                            random_state=1)

bag_svm = bag_svm.fit(x_train, y_train)

Il blocco seguente è risultato praticamente impossibile da eseguire: dopo oltre 120 minuti era ancora in esecuzione senza produrre alcun risultato. La ROC per il bagging con SVM, inserita nella presentazione, è stata quindi calcolata su un sotto-dataset di sole 2000 istanze.

In [None]:
# Local import: clone() gives every fold a fresh, unfitted copy of the bagged
# SVM, so the instance already fitted on the full training set is kept as-is
# for the confusion matrix and for later cells.
from sklearn.base import clone

# Per-fold ROC curves of the bagged SVM, estimated with a 10-fold stratified
# cross-validation on the training set.
# NOTE(review): if the bagged SVC is built without probability=True,
# BaggingClassifier.predict_proba falls back to vote fractions, which gives a
# coarse ROC - confirm this is acceptable.
cv1 = list(StratifiedKFold(n_splits=10, random_state=1, shuffle=True).split(x_train, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for i, (train, test) in enumerate(cv1):
    # Fit on this fold's training indices and score its held-out indices;
    # fitting on all of x_train and scoring x_test would draw the same curve
    # ten times.
    fold_model = clone(bag_svm).fit(x_train[train], y_train[train])
    probas = fold_model.predict_proba(x_train[test])
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Resample onto the common FPR grid so the folds can be averaged
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="ROC fold %d (area=%0.2f)" % (i + 1, roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color=(0.6, 0.6, 0.6), label="Random guessing")
mean_tpr /= len(cv1)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=":", color="black", label="Perfect performance")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()

#--------------------------------------------------

# Confusion matrix on the held-out test set, using the bagged SVM fitted on
# the full training set.
y_pred = bag_svm.predict(x_test)
confmat = confusion_matrix(y_true=y_test,
                           y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat,
           cmap=plt.cm.Blues,
           alpha=0.3)
# Write each count in the centre of its cell (rows = true, columns = predicted)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j,
                y=i,
                s=confmat[i, j],
                va='center',
                ha='center')
plt.xlabel('Classe Predetta')
plt.ylabel('Classe Vera')
plt.show()

## Testing Finali dei migliori modelli

### Modelli

In [None]:
# Final candidate models, each built with fixed hyper-parameters and fitted
# on the full training split. fit() returns the estimator itself, so
# construction and training are chained.

# --- No dimensionality reduction ---
forest_1 = RandomForestClassifier(
    n_estimators=800, max_depth=100, criterion='gini', random_state=1
).fit(x_train, y_train)

svm_1 = SVC(kernel='rbf', C=10.0, gamma=10.0).fit(x_train, y_train)

lr_1 = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=10, max_iter=400),
).fit(x_train, y_train)

# --- Standardisation + PCA ---
forest_pca = make_pipeline(
    StandardScaler(),
    PCA(n_components=7),
    RandomForestClassifier(n_estimators=100, max_depth=100,
                           criterion='entropy', random_state=1),
).fit(x_train, y_train)

svm1_pca = make_pipeline(
    StandardScaler(),
    PCA(n_components=8),
    SVC(kernel='rbf', C=100.0, gamma=0.01),
).fit(x_train, y_train)

svm2_pca = make_pipeline(
    StandardScaler(),
    PCA(n_components=8),
    SVC(kernel='rbf', C=100.0, gamma=1.0),
).fit(x_train, y_train)

# --- Standardisation + KernelPCA (library-default kernel) ---
forest1_kpca = make_pipeline(
    StandardScaler(),
    KernelPCA(n_components=7),
    RandomForestClassifier(n_estimators=100, max_depth=100,
                           criterion='entropy', random_state=1),
).fit(x_train, y_train)

forest2_kpca = make_pipeline(
    StandardScaler(),
    KernelPCA(n_components=7),
    RandomForestClassifier(n_estimators=100, max_depth=100,
                           criterion='gini', random_state=1),
).fit(x_train, y_train)

svm_kpca = make_pipeline(
    StandardScaler(),
    KernelPCA(n_components=7),
    SVC(kernel='rbf', C=100.0, gamma=0.01),
).fit(x_train, y_train)

# Keep the fitted pipeline as the cell's displayed value
svm_kpca



In [None]:
# Final train/test comparison of every selected model.
#
# Class 0 is scored as the positive class, matching the f1_scorer and
# precision_scorer (both pos_label=0) used for model selection; the sklearn
# default (pos_label=1) would report the metrics for the other class.
POS_LABEL = 0

# (printed name, fitted estimator), in the order the results are reported
final_models = [
    ('forest_1', forest_1),
    ('svm_1', svm_1),
    ('lr_1', lr_1),
    ('forest_pca', forest_pca),
    ('svm1_pca', svm1_pca),
    ('svm2_pca', svm2_pca),
    ('forest1_kpca', forest1_kpca),
    ('forest2_kpca', forest2_kpca),
    ('svm_kpca', svm_kpca),
    ('ada', ada),
    ('bag_forest', bag_forest),
    ('bag_svm', bag_svm),
]

for model_name, model in final_models:
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    train_f1 = f1_score(y_train, y_train_pred, pos_label=POS_LABEL)
    test_f1 = f1_score(y_test, y_test_pred, pos_label=POS_LABEL)
    train_precision = precision_score(y_train, y_train_pred, pos_label=POS_LABEL)
    test_precision = precision_score(y_test, y_test_pred, pos_label=POS_LABEL)
    print('%s train/test f1_scores: %.3f/%.3f' % (model_name, train_f1, test_f1))
    print('%s train/test precisions: %.3f/%.3f' % (model_name, train_precision, test_precision))
