Kod bezproblemowo uruchamia się na komputerze innym niż komputer grupy modelującej. Proces EDA został przeprowadzony bez zastrzeżeń. Podczas preprocessingu zastosowano sensowne transformacje danych i sprawdzono ich skuteczność. Cały proces został zautomatyzowany za pomocą pipeline'ów, co znacząco zwiększa przejrzystość kodu. Podczas oceny jakości modelu użyto wielu metryk, które mają uzasadnienie i interpretację biznesową przy zastosowaniu modelu.

Uwagi:
- w pliku model_and_hyperparameters, w komórce pod napisem „Ostateczne Wykresy", kilka razy przypisujecie różne modele do zmiennych xgb1 i xgb2 (ostatecznie do ewaluacji trafiają xgb1 i xgb2 bez pipeline'u)
- brak wniosku, który model (lub które modele) jest według was najlepszy
- wytrenowane, najlepsze według was modele można zapisać do pliku pkl, co ułatwi korzystanie z kodu innym użytkownikom



Sprawdzamy performance modeli na danych walidacyjnych

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
#from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

In [21]:
seed = 17
bank_data = pd.read_csv("./Data/bank_train_data.csv", index_col=0)

## Non-binary categorical features
# Education is encoded ordinally ('unknown' maps to 0)
education_codes = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
bank_data['education'] = bank_data['education'].map(education_codes)

# The remaining nominal features are one-hot encoded
nominal_columns = ['job', 'contact', 'marital', 'poutcome']
bank_data = pd.get_dummies(bank_data, columns=nominal_columns)

## Binary categorical features (including the target "y"): yes/no -> 1/0
binary = ["default", "housing", "loan", "y"]
yes_no_codes = {"yes": 1, "no": 0}
for col in binary:
    bank_data[col] = bank_data[col].map(yes_no_codes)

# Month: name -> position 0..11, then the sine of its angle on a 12-step circle
month_names = ["jan", "feb", "mar", "apr", "may", "jun",
               "jul", "aug", "sep", "oct", "nov", "dec"]
month_codes = {name: position for position, name in enumerate(month_names)}
month_step = 2 * np.pi / 12
bank_data['month'] = (
    bank_data["month"]
    .map(month_codes)
    .apply(lambda m: np.sin(m * month_step))
)

# Day of month: sine of its angle on a 31-step circle
day_step = 2 * np.pi / 31
bank_data['day'] = bank_data["day"].apply(lambda d: np.sin(d * day_step))

# Split into the feature matrix and the target vector
X = bank_data.drop(columns="y")
y = bank_data["y"]

In [22]:
# Ramka danych i preproccesing wstępny
from sklearn.model_selection import train_test_split

seed = 17
bank_data = pd.read_csv("./Dont use this/bank_val_data.csv", index_col=0)

## Non-binary categorical features
# Ordinal encoding of education
bank_data['education'] = bank_data['education'].map({'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3})

# One-hot encoding of the remaining nominal features
bank_data = pd.get_dummies(bank_data, columns=['job','contact','marital','poutcome'])

## Binary categorical features (yes/no -> 1/0)
binary = ["default", "housing", "loan", "y"]
for col in binary:
    bank_data[col] = bank_data[col].map({"yes": 1, "no": 0})

# Month (projection onto a circle: position 0..11 -> sine of its angle)
bank_data['month'] = bank_data["month"].map({"jan": 0, "feb": 1, "mar": 2, "apr": 3, "may": 4, "jun": 5, "jul": 6, "aug": 7, "sep": 8, "oct": 9, "nov": 10, "dec": 11})
bank_data['month'] = bank_data["month"].apply(lambda x: np.sin(x * (2 * np.pi / 12)))

# Day (projection onto a circle with 31 steps)
bank_data['day'] = bank_data["day"].apply(lambda x: np.sin(x * (2 * np.pi / 31)))
X_val = bank_data.drop("y", axis=1)
y_val = bank_data["y"]

# get_dummies only creates columns for categories that occur in this file, so
# the validation frame can miss (or gain) one-hot columns relative to the
# training frame X built from bank_train_data.csv. Align to the training
# layout: absent dummies become all-zero columns, and categories unseen during
# training are dropped. When both frames already match this is a no-op.
X_val = X_val.reindex(columns=X.columns, fill_value=0)

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin

# Transformer that removes columns inside a pipeline
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop a fixed list of columns from a DataFrame."""

    def __init__(self, columns):
        # Column labels to remove in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the configured columns."""
        return X.drop(columns=self.columns)
    
# Transformer that applies log1p to columns and then standardises them
class LogStdTransformer(BaseEstimator, TransformerMixin):
    """Apply ``np.log1p`` followed by a ``StandardScaler`` to each listed column."""

    def __init__(self, columns):
        self.columns = columns
        # One independent scaler per column, in the same order as `columns`.
        self.std_scalers = [StandardScaler() for _ in columns]

    def fit(self, X, y=None):
        """Fit each column's scaler on the log1p-transformed values of ``X``."""
        for scaler, col in zip(self.std_scalers, self.columns):
            scaler.fit(np.log1p(X[[col]]))
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the listed columns log-scaled and standardised."""
        result = X.copy()
        for scaler, col in zip(self.std_scalers, self.columns):
            result[col] = scaler.transform(np.log1p(X[[col]]))
        return result

# Transformer that bins the "balance" feature
class BalanceBinner(BaseEstimator, TransformerMixin):
    """Replace ``balance`` with a three-level code (0, 1 or 2)."""

    def fit(self, X, y=None):
        """Learn nothing; the bin edges are fixed."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with ``balance`` cut at 0 and 1500 and stored as bin codes.

        The original column is dropped and the coded one is appended, so
        ``balance`` ends up as the last column of the result.
        """
        edges = [-np.inf, 0, 1500, np.inf]
        binned = pd.cut(X["balance"], bins=edges, labels=[0, 1, 2]).cat.codes
        return X.drop(columns="balance").assign(balance=binned)
    
# Transformer that rescales features with a per-column MinMaxScaler
class MinMaxTransformer(BaseEstimator, TransformerMixin):
    """Min-max scale each listed column independently."""

    def __init__(self, columns):
        self.columns = columns
        # One independent scaler per column, in the same order as `columns`.
        self.scalers = [MinMaxScaler() for _ in columns]

    def fit(self, X, y=None):
        """Fit each column's scaler on the corresponding column of ``X``."""
        for scaler, col in zip(self.scalers, self.columns):
            scaler.fit(X[[col]])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the listed columns rescaled."""
        result = X.copy()
        for scaler, col in zip(self.scalers, self.columns):
            result[col] = scaler.transform(X[[col]])
        return result

# Transformer that binarises the "previous" feature
class PreviousBinner(BaseEstimator, TransformerMixin):
    """Turn ``previous`` into a 0/1 flag: 0 where it equals 0, otherwise 1."""

    def fit(self, X, y=None):
        """Learn nothing; the rule is fixed."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with ``previous`` replaced by the 0/1 flag."""
        is_zero = X["previous"] == 0
        flag = np.where(is_zero, 0, 1)
        return X.assign(previous=flag)

In [29]:
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix, fbeta_score, roc_curve, auc, recall_score, f1_score,balanced_accuracy_score
from imblearn.metrics import geometric_mean_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold

# Plots the ROC curve
def plot_roc_curve(model, title_model, X_test, y_test):
    """Draw the ROC curve of ``model`` on ``(X_test, y_test)`` and show it.

    The scores are the second column of ``model.predict_proba``. The plot title
    uses the text of ``str(title_model)`` up to the first ``(``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)
    model_label = str(title_model).split('(')[0]

    plt.plot(fpr, tpr, 'b', label="AUC = %0.2f" % roc_auc)
    plt.title("ROC for: " + model_label)
    plt.legend(loc="lower right")
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel("TPR")
    plt.xlabel("FPR")
    plt.show()
    
# Computes the basic scores used for the baseline
def base_test_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report scores on the test split.

    Prints precision, G-mean, accuracy, F-beta (beta=0.5), recall and the
    confusion matrix, then returns the five scores as the tuple
    ``(precision, g_mean, accuracy, fbeta, recall)``.
    """
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # Keys are the exact labels printed in front of each value.
    scores = {
        "Precision score:": precision_score(y_test, y_hat),
        "G-mean score:   ": geometric_mean_score(y_test, y_hat),
        "Accuracy score: ": accuracy_score(y_test, y_hat),
        "Fbeta_score:    ": fbeta_score(y_test, y_hat, beta=0.5),
        "Recall score:": recall_score(y_test, y_hat),
    }
    for label, value in scores.items():
        print(label, value)
    print("Confusion matrix: \n", confusion_matrix(y_test, y_hat))

    return tuple(scores.values())

def custom_fbeta_scorer(estimator, X, y):
    """Scorer callable: F-beta (beta=0.5) of ``estimator``'s predictions on ``X``."""
    return fbeta_score(y, estimator.predict(X), beta=0.5)

def get_auc(model, X_test, y_test):
    """Return the area under the ROC curve of ``model`` on ``(X_test, y_test)``.

    The scores are the second column of ``model.predict_proba``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    return auc(fpr, tpr)

def custom_auc_scorer(estimator, X, y):
    """Scorer callable: ROC AUC from the second ``predict_proba`` column."""
    fpr, tpr, _ = roc_curve(y, estimator.predict_proba(X)[:, 1])
    return auc(fpr, tpr)

def custom_gmean_scorer(estimator, X, y):
    """Scorer callable: geometric mean score of ``estimator``'s predictions on ``X``."""
    y_hat = estimator.predict(X)
    return geometric_mean_score(y, y_hat)

def get_my_pipe_with_model(model):
    """Wrap ``model`` in the shared preprocessing pipeline.

    Steps, in order: drop unused columns, bin ``balance``, binarise
    ``previous``, min-max scale ``duration``/``age``/``campaign``, then the
    given ``model`` as the final step.
    """
    dropped_columns = ["pdays", "default", 'poutcome_unknown', 'poutcome_other', 'contact_unknown', 'job_unknown']
    scaled_columns = ["duration", "age", "campaign"]
    steps = [
        ("dropper", ColumnDropper(dropped_columns)),
        ("balance_binner", BalanceBinner()),
        ("bin_previous", PreviousBinner()),
        ("transformer _ minmax", MinMaxTransformer(scaled_columns)),
        ("model", model),
    ]
    return Pipeline(steps)

def crosswalidate_model_plot(model_cal, param_dict, X, y, n_repeats):
    """Cross-validate a model inside the shared pipeline and box-plot its scores.

    Parameters
    ----------
    model_cal : callable
        Model factory (e.g. an estimator class); called as
        ``model_cal(**param_dict)`` once per fold.
    param_dict : dict
        Keyword arguments passed to ``model_cal``.
    X, y : pandas DataFrame / Series
        Features and target. Rows are selected by position, so any index works.
    n_repeats : int
        Number of repetitions of the 5-fold stratified split.

    Uses the module-level ``seed`` as the splitter's ``random_state``. Shows a
    box plot of precision, G-mean, accuracy, F-beta (beta=0.5), recall and ROC
    AUC over all folds; returns nothing.
    """
    skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=n_repeats, random_state=seed)
    metric_names = ["precision", "g-mean", "accuracy", "fbeta", "recall", "auc_score"]
    # Plain lists: appending is O(1), unlike np.append which copies every time.
    stats = {name: [] for name in metric_names}

    for train_idx, test_idx in skf.split(X, y):
        # split() yields positional indices, so select rows with .iloc; .loc
        # would look them up as index labels and fail (or pick the wrong rows)
        # whenever the index is not exactly 0..n-1.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        model = model_cal(**param_dict)
        pipe = get_my_pipe_with_model(model)
        pipe.fit(X_train, y_train)
        y_hat = pipe.predict(X_test)

        stats["precision"].append(precision_score(y_test, y_hat))
        stats["g-mean"].append(geometric_mean_score(y_test, y_hat))
        stats["accuracy"].append(accuracy_score(y_test, y_hat))
        stats["fbeta"].append(fbeta_score(y_test, y_hat, beta=0.5))
        stats["recall"].append(recall_score(y_test, y_hat))
        stats["auc_score"].append(get_auc(pipe, X_test, y_test))

    df = pd.DataFrame(data=stats, columns=metric_names)
    ax = sns.boxplot(x="variable", y="value", data=pd.melt(df))
    ax.set_ylim([0, 1])
    # The title shows the model instance built for the last fold.
    ax.set_title("Crosswalidation results for: " + str(model))
    plt.show()
        
    

In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
def evaluate_model(model, X, y, scoring):
    """Return the cross-validation scores of ``model`` for the given ``scoring``.

    Uses 5-fold stratified splitting repeated 10 times (random_state=17), runs
    with ``n_jobs=-1`` and raises on any fitting/scoring error.
    """
    repeated_folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=17)
    return cross_val_score(
        model,
        X,
        y,
        scoring=scoring,
        cv=repeated_folds,
        n_jobs=-1,
        error_score='raise',
    )

In [6]:
# RandomForest parameters
dict_forest_fbeta = {
    "criterion": "gini",
    "max_depth": 15,
    "max_features": 7,
    "min_samples_split": 8,
    "n_estimators": 149,
    "random_state": 17,
}
dict_forest_gmean = {
    "n_jobs": -1,
    "n_estimators": 145,
    "criterion": "gini",
    "max_depth": 21,
    "min_samples_split": 2,
    "random_state": 17,
}

# GradientBooster parameters
dict_gradient_fbeta = {
    "l2_regularization": 10,
    "learning_rate": 0.05,
    "max_depth": 2,
    "max_iter": 1000,
    "max_leaf_nodes": 10,
    "random_state": 17,
}
dict_gradient_gmean = {
    "l2_regularization": 0.5,
    "learning_rate": 0.2,
    "max_depth": 3,
    "max_leaf_nodes": 10,
    "max_iter": 1000,
    "random_state": 17,
}

# XGBoost parameters
dict_xgboost_fbeta = {
    "alpha": 1,
    "eta": 0.2,
    "gamma": 0,
    "lambda": 5,
    "max_depth": 12,
    "min_child_weight": 1,
    "random_state": 17,
    "scale_pos_weight": 7.67,
}
dict_xgboost_gmean = {
    "eta": 0.1,
    "max_depth": 3,
    "gamma": 5,
    "min_child_weight": 20,
    "lambda": 0.5,
    "alpha": 5,
    "scale_pos_weight": 7.67,
    "random_state": 17,
}

# SVM parameters
dict_svm_fbeta = {
    "kernel": "poly",
    "degree": 1,
    "max_iter": 150000,
    "random_state": 17,
    "probability": True,
}
dict_svm_gmean = {
    "degree": 7,
    "kernel": "poly",
    "max_iter": 150000,
    "random_state": 17,
    "probability": True,
}

Trenowanie modeli na danych grupy modelującej

In [16]:
# One pipeline per estimator family and parameter set:
# suffix 1 is built from the *_fbeta dictionary, suffix 2 from the *_gmean one.
rf1, rf2 = (
    get_my_pipe_with_model(RandomForestClassifier(**params))
    for params in (dict_forest_fbeta, dict_forest_gmean)
)
gra1, gra2 = (
    get_my_pipe_with_model(HistGradientBoostingClassifier(**params))
    for params in (dict_gradient_fbeta, dict_gradient_gmean)
)
xgb1, xgb2 = (
    get_my_pipe_with_model(XGBClassifier(**params))
    for params in (dict_xgboost_fbeta, dict_xgboost_gmean)
)
svc1, svc2 = (
    get_my_pipe_with_model(SVC(**params))
    for params in (dict_svm_fbeta, dict_svm_gmean)
)

# Stacked model: both XGBoost variants feeding a logistic regression
xgb3, xgb4 = XGBClassifier(**dict_xgboost_fbeta), XGBClassifier(**dict_xgboost_gmean)
level0 = list(zip(("beta", "gmean"), (xgb3, xgb4)))
level1 = LogisticRegression()
stacking = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
stacked = get_my_pipe_with_model(stacking)

# Soft-voting model over fresh instances of the same two XGBoost variants
xgb5, xgb6 = XGBClassifier(**dict_xgboost_fbeta), XGBClassifier(**dict_xgboost_gmean)
level0 = list(zip(("beta", "gmean"), (xgb5, xgb6)))
voting = VotingClassifier(estimators=level0, voting="soft")
soft = get_my_pipe_with_model(voting)

models = [rf1, rf2, gra1, gra2, xgb1, xgb2, svc1, svc2, stacked, soft]

# Fit every pipeline on the full training set
for model in models:
    model.fit(X, y)

    

# Sprawdzenie wyników modeli

In [30]:
model_names = ["RF-beta", "RF-gmean", "GRA-beta", "GRA-gmean", "XGB-beta", "XGB-gmean", "SVM-beta", "SVM-gmean", "Stacked", "Soft"]

# (printed label, metric) pairs, in the order they are reported for each model
report_metrics = (
    ("precision: ", precision_score),
    ("gmean: ", geometric_mean_score),
    ("fbeta: ", lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=0.5)),
    ("balanced accuracy: ", balanced_accuracy_score),
    ("recall: ", recall_score),
)

# model_names is listed in the same order as models, so pair them up directly
for name, model in zip(model_names, models):
    y_hat = model.predict(X_val)
    print(name)
    for label, metric in report_metrics:
        print(label + str(metric(y_val, y_hat)))
    print()

RF-beta
precision: 0.5853658536585366
gmean: 0.546792115783077
fbeta: 0.49586776859504134
balanced accuracy: 0.6396912829052712
recall: 0.3076923076923077

RF-gmean
precision: 0.5970149253731343
gmean: 0.5006454127726119
fbeta: 0.4716981132075472
balanced accuracy: 0.6169644953991331
recall: 0.2564102564102564

GRA-beta
precision: 0.5760869565217391
gmean: 0.5733333382983868
fbeta: 0.5057251908396947
balanced accuracy: 0.6536353252631355
recall: 0.33974358974358976

GRA-gmean
precision: 0.5405405405405406
gmean: 0.6068631157205541
fbeta: 0.5
balanced accuracy: 0.6710753858963684
recall: 0.38461538461538464

XGB-beta
precision: 0.5194805194805194
gmean: 0.693702322707771
fbeta: 0.5181347150259067
balanced accuracy: 0.725602596127159
recall: 0.5128205128205128

XGB-gmean
precision: 0.39197530864197533
gmean: 0.8249638421460896
fbeta: 0.43732782369146006
balanced accuracy: 0.825036294540874
recall: 0.8141025641025641

SVM-beta
precision: 0.5555555555555556
gmean: 0.3556643780462908
fbeta:

In [1]:
import os
# Export this notebook to HTML via nbconvert; as the last expression of the
# cell, the value returned by os.system is what the notebook displays.
nbconvert_command = 'jupyter nbconvert --to html validation.ipynb'
os.system(nbconvert_command)

0