In [593]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

In [550]:
df = pd.read_csv("../Data/Modeling_dataset/credit_PRO.csv")

# Czyszczenie danych
Zajmujemy sie anomaliami i bledami wykrytymi podczas EDA 

usuwamy odstajace obserwacje z kolumny AMT_INCOME_TOTAL

In [551]:
data = df['AMT_INCOME_TOTAL']
print('Wejściowy rozmiar: ', data.shape[0])
upper_lim = data.quantile(.99)
df = df[( df.AMT_INCOME_TOTAL< upper_lim)]
print('Wyjściowy rozmiar: ', df.shape[0])

Wejściowy rozmiar:  14000
Wyjściowy rozmiar:  13860


-Błędne wartosci w kolumnie DAYS_EMPLOYED mozemy uzupełnić średnią
czy to dobre rozwiazanie? osoby z błędną dana maja problem ze splata w 13% czyli mniej niz w calej populacji
mozemy uzupełnić średnią i dodać dodatkową kolumnę oznaczającą błąd w danych

In [552]:
X = df.drop(["TARGET"],axis = 1)
y = df["TARGET"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature engineering

- poprawienie błędnych i brakujacych wartosci
- transformacje
- skalowanie zmiennych
- wybieranie najbardziej wartosciowych kolumn
- tworzenie nowych zmiennych


In [553]:
goods_price_impute = X_train['AMT_GOODS_PRICE'].dropna().median()
days_employed_impute = df.loc[df["DAYS_EMPLOYED"]<0].DAYS_EMPLOYED.mean()
days_employed_impute

-2350.6807458803123

Tworzymy transformer, aby zautomatyzować preprocessing danych

In [560]:
class Transformer(BaseEstimator, TransformerMixin):
    def fit(self, Xc, y=None):
        return self
    
    def transform(self, X):
        Xc = X.copy()
        # modify cnt children
        Xc.loc[X['CNT_CHILDREN'] > 4, 'CNT_CHILDREN'] = 4
        
        # impute missing values
        Xc['AMT_GOODS_PRICE'] = Xc['AMT_GOODS_PRICE'].fillna(goods_price_impute)

        # impute bad values in DAYS_EMPLOYED and add column indicating it
        Xc['BAD_DAYS_EMPLOYED'] = (Xc['DAYS_EMPLOYED'] > 0).astype(int)
        Xc.loc[Xc["DAYS_EMPLOYED"]>0,'DAYS_EMPLOYED'] = days_employed_impute
        
        # log transforms
        Xc["AMT_CREDIT"] = np.log1p(Xc["AMT_CREDIT"])
        Xc["AMT_INCOME_TOTAL"] = np.log1p(Xc["AMT_INCOME_TOTAL"])
        Xc["AMT_ANNUITY"] = np.log1p(Xc["AMT_ANNUITY"])
        Xc["AMT_GOODS_PRICE"] = np.log1p(Xc["AMT_GOODS_PRICE"])
        
        # skalowanie zmiennych
        s = StandardScaler()
        Xc["AMT_CREDIT"] = s.fit_transform(Xc[["AMT_CREDIT"]])
        Xc["AMT_INCOME_TOTAL"] = s.fit_transform(Xc[["AMT_INCOME_TOTAL"]])
        Xc["AMT_ANNUITY"] = s.fit_transform(Xc[["AMT_ANNUITY"]])
        Xc["AMT_GOODS_PRICE"] = s.fit_transform(Xc[["AMT_GOODS_PRICE"]])

        # wybranie najlepszych kolumn todo
        Xc = Xc.drop(["id"],axis=1)
        
        # stworzenie nowych zmiennych todo
        
        return Xc

# Pierwszy model i baseline

In [561]:
pipeline = Pipeline([
    ('transformer', Transformer()),
    ("estimator", LogisticRegression())
])

In [572]:
pipeline.fit(X_train,y_train)

  Xc.loc[Xc["DAYS_EMPLOYED"]>0,'DAYS_EMPLOYED'] = days_employed_impute
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [564]:
y_pred = pipeline.predict(X_test)

  Xc.loc[Xc["DAYS_EMPLOYED"]>0,'DAYS_EMPLOYED'] = days_employed_impute


In [582]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy:", balanced_acc)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Confusion Matrix:
[[2276    0]
 [ 496    0]]
Balanced Accuracy: 0.5
Accuracy: 0.8210678210678211


- baseline_acc = 0.82
- baseline_balance_acc = 0.5

# Oversampling

In [612]:
pipeline = ImbPipeline([
    
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('transformer', Transformer(),X_train.columns),
        ],
        remainder='passthrough'
    )),
    ('oversampler', SMOTE()),
    ("estimator", LogisticRegression())
])

In [613]:
pipeline.fit(X_train,y_train)

  Xc.loc[Xc["DAYS_EMPLOYED"]>0,'DAYS_EMPLOYED'] = days_employed_impute
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [614]:
y_pred = pipeline.predict(X_test)

  Xc.loc[Xc["DAYS_EMPLOYED"]>0,'DAYS_EMPLOYED'] = days_employed_impute


In [615]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy:", balanced_acc)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Confusion Matrix:
[[1290  986]
 [ 195  301]]
Balanced Accuracy: 0.5868193349963149
Accuracy: 0.573953823953824
