In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df['credit_line_utilization'] = df['credit_line_utilization'].str.replace(',', '.').astype(float)

In [4]:
columns = df.iloc[:, 1:-1].columns
columns

Index(['age', 'number_dependent_family_members', 'monthly_income',
       'number_of_credit_lines', 'real_estate_loans',
       'ratio_debt_payment_to_income', 'credit_line_utilization',
       'number_of_previous_late_payments_up_to_59_days',
       'number_of_previous_late_payments_up_to_89_days',
       'number_of_previous_late_payments_90_days_or_more'],
      dtype='object')

In [5]:
X, y = df[columns], df.iloc[:, -1]

### Missing value imputation

In [6]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp_mean = IterativeImputer(random_state=0)

In [7]:
imp_mean = IterativeImputer(random_state=0)
X_imputed = imp_mean.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed)
X_imputed.columns = columns
X_imputed.head()

Unnamed: 0,age,number_dependent_family_members,monthly_income,number_of_credit_lines,real_estate_loans,ratio_debt_payment_to_income,credit_line_utilization,number_of_previous_late_payments_up_to_59_days,number_of_previous_late_payments_up_to_89_days,number_of_previous_late_payments_90_days_or_more
0,66.0,0.531495,4000.0,9.435243,1.0,0.569108,0.054888,0.0,0.0,0.0
1,61.0,2.0,4000.0,6.0,1.0,0.297176,0.10195,0.0,0.0,0.0
2,31.0,2.0,3040.0,8.0,0.0,0.160145,1.227135,4.0,0.0,0.0
3,54.0,4.0,10218.0,5.0,0.0,0.067913,0.083278,0.0,0.0,0.0
4,29.0,0.0,4468.0,6.0,0.0,0.328261,0.317446,0.0,0.0,0.0


### Outlier removal

In [8]:
from sklearn.svm import OneClassSVM
from icecream import ic

In [9]:
ee = OneClassSVM(nu=.01)
yhat = ee.fit_predict(X_imputed)
mask = yhat != -1
X_out, y_out = X_imputed[mask], y[mask]

#### X_new and y_new are imputed and outlier free

In [10]:
X_new, y_new = X_out, y_out

Balance the dataset by undersampling

In [11]:
# Concatinate imputed and 
data_new = pd.concat([X_new, y_new], axis=1)

In [12]:
number_of_1s = len(data_new[data_new['defaulted_on_loan'] == 1])
ic(number_of_1s)
ones = data_new[data_new['defaulted_on_loan'] == 1]
ones

ic| number_of_1s: 4974


Unnamed: 0,age,number_dependent_family_members,monthly_income,number_of_credit_lines,real_estate_loans,ratio_debt_payment_to_income,credit_line_utilization,number_of_previous_late_payments_up_to_59_days,number_of_previous_late_payments_up_to_89_days,number_of_previous_late_payments_90_days_or_more,defaulted_on_loan
2,31.000000,2.000000,3040.000000,8.000000,0.0,0.160145,1.227135,4.0,0.0,0.0,1
20,48.000000,1.000000,10800.000000,11.000000,3.0,0.590316,0.975767,3.0,0.0,0.0,1
30,42.168606,4.000000,6448.000000,11.000000,0.0,0.190107,0.865913,3.0,1.0,0.0,1
44,45.000000,0.000000,4938.000000,1.000000,0.0,0.068840,1.000000,0.0,1.0,2.0,1
65,44.000000,3.000000,3772.000000,8.000000,1.0,0.537503,0.207312,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
72070,24.000000,1.246024,3418.846084,0.000000,0.0,0.013540,1.000000,1.0,0.0,3.0,1
72086,45.000000,1.000000,10614.829761,8.000000,4.0,0.485349,0.073525,0.0,0.0,0.0,1
72120,51.877978,0.000000,2500.000000,4.000000,0.0,0.552579,0.465445,0.0,1.0,1.0,1
72129,62.000000,0.000000,945.000000,7.814023,0.0,0.707188,3.990244,1.0,0.0,0.0,1


In [13]:
zeros = data_new[data_new['defaulted_on_loan'] == 0]
zeros = zeros.sample(n=4974, replace=False)

# data undersampled
df_und = ones.append(zeros)
df_und.reset_index(drop=True)
df_und

Unnamed: 0,age,number_dependent_family_members,monthly_income,number_of_credit_lines,real_estate_loans,ratio_debt_payment_to_income,credit_line_utilization,number_of_previous_late_payments_up_to_59_days,number_of_previous_late_payments_up_to_89_days,number_of_previous_late_payments_90_days_or_more,defaulted_on_loan
2,31.000000,2.0,3040.000000,8.0,0.0,0.160145,1.227135,4.000000,0.0,0.000000,1
20,48.000000,1.0,10800.000000,11.0,3.0,0.590316,0.975767,3.000000,0.0,0.000000,1
30,42.168606,4.0,6448.000000,11.0,0.0,0.190107,0.865913,3.000000,1.0,0.000000,1
44,45.000000,0.0,4938.000000,1.0,0.0,0.068840,1.000000,0.000000,1.0,2.000000,1
65,44.000000,3.0,3772.000000,8.0,1.0,0.537503,0.207312,0.000000,0.0,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...
4612,54.000000,3.0,9350.000000,5.0,1.0,32.380957,0.378989,0.000000,0.0,0.000000,0
46753,52.000000,1.0,6388.000000,8.0,0.0,0.166223,0.087128,0.000000,0.0,0.014701,0
12434,74.000000,0.0,1320.000000,8.0,1.0,1.294474,0.000000,0.114449,0.0,0.000000,0
45337,30.000000,0.0,5000.000000,7.0,0.0,0.203159,0.562887,0.000000,0.0,0.000000,0


In [101]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix,precision_recall_curve, \
roc_auc_score,roc_curve,recall_score,classification_report, f1_score
from sklearn.ensemble import VotingClassifier

In [15]:
X_und = df_und[columns]
y_und = pd.DataFrame(df_und.iloc[:, -1])

X_und = X_und.reset_index(drop=False)
y_und = y_und.reset_index(drop=False)

X_und = X_und.drop(['index'], axis=1)
y_und = y_und.drop(['index'], axis=1)

In [104]:
kf = KFold(8, shuffle=True)
skf = StratifiedKFold(5)

In [19]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_und):
    X_train, X_test = X_und.iloc[train_idx], X_und.iloc[test_idx]
    y_train, y_test = y_und.iloc[train_idx], y_und.iloc[test_idx]
    
    pipe = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', VotingClassifier(estimators=[
            ('tree3', DecisionTreeClassifier(max_depth=3)),
            ('tree9', DecisionTreeClassifier(max_depth=9)),
            ('tree15', DecisionTreeClassifier(max_depth=15)),
            ('tree21', DecisionTreeClassifier(max_depth=21)),
            ('tree35', DecisionTreeClassifier(max_depth=35)),
        ], voting='hard'))
    ])
    pipe.fit(X_train, y_train)
    y_predicted=pipe.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.7062855603345252
0.6928531515091951


In [19]:
from sklearn.ensemble import BaggingClassifier

In [41]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_und):
    X_train, X_test = X_und.iloc[train_idx], X_und.iloc[test_idx]
    y_train, y_test = y_und.iloc[train_idx], y_und.iloc[test_idx]
    
    pipe2 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                        n_estimators=10))
    ])
    pipe2.fit(X_train, y_train.values.ravel())
    y_predicted=pipe2.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


0.7309655443939062
0.6860645213588288


In [42]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe2.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.8548294895277755


In [32]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_und):
    X_train, X_test = X_und.iloc[train_idx], X_und.iloc[test_idx]
    y_train, y_test = y_und.iloc[train_idx], y_und.iloc[test_idx]
    
    pipe1 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', BaggingClassifier(base_estimator=RandomForestClassifier(),
                                        n_estimators=10))
    ])
    pipe1.fit(X_train, y_train)
    y_predicted=pipe1.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.7664302905851895
0.7449503019700334


In [33]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe1.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.852779037773152


In [39]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_und):
    X_train, X_test = X_und.iloc[train_idx], X_und.iloc[test_idx]
    y_train, y_test = y_und.iloc[train_idx], y_und.iloc[test_idx]
    
    pipe4 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', BaggingClassifier(base_estimator=VotingClassifier(estimators=[
            ('tree3', DecisionTreeClassifier(max_depth=3)),
            ('tree9', DecisionTreeClassifier(max_depth=9)),
            ('tree15', DecisionTreeClassifier(max_depth=15)),
            ('tree21', DecisionTreeClassifier(max_depth=21)),
            ('tree35', DecisionTreeClassifier(max_depth=35)),
        ], voting='hard')) )
    ])
    pipe4.fit(X_train, y_train)
    y_predicted=pipe4.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.7480409964092062
0.6952959885374963


In [40]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe4.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.8313835174252251


## Let's try RandomUnderSampler

In [58]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [30]:
Counter(y_new)

Counter({0: 66461, 1: 4974})

In [25]:
rus = RandomUnderSampler()

In [26]:
X_rus, y_rus = rus.fit_resample(X_new, y_new)

In [31]:
Counter(y_rus)

Counter({0: 4974, 1: 4974})

In [50]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_rus):
    X_train, X_test = X_rus.iloc[train_idx], X_rus.iloc[test_idx]
    y_train, y_test = y_rus.iloc[train_idx], y_rus.iloc[test_idx]
    
    pipe7 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                        n_estimators=10))
    ])
    pipe7.fit(X_train, y_train)
    y_predicted=pipe7.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


0.734524559721736
0.6884159100226204


In [52]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe7.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.8663647006775216


### Pipe7 is the best so far

### Pipe 71 (random forest, undersampling)

In [108]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_rus):
    X_train, X_test = X_rus.iloc[train_idx], X_rus.iloc[test_idx]
    y_train, y_test = y_rus.iloc[train_idx], y_rus.iloc[test_idx]
    
    pipe71 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', RandomForestClassifier())
    ])
    pipe71.fit(X_train, y_train)
    y_predicted=pipe71.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


0.7602874961634885
0.7380651204328005


In [109]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe71.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.8821445737361475


In [116]:
from catboost import CatBoostClassifier

In [118]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_rus):
    X_train, X_test = X_rus.iloc[train_idx], X_rus.iloc[test_idx]
    y_train, y_test = y_rus.iloc[train_idx], y_rus.iloc[test_idx]
    
    pipe72 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', CatBoostClassifier(iterations=500,
                             learning_rate=0.02,
                             depth=12,
                             eval_metric='AUC',
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             od_wait=100))
    ])
    pipe72.fit(X_train, y_train)
    y_predicted=pipe72.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


0:	total: 386ms	remaining: 3m 12s
1:	total: 581ms	remaining: 2m 24s
2:	total: 824ms	remaining: 2m 16s
3:	total: 1.03s	remaining: 2m 8s
4:	total: 1.25s	remaining: 2m 3s
5:	total: 1.46s	remaining: 2m
6:	total: 1.65s	remaining: 1m 56s
7:	total: 1.86s	remaining: 1m 54s
8:	total: 2.06s	remaining: 1m 52s
9:	total: 2.28s	remaining: 1m 51s
10:	total: 2.48s	remaining: 1m 50s
11:	total: 2.69s	remaining: 1m 49s
12:	total: 2.9s	remaining: 1m 48s
13:	total: 3.11s	remaining: 1m 48s
14:	total: 3.33s	remaining: 1m 47s
15:	total: 3.54s	remaining: 1m 46s
16:	total: 3.76s	remaining: 1m 46s
17:	total: 4.02s	remaining: 1m 47s
18:	total: 4.34s	remaining: 1m 49s
19:	total: 4.56s	remaining: 1m 49s
20:	total: 4.78s	remaining: 1m 49s
21:	total: 5.01s	remaining: 1m 48s
22:	total: 5.23s	remaining: 1m 48s
23:	total: 5.46s	remaining: 1m 48s
24:	total: 5.67s	remaining: 1m 47s
25:	total: 5.68s	remaining: 1m 43s
26:	total: 5.9s	remaining: 1m 43s
27:	total: 6.11s	remaining: 1m 43s
28:	total: 6.34s	remaining: 1m 43s
29:

In [119]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe72.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.8357831188907928


In [62]:
ros = RandomOverSampler(sampling_strategy=0.5)
rus = RandomUnderSampler(sampling_strategy=0.8)

In [63]:
X_ros, y_ros = ros.fit_resample(X_new, y_new)
X_comb, y_comb = rus.fit_resample(X_ros, y_ros)

In [64]:
Counter(y_ros)

Counter({0: 66461, 1: 33230})

In [65]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_comb):
    X_train, X_test = X_comb.iloc[train_idx], X_comb.iloc[test_idx]
    y_train, y_test = y_comb.iloc[train_idx], y_comb.iloc[test_idx]
    
    pipe8 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                        n_estimators=10))
    ])
    pipe8.fit(X_train, y_train)
    y_predicted=pipe8.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


0.972507046952549
0.9968328839842586


In [66]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe8.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.9880622332261007


### Lets try SMOTE

In [84]:
from imblearn.over_sampling import SMOTE
from collections import Counter
import imblearn

In [46]:
counter = Counter(y_new)
counter

Counter({0: 66461, 1: 4974})

In [86]:
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = imblearn.pipeline.Pipeline(steps=steps)

In [87]:
X_smote, y_smote = pipeline.fit_resample(X_new, y_new)

In [88]:
counter = Counter(y_smote)
counter

Counter({0: 13292, 1: 6646})

In [89]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_smote):
    X_train, X_test = X_smote.iloc[train_idx], X_smote.iloc[test_idx]
    y_train, y_test = y_smote.iloc[train_idx], y_smote.iloc[test_idx]
    
    pipe6 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                        n_estimators=10))
    ])
    pipe6.fit(X_train, y_train.values.ravel())
    y_predicted=pipe6.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))

0.7501360121943943
0.6018239840331392


In [90]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe6.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.9083991070270396


In [97]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_smote):
    X_train, X_test = X_smote.iloc[train_idx], X_smote.iloc[test_idx]
    y_train, y_test = y_smote.iloc[train_idx], y_smote.iloc[test_idx]
    
    pipe6 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier',RandomForestClassifier())
    ])
    pipe6.fit(X_train, y_train.values.ravel())
    y_predicted=pipe6.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))

0.7773541562952422
0.6512334749874402


In [98]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe6.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.9323260607770602


In [80]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_rus):
    X_train, X_test = X_rus.iloc[train_idx], X_rus.iloc[test_idx]
    y_train, y_test = y_rus.iloc[train_idx], y_rus.iloc[test_idx]
    
    pipe9 = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', BaggingClassifier(base_estimator=XGBClassifier(),
                                        n_estimators=10))
    ])
    pipe9.fit(X_train, y_train)
    y_predicted=pipe9.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))
































0.7653641612490674
0.745056984143311


In [81]:
roc = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_new, y_new) 
    y_predicted=pipe9.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
print(np.mean(roc))

0.819894573393763


In [35]:
test = pd.read_csv('test.csv')

In [36]:
test['credit_line_utilization'] = test['credit_line_utilization'].str.replace(',', '.').astype(float)

In [37]:
imp_mean = IterativeImputer(random_state=0)
X_imputed = imp_mean.fit_transform(test.iloc[:, 1:])
X_imputed = pd.DataFrame(X_imputed)
X_imputed.columns = columns
test = X_imputed

In [120]:
test_prediction = pipe71.predict(test)

In [121]:
submission = pd.DataFrame({'Predicted':test_prediction})

In [122]:
submission = submission.set_index(pd.Index(np.arange(1, 48109))).reset_index()

In [123]:
submission

Unnamed: 0,index,Predicted
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0
...,...,...
48103,48104,0
48104,48105,0
48105,48106,0
48106,48107,0


In [124]:
submission.to_csv('submission11.csv')