In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df['credit_line_utilization'] = df['credit_line_utilization'].str.replace(',', '.').astype(float)

In [4]:
columns = df.iloc[:, 1:-1].columns
columns

Index(['age', 'number_dependent_family_members', 'monthly_income',
       'number_of_credit_lines', 'real_estate_loans',
       'ratio_debt_payment_to_income', 'credit_line_utilization',
       'number_of_previous_late_payments_up_to_59_days',
       'number_of_previous_late_payments_up_to_89_days',
       'number_of_previous_late_payments_90_days_or_more'],
      dtype='object')

In [5]:
X, y = df[columns], df.iloc[:, -1]

### Missing value imputation

In [15]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp_mean = IterativeImputer(random_state=0)

In [8]:
imp_mean = IterativeImputer(random_state=0)
X_imputed = imp_mean.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed)
X_imputed.columns = columns
X_imputed.head()

Unnamed: 0,age,number_dependent_family_members,monthly_income,number_of_credit_lines,real_estate_loans,ratio_debt_payment_to_income,credit_line_utilization,number_of_previous_late_payments_up_to_59_days,number_of_previous_late_payments_up_to_89_days,number_of_previous_late_payments_90_days_or_more
0,66.0,0.531495,4000.0,9.435243,1.0,0.569108,0.054888,0.0,0.0,0.0
1,61.0,2.0,4000.0,6.0,1.0,0.297176,0.10195,0.0,0.0,0.0
2,31.0,2.0,3040.0,8.0,0.0,0.160145,1.227135,4.0,0.0,0.0
3,54.0,4.0,10218.0,5.0,0.0,0.067913,0.083278,0.0,0.0,0.0
4,29.0,0.0,4468.0,6.0,0.0,0.328261,0.317446,0.0,0.0,0.0


### Outlier removal

In [10]:
from sklearn.svm import OneClassSVM
from icecream import ic

In [13]:
ee = OneClassSVM(nu=.01)
yhat = ee.fit_predict(X_imputed)
mask = yhat != -1
X_out, y_out = X_imputed[mask], y[mask]

#### X_new and y_new are imputed and outlier free

In [14]:
X_new, y_new = X_out, y_out

Balance the dataset by undersampling

In [16]:
# Concatinate imputed and 
data_new = pd.concat([X_new, y_new], axis=1)

In [17]:
number_of_1s = len(data_new[data_new['defaulted_on_loan'] == 1])
ic(number_of_1s)
ones = data_new[data_new['defaulted_on_loan'] == 1]
ones

ic| number_of_1s: 4974


Unnamed: 0,age,number_dependent_family_members,monthly_income,number_of_credit_lines,real_estate_loans,ratio_debt_payment_to_income,credit_line_utilization,number_of_previous_late_payments_up_to_59_days,number_of_previous_late_payments_up_to_89_days,number_of_previous_late_payments_90_days_or_more,defaulted_on_loan
2,31.000000,2.000000,3040.000000,8.000000,0.0,0.160145,1.227135,4.0,0.0,0.0,1
20,48.000000,1.000000,10800.000000,11.000000,3.0,0.590316,0.975767,3.0,0.0,0.0,1
30,42.168606,4.000000,6448.000000,11.000000,0.0,0.190107,0.865913,3.0,1.0,0.0,1
44,45.000000,0.000000,4938.000000,1.000000,0.0,0.068840,1.000000,0.0,1.0,2.0,1
65,44.000000,3.000000,3772.000000,8.000000,1.0,0.537503,0.207312,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
72070,24.000000,1.246024,3418.846084,0.000000,0.0,0.013540,1.000000,1.0,0.0,3.0,1
72086,45.000000,1.000000,10614.829761,8.000000,4.0,0.485349,0.073525,0.0,0.0,0.0,1
72120,51.877978,0.000000,2500.000000,4.000000,0.0,0.552579,0.465445,0.0,1.0,1.0,1
72129,62.000000,0.000000,945.000000,7.814023,0.0,0.707188,3.990244,1.0,0.0,0.0,1


In [18]:
zeros = data_new[data_new['defaulted_on_loan'] == 0]
zeros = zeros.sample(n=4974, replace=False)

# data undersampled
df_und = ones.append(zeros)
df_und.reset_index(drop=True)
df_und

Unnamed: 0,age,number_dependent_family_members,monthly_income,number_of_credit_lines,real_estate_loans,ratio_debt_payment_to_income,credit_line_utilization,number_of_previous_late_payments_up_to_59_days,number_of_previous_late_payments_up_to_89_days,number_of_previous_late_payments_90_days_or_more,defaulted_on_loan
2,31.000000,2.000000,3040.0,8.0,0.0,0.160145,1.227135,4.00000,0.000000,0.0,1
20,48.000000,1.000000,10800.0,11.0,3.0,0.590316,0.975767,3.00000,0.000000,0.0,1
30,42.168606,4.000000,6448.0,11.0,0.0,0.190107,0.865913,3.00000,1.000000,0.0,1
44,45.000000,0.000000,4938.0,1.0,0.0,0.068840,1.000000,0.00000,1.000000,2.0,1
65,44.000000,3.000000,3772.0,8.0,1.0,0.537503,0.207312,0.00000,0.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
51773,50.000000,3.000000,9000.0,9.0,2.0,0.397511,5.749412,0.22689,0.000000,0.0,0
8119,46.000000,3.000000,2715.0,5.0,1.0,0.445508,1.000000,0.00000,0.000000,0.0,0
40597,63.000000,1.000000,12000.0,18.0,1.0,0.093576,0.079540,0.00000,0.000000,0.0,0
52176,42.000000,3.000000,5700.0,7.0,0.0,0.189616,0.223659,0.00000,0.000000,0.0,0


In [19]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix,precision_recall_curve, \
roc_auc_score,roc_curve,recall_score,classification_report, f1_score
from sklearn.ensemble import VotingClassifier

In [20]:
X_und = df_und[columns]
y_und = pd.DataFrame(df_und.iloc[:, -1])

X_und = X_und.reset_index(drop=False)
y_und = y_und.reset_index(drop=False)

X_und = X_und.drop(['index'], axis=1)
y_und = y_und.drop(['index'], axis=1)

In [21]:
kf = KFold(5, shuffle=True)

In [22]:
roc = []
recall = []
for train_idx, test_idx in kf.split(X_und):
    X_train, X_test = X_und.iloc[train_idx], X_und.iloc[test_idx]
    y_train, y_test = y_und.iloc[train_idx], y_und.iloc[test_idx]
    
    pipe = Pipeline(steps=[
        ('preprocessing', ColumnTransformer(transformers=[
            ('numeric', Pipeline(steps=[
                ('scale', StandardScaler())
            ]), columns),
        ])),
        ('classifier', VotingClassifier(estimators=[
            ('tree3', DecisionTreeClassifier(max_depth=3)),
            ('tree9', DecisionTreeClassifier(max_depth=9)),
            ('tree15', DecisionTreeClassifier(max_depth=15)),
            ('tree21', DecisionTreeClassifier(max_depth=21)),
            ('tree35', DecisionTreeClassifier(max_depth=35)),
        ], voting='hard'))
    ])
    pipe.fit(X_train, y_train)
    y_predicted=pipe.predict(X_test)
    roc.append(roc_auc_score(y_test, y_predicted))
    recall.append(recall_score(y_test, y_predicted))
    
print(np.mean(roc))
print(np.mean(recall))
# print(np.std(accuracies))


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.7130372784204075
0.7202056179774317


In [23]:
test = pd.read_csv('test.csv')

In [24]:
test['credit_line_utilization'] = test['credit_line_utilization'].str.replace(',', '.').astype(float)

In [25]:
imp_mean = IterativeImputer(random_state=0)
X_imputed = imp_mean.fit_transform(test.iloc[:, 1:])
X_imputed = pd.DataFrame(X_imputed)
X_imputed.columns = columns
test = X_imputed

In [26]:
test_prediction = pipe.predict(test)

In [28]:
submission = pd.DataFrame({'Predicted':test_prediction})

In [32]:
submission = submission.set_index(pd.Index(np.arange(1, 48109))).reset_index()

In [33]:
submission

Unnamed: 0,index,Predicted
0,1,1
1,2,1
2,3,0
3,4,1
4,5,1
...,...,...
48103,48104,0
48104,48105,0
48105,48106,0
48106,48107,0


In [34]:
submission.to_csv('submission3.csv')