# **Import libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('default')
import seaborn as  sns
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

# **Read data**

In [None]:
# Load the raw application table from a CSV given by a relative path
# (resolved against the current working directory).
application_train = pd.read_csv('application_train.csv')

In [None]:
# Work on a copy so the in-place cleaning applied to train_df later on
# does not modify the loaded frame application_train.
train_df = application_train.copy()
# Preview the first rows.
train_df.head()

# **Cleaning data**

In [None]:
# True if at least one row is an exact repeat of an earlier row.
train_df.duplicated(keep='first').to_numpy().any()

In [None]:
# Drop NA / XNA / Unknown
# Columns removed from the feature set.
removed_columns = [
    'SK_ID_CURR',
    'ORGANIZATION_TYPE',
    'OCCUPATION_TYPE',
    'FONDKAPREMONT_MODE',
    'WALLSMATERIAL_MODE',
    'WEEKDAY_APPR_PROCESS_START',
]
train_df.drop(columns=removed_columns, inplace=True)

# Rows holding placeholder values are removed in a single pass.
placeholder_rows = (
    (train_df['CODE_GENDER'] == 'XNA')
    | (train_df['NAME_FAMILY_STATUS'] == 'Unknown')
    # 365243 is NA values ( https://www.kaggle.com/competitions/home-credit-default-risk/discussion/57247)
    | (train_df['DAYS_EMPLOYED'] == 365243)
    # NOTE(review): presumably a single extreme income outlier - confirm.
    | (train_df['AMT_INCOME_TOTAL'] == 117000000)
)
train_df.drop(index=train_df.loc[placeholder_rows].index, inplace=True)

# Missing external scores are replaced by the constant 0.5.
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
train_df[ext_source_cols] = train_df[ext_source_cols].fillna(0.5)

In [None]:
# Share of missing values per column, in percent, measured before anything is dropped here.
percent_missing = train_df.isnull().sum() * 100 / len(train_df)
pct = percent_missing.tolist()

# Remove every column with more than 15% missing values, then every row that
# still contains at least one missing value.
sparse_columns = percent_missing[percent_missing > 15].index
train_df.drop(columns=sparse_columns, inplace=True)
train_df.dropna(how='any', axis=0, inplace=True)

# Report the pre-drop missing percentage of the columns that were kept.
# percent_missing still covers every original column, so it is restricted to the
# surviving ones; pairing the full series with the shorter train_df.columns
# raises a length-mismatch ValueError as soon as one column has been dropped.
missing_value_df = pd.DataFrame({'column_name': train_df.columns,
                                 'percent_missing': percent_missing.loc[train_df.columns]})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df

# **Encoding**

In [None]:
def barplot_mean(x, y, df, hue=None, order=None, hue_order=None):
    """Print the mean of ``y`` per ``x`` group and draw it as a bar plot.

    Parameters
    ----------
    x : column of ``df`` used for grouping (one bar per value).
    y : column of ``df`` whose mean is printed and plotted.
    df : DataFrame holding both columns.
    hue, order, hue_order : passed through unchanged to ``sns.barplot``.

    Returns
    -------
    None. The group means are printed and the figure is shown.
    """
    group_means = df.groupby(x)[y].mean()
    print(group_means)

    # Use a wide figure when there are more than four distinct x values.
    if df[x].nunique() > 4:
        plt.figure(figsize=(16,4))

    sns.barplot(
        data=df,
        x=x,
        y=y,
        hue=hue,
        order=order,
        hue_order=hue_order,
        estimator=np.mean,
    )
    plt.show()

In [None]:
# NAME_TYPE_SUITE: keep 'Unaccompanied', relabel every other value as 'Accompanied'
is_unaccompanied = train_df['NAME_TYPE_SUITE'] == 'Unaccompanied'
train_df['NAME_TYPE_SUITE'] = train_df['NAME_TYPE_SUITE'].where(is_unaccompanied, 'Accompanied')
# Optional check: barplot_mean('NAME_TYPE_SUITE','TARGET', train_df)

In [None]:
# NAME_INCOME_TYPE: keep 'Working', relabel every other value as 'NotWorking'
is_working = train_df['NAME_INCOME_TYPE'] == 'Working'
train_df['NAME_INCOME_TYPE'] = train_df['NAME_INCOME_TYPE'].where(is_working, 'NotWorking')
# Optional check: barplot_mean('NAME_INCOME_TYPE','TARGET', train_df)

In [None]:
# NAME_HOUSING_TYPE: keep 'House / apartment', relabel every other value as 'Not Owner'
is_house_or_apartment = train_df['NAME_HOUSING_TYPE'] == 'House / apartment'
train_df['NAME_HOUSING_TYPE'] = train_df['NAME_HOUSING_TYPE'].where(is_house_or_apartment, 'Not Owner')
# Optional check: barplot_mean('NAME_HOUSING_TYPE','TARGET', train_df)

In [None]:
# NAME_FAMILY_STATUS: keep 'Married', relabel every other value as 'Not Married'
is_married = train_df['NAME_FAMILY_STATUS'] == 'Married'
train_df['NAME_FAMILY_STATUS'] = train_df['NAME_FAMILY_STATUS'].where(is_married, 'Not Married')
# Optional check: barplot_mean('NAME_FAMILY_STATUS','TARGET', train_df)

In [None]:
# NAME_EDUCATION_TYPE: keep 'Lower secondary', relabel every other value as 'From Secondary'
is_lower_secondary = train_df['NAME_EDUCATION_TYPE'] == 'Lower secondary'
train_df['NAME_EDUCATION_TYPE'] = train_df['NAME_EDUCATION_TYPE'].where(is_lower_secondary, 'From Secondary')
# Optional check: barplot_mean('NAME_EDUCATION_TYPE','TARGET', train_df)

In [None]:
# category --> one-hot encoding
# Object-dtype columns are the categorical features to encode.
cat_ft = train_df.dtypes[train_df.dtypes == 'object'].index

# Output labels for OneHotEncoder(drop='first'). The encoder orders each column's
# categories by sorted value and drops the first of them, so the labels must follow
# sorted order too; order of appearance (unique()[1:]) can attach names to the
# wrong columns. sorted() assumes the column holds no missing values, which the
# earlier dropna guarantees.
cat_cols_encoded = [
    f"{col}_{cat}"
    for col in cat_ft
    for cat in sorted(train_df[col].unique())[1:]
]

In [None]:
# One-hot encode the categorical columns; drop='first' removes the first category
# of every feature to avoid a redundant column.
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 - confirm the installed version.
OHE = OneHotEncoder(sparse=False, handle_unknown='error',drop = 'first')
encoded_cols = OHE.fit_transform(train_df[cat_ft])

# Label the encoded columns from the fitted encoder itself: categories_ lists each
# feature's categories in the order used by transform(), and drop='first' removes
# the first entry. Names built from order of appearance can be attached to the
# wrong columns.
cat_cols_encoded = [
    f"{col}_{cat}"
    for col, cats in zip(cat_ft, OHE.categories_)
    for cat in cats[1:]
]
df_enc = pd.DataFrame(encoded_cols, columns=cat_cols_encoded)

# df_enc carries a fresh 0..n-1 index, so train_df is re-indexed the same way
# before the join to align rows by position.
train_df.reset_index(drop=True,inplace= True)
train_df_enc = train_df.join(df_enc)
train_df_enc.drop(cat_ft, axis=1, inplace=True)

# **Split train/test**

In [None]:
# train test split: 80% train / 20% test with a fixed seed
from sklearn.model_selection import train_test_split

y = train_df_enc['TARGET']
X = train_df_enc.drop(columns=['TARGET'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Scaling data**

In [None]:
# Features to rescale: int64 / float64 columns with more than two distinct values
# (two-valued columns such as the one-hot outputs are left as they are).
# Index.union replaces the `|` operator between indexes: `|` as a set union is
# deprecated and is no longer a union in pandas 2.x.
int_cols = X_train.dtypes[X_train.dtypes == 'int64'].index
float_cols = X_train.dtypes[X_train.dtypes == 'float64'].index
col_fts = [
    column
    for column in int_cols.union(float_cols)
    if X_train[column].nunique(dropna=False) > 2
]

In [None]:
# continuous --> scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn each feature's min/max from the training split only...
X_train[col_fts] = scaler.fit_transform(X_train[col_fts])
# ...and apply those same parameters to the test split. Re-fitting on X_test would
# scale the two splits differently and leak test-set statistics into preprocessing.
X_test[col_fts] = scaler.transform(X_test[col_fts])

# **Handle imbalanced data**

In [None]:
# fix imbalance: build one over-sampled and one under-sampled version of the training split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# Over-sampling with SMOTE (seeded).
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Under-sampling with NearMiss (default settings).
nm = NearMiss()
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)

# **Building model and evaluating**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [None]:
# KNN no Resample: fitted on the training split as-is

knn_clf = KNeighborsClassifier(n_neighbors=21).fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)

# Text report first, then the confusion matrix figure.
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
plt.close()

In [None]:
# KNN with SMOTE: fitted on the SMOTE-resampled training data

knn_clf = KNeighborsClassifier(n_neighbors=21).fit(X_train_sm, y_train_sm)
y_pred_sm = knn_clf.predict(X_test)

# Text report first, then the confusion matrix figure.
cm = confusion_matrix(y_test, y_pred_sm)
print(classification_report(y_test, y_pred_sm))

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
plt.close()

In [None]:
# KNN with NearMiss: fitted on the NearMiss-resampled training data
knn_clf = KNeighborsClassifier(n_neighbors=21).fit(X_train_nm, y_train_nm)
y_pred_nm = knn_clf.predict(X_test)

# Text report first, then the confusion matrix figure.
cm = confusion_matrix(y_test, y_pred_nm)
print(classification_report(y_test, y_pred_nm))

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
plt.close()