# Baseline Model


In [39]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [40]:
application_train=pd.read_csv("application_train.csv")
df=application_train.sample(100000,random_state=42)
df.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
245895,384575,0,Cash loans,M,Y,N,2,207000.0,465457.5,52641.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0
98194,214010,0,Cash loans,F,Y,Y,0,247500.0,1281712.5,48946.5,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0
36463,142232,0,Cash loans,F,Y,N,0,202500.0,495000.0,39109.5,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0
249923,389171,0,Cash loans,F,N,Y,0,247500.0,254700.0,24939.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
158389,283617,0,Cash loans,M,N,Y,0,112500.0,308133.0,15862.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0


In [41]:
basline_X=df.drop(columns=["SK_ID_CURR","TARGET"])
baselin_y=df["TARGET"]

In [42]:
from sklearn.model_selection import train_test_split

In [50]:
bl_X_train, bl_X_test, bl_Y_train, bl_Y_test = train_test_split(basline_X,
                                                        baselin_y, test_size=0.2,random_state=42)

## Numeric columns


In [51]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_df = bl_X_train.select_dtypes(include=numerics)
num_col=num_df.columns
num_col[0:5]

Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE'],
      dtype='object')

## Catagorical Columns


In [52]:
text_df = bl_X_train.select_dtypes(exclude=numerics)
text_col=text_df.columns

## Impute Missingness

In [53]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [54]:
bl_pre_col = ColumnTransformer(
    remainder="passthrough",
    transformers=[('imputer', SimpleImputer(missing_values=np.nan, strategy='mean'),num_col),
                  ("ohe",OneHotEncoder(handle_unknown='ignore', dtype='int'),text_col)
                 ])

## Baseline Model: Logistic Regression without class weights


In [55]:
from sklearn.linear_model import LogisticRegression

pl_bl=Pipeline([
    ("base_line_col_pricessing", bl_pre_col),
    ('lr', LogisticRegression())
])
bl_pre_col.fit(bl_X_train.replace([np.inf, -np.inf], np.nan), bl_Y_train)

In [56]:
pl_bl.fit(bl_X_train.replace([np.inf, -np.inf], np.nan), bl_Y_train)

In [57]:
pl_bl.predict(bl_X_test.replace([np.inf, -np.inf], np.nan)).sum()

0

In [58]:
roc_auc_score(bl_Y_test, pl_bl.predict(bl_X_test.replace([np.inf, -np.inf], np.nan)))

0.5

## Baseline Model: Logistic Regression with class weights


In [72]:
from sklearn.linear_model import LogisticRegression

pl_bl=Pipeline([
    ("base_line_col_pricessing", bl_pre_col),
    ('lr', LogisticRegression(class_weight={0:0.08,1:0.92}))
])

bl_pre_col.fit(bl_X_train.replace([np.inf, -np.inf], np.nan), bl_Y_train)

In [73]:
pl_bl.fit(bl_X_train.replace([np.inf, -np.inf], np.nan), bl_Y_train)

In [74]:
roc_auc_score(bl_Y_test, pl_bl.predict(bl_X_test.replace([np.inf, -np.inf], np.nan)))

0.5743419117070236

## Baseline Model: Random Forrest

In [75]:
from sklearn.ensemble import RandomForestClassifier

pl_bl=Pipeline([
    ("base_line_col_pricessing", bl_pre_col),
    ('lr', RandomForestClassifier(class_weight={0:0.08,1:0.92},
                                  max_depth=15))
])

bl_pre_col.fit(bl_X_train.replace([np.inf, -np.inf], np.nan), bl_Y_train)

In [76]:
pl_bl.fit(bl_X_train.replace([np.inf, -np.inf], np.nan), bl_Y_train)

In [77]:
pl_bl.predict(bl_X_test.replace([np.inf, -np.inf], np.nan))

array([1, 0, 0, ..., 0, 0, 0])

In [78]:
roc_auc_score(bl_Y_test, pl_bl.predict(bl_X_test.replace([np.inf, -np.inf], np.nan)))

0.6265917620663406