In [1]:
!pip install keras
!pip install tensorflow
!pip install xgboost
!pip install catboost




















In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 

## Models
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb 
from keras.models import Sequential
from keras.layers import Dense
from sklearn.neural_network import MLPClassifier

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve


In [3]:
# Read the raw training table from the working directory.
# NOTE(review): a later cell replaces alphabetic cell values with NaN, which suggests
# some columns mix numbers with text placeholders -- TODO confirm which ones.
train_df = pd.read_csv("TrainingData.csv")
# Display (rows, columns) as a quick sanity check of what was loaded.
train_df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(83000, 53)

In [4]:
# Work on the feature columns only: 'mvar47' is handled separately below, and
# 'application_key' / 'default_ind' (used later as the target y) are left out.
train_df1 = train_df.drop(columns=['mvar47', 'application_key', 'default_ind'])

# Every cell that contains alphabetic characters becomes NaN, the result is cast
# to float, and the untouched 'mvar47' column is appended back on the right.
train_df2 = pd.concat(
    [
        train_df1.replace(to_replace="[a-zA-Z]+", value=np.nan, regex=True).astype('float'),
        train_df['mvar47'],
    ],
    axis=1,
)

In [5]:
# One-hot encode 'mvar47': the column is replaced by one 'mvar47_<level>' column per level.
# train_df2 is overwritten here, so this cell cannot be re-run on its own once
# 'mvar47' has been consumed (re-run the previous cell first).
train_df2 = pd.get_dummies(train_df2, columns=['mvar47'])

# **Exploratory Data Analysis**

In [6]:
# Load the test table and put it through the same cleaning steps as the training data.
test_df = pd.read_csv("testX.csv")
test_df1 = test_df.drop(columns=['mvar47'])

# Cells containing letters -> NaN, cast to float, re-attach 'mvar47', one-hot encode it,
# and finally discard 'application_key' (it is read again from test_df for the submission).
test_df2 = (
    pd.concat(
        [
            test_df1.replace(to_replace="[a-zA-Z]+", value=np.nan, regex=True).astype('float'),
            test_df['mvar47'],
        ],
        axis=1,
    )
    .pipe(pd.get_dummies, columns=['mvar47'])
    .drop(columns='application_key')
)

# Both frames are collected so the feature-combination step can treat them alike.
df_list = [test_df2, train_df2]

In [7]:
# Combine related columns in place, on both the test and the training frame.
# The cell is not idempotent: running it twice compounds the combined columns again.
for frame in df_list:
    # mvar3 <- (1 + mvar3) * (1 + mvar4) * (1 + mvar5), multiplied left to right
    compounded = 1 + frame['mvar3']
    for other in ('mvar4', 'mvar5'):
        compounded = compounded * (1 + frame[other])
    frame['mvar3'] = compounded

    # mvar7 <- mvar7 + mvar8
    frame['mvar7'] = frame['mvar7'] + frame['mvar8']

    # mvar16 <- mvar16 + mvar17 + mvar18; at each addition a value missing on one
    # side is treated as 0 (the sum is NaN only when both sides are missing)
    running_total = frame['mvar16']
    for other in ('mvar17', 'mvar18'):
        running_total = running_total.add(frame[other], fill_value=0)
    frame['mvar16'] = running_total

    # mvar26 <- mean of mvar26 and mvar27, then divided by 365
    frame['mvar26'] = (frame['mvar26'] + frame['mvar27']) / 2 / 365

    # mvar35 <- mvar35 + mvar34
    frame['mvar35'] = frame['mvar35'] + frame['mvar34']

    # mvar45 <- mean of mvar45 and mvar46
    frame['mvar45'] = (frame['mvar45'] + frame['mvar46']) / 2

In [8]:
# The columns folded into combined features by the previous cell are no longer needed
# on their own, so they are removed. mvar48 and mvar49 are removed here as well.
train_data = train_df2.drop(
    columns=[
        'mvar4', 'mvar5', 'mvar8', 'mvar17', 'mvar18',
        'mvar27', 'mvar34', 'mvar46', 'mvar48', 'mvar49',
    ]
)

In [9]:
# Remove the two columns reported to have more than 70% NULL values.
train_data1 = train_data.drop(columns=['mvar31', 'mvar40'])

**Scaling and normalisation**

Scaling is necessary because we will be using KNN imputation as well as SMOTE for oversampling, which is also based on the KNN methodology. For this reason, we need to scale the data so that higher magnitudes don't lead to the formation of clusters when using KNN.

*Standardization*


In [10]:
# importing sklearn StandardScaler class which is for Standardization
#from sklearn.preprocessing import StandardScaler

#sc = StandardScaler() # creating an instance of the class object
#df_scaled = pd.DataFrame(sc.fit_transform(train_df4), columns=train_df4.columns)  #fit and transforming StandardScaler the dataframe 

*Robust Scaling*

In [11]:
# RobustScaler (sklearn.preprocessing) performs the robust scaling of the features.
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features, then apply it to those same features.
rs = RobustScaler().fit(train_data1)
X_tr_sc = rs.transform(train_data1)

# The scaler returns a bare array, so the column labels are re-attached.
train_data_scaled = pd.DataFrame(data=X_tr_sc, columns=train_data1.columns)

**Dropping Highly correlated and high VIF columns**

In [12]:
# Columns singled out as highly correlated / high-VIF are excluded from modelling.
train_data_scaled1 = train_data_scaled.drop(columns=['mvar20', 'mvar32', 'mvar10'])

**Imputing NULL values**

In [13]:
def new_col_for_null(df, column):
    """Add a 0/1 missing-value indicator for ``column`` to ``df``.

    The indicator is written to ``df[column + '_null']``: 1 where the value in
    ``column`` is null, 0 otherwise. ``df`` is modified in place and is also
    returned.
    """
    is_missing = df[column].isnull()
    df[column + '_null'] = np.where(is_missing, 1, 0)
    return df

In [14]:
# Add a '<column>_null' indicator for every existing column. The column names are
# snapshotted first because the helper appends a new column on every call.
# Re-running this cell would also create '<column>_null_null' indicators.
for feature in list(train_data_scaled1.columns):
    new_col_for_null(train_data_scaled1, feature)

In [None]:
from sklearn.impute import KNNImputer

# Fit a 3-nearest-neighbour imputer on the scaled training frame (features plus the
# '_null' indicators) and use it to fill the missing values of that same frame.
imputer = KNNImputer(n_neighbors=3).fit(train_data_scaled1)
imputed_1 = imputer.transform(train_data_scaled1)

# The imputer returns a bare array, so the column labels are re-attached.
train_data2 = pd.DataFrame(data=imputed_1, columns=train_data_scaled1.columns)

In [None]:
# Leave out the 'mvar47_C' dummy column (presumably to keep a single reference
# level for the one-hot encoded 'mvar47' -- confirm).
train_data3 = train_data2.drop(columns='mvar47_C')

In [None]:
# Feature matrix: the cleaned training frame. Target: 'default_ind' from the raw table.
X, y = train_data3, train_df['default_ind']

# **Model Building**

#### **Dataset Balancing using SMOTETOMEK**

In [None]:
# Per the original analysis the classes are highly skewed, so the training set is
# rebalanced with imblearn's SMOTETomek before any model is fitted.
# NOTE(review): the resampling happens before the cross-validated grid search further
# down, so resampled rows can end up in validation folds -- the CV scores may be
# optimistic; confirm this is intended.

from imblearn.combine import SMOTETomek
smt = SMOTETomek(random_state=42)  # fixed seed so the resampling is repeatable
X_sampled1, Y_sampled1 = smt.fit_resample(X, y)

# Display the shape of the resampled feature matrix.
X_sampled1.shape

In [None]:
# Logistic regression fitted on the resampled training data. The same `logreg`
# object is later handed to GridSearchCV as its base estimator.
# max_iter is set to 10000 (presumably to avoid convergence warnings -- confirm).
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_sampled1, Y_sampled1)

In [None]:
# Hyperparameter tuning of the logistic regression via an exhaustive grid search.

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Candidate values: 3 solvers x 1 penalty x 5 regularisation strengths = 15 settings.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

# Each setting is scored on 10 stratified folds repeated 3 times (30 fits per setting).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Settings are ranked by F1; a fit that raises is scored 0 instead of aborting the search.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_sampled1, Y_sampled1)

# **Working on COMPANY PROVIDED TEST DATA**

In [None]:
# Same clean-up as for the training data: the columns already folded into combined
# features are removed, together with mvar48 and mvar49.
test_df3 = test_df2.drop(
    columns=[
        'mvar4', 'mvar5', 'mvar8', 'mvar17', 'mvar18',
        'mvar27', 'mvar34', 'mvar46', 'mvar48', 'mvar49',
    ]
)

In [None]:
# Remove the two columns reported to have more than 70% NULL values.
# test_df3 is overwritten, so re-running this cell alone fails (the columns are already gone).
test_df3 = test_df3.drop(columns=['mvar31', 'mvar40'])

**Scaling and normalisation**

Scaling is necessary because we will be using KNN imputation as well as SMOTE for oversampling, which is also based on the KNN methodology. For this reason, we need to scale the data so that higher magnitudes don't lead to the formation of clusters when using KNN.

*Standardization*


In [None]:
# importing sklearn StandardScaler class which is for Standardization
#from sklearn.preprocessing import StandardScaler

#sc = StandardScaler() # creating an instance of the class object
#df_scaled = pd.DataFrame(sc.fit_transform(train_df4), columns=train_df4.columns)  #fit and transforming StandardScaler the dataframe 

*Robust Scaling*

In [None]:
# Scale the test features with `rs`, the RobustScaler that was fitted on the training
# features earlier in this notebook. Fitting a fresh scaler on the test set would
# centre and scale it with different statistics than the data the model was trained
# on, so the same numeric value would mean different things in train and test.
assert list(test_df3.columns) == list(train_data1.columns), \
    "test features must match the training features (same columns, same order)"

sc = rs.transform(test_df3)  # transform only -- the scaler is NOT refitted here
scaled_test = pd.DataFrame(sc, columns=test_df3.columns)

**Dropping Highly correlated and high VIF columns**

In [None]:
# Exclude the same highly correlated / high-VIF columns that were removed from the training data.
scaled_test1 = scaled_test.drop(columns=['mvar20', 'mvar32', 'mvar10'])

In [None]:
def new_col_for_null(df, column):
    # Same helper as the one defined for the training data earlier in this notebook;
    # it is redefined here so this section can be run on its own.
    # Writes a 0/1 flag (1 = value is null) to df[column + '_null'], mutating df
    # in place, and hands the same frame back.
    indicator_name = column + '_null'
    df[indicator_name] = np.where(df[column].isna(), 1, 0)
    return df

In [None]:
# Add a '<column>_null' indicator for every existing test column. The names are
# snapshotted first because the helper appends a new column on every call.
# Re-running this cell would also create '<column>_null_null' indicators.
for feature in list(scaled_test1.columns):
    new_col_for_null(scaled_test1, feature)

**Imputing NULL values**

In [None]:
# Fill the missing test values with `imputer`, the KNNImputer that was fitted on the
# training data earlier in this notebook. Refitting a new imputer on the test set
# would draw neighbours from the test rows themselves, so test values would be
# filled differently from the training values the model learned on.
assert list(scaled_test1.columns) == list(train_data_scaled1.columns), \
    "test features must match the training features (same columns, same order)"

imputed = imputer.transform(scaled_test1)  # transform only -- the imputer is NOT refitted here
test_df4 = pd.DataFrame(imputed, columns=scaled_test1.columns)

In [None]:
# Leave out the 'mvar47_C' dummy column, as was done for the training features.
# test_df4 is overwritten, so re-running this cell alone fails (the column is already gone).
test_df4 = test_df4.drop(columns='mvar47_C')

In [None]:
# Predict on the prepared test features with the fitted grid-search object.
X_test = test_df4
y_pred = grid_result.predict(X_test)

# Wrap the predictions in a single-column frame for the submission file.
new = pd.DataFrame({'predictions': y_pred})

In [None]:
# Put the application keys from the raw test table in the first column.
# insert() mutates `new` in place, so re-running this cell raises because the
# 'application_key' column already exists.
new.insert(0, "application_key", test_df['application_key'])

In [None]:
# Replace the index with a fresh 0..n-1 range; the previous index is discarded.
new = new.reset_index(drop=True)

In [None]:
# Write the submission file to the working directory, without the index column.
new.to_csv('Daring_souls_new.csv', index=False)