# Import Python libraries.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn import metrics

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers


# Import Datasets

In [2]:
train_data = pd.read_csv("Training Data.csv",index_col=0)
test_data = pd.read_csv("Test Data.csv")
ss = pd.read_csv('Sample Prediction Dataset.csv')

In [3]:
train_data.describe()

Unnamed: 0,income,age,experience,current_job_years,current_house_years,risk_flag
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,4997117.0,49.954071,10.084437,6.333877,11.997794,0.123
std,2878311.0,17.063863,6.00259,3.647053,1.399037,0.328438
min,10310.0,21.0,0.0,0.0,10.0,0.0
25%,2503015.0,35.0,5.0,3.0,11.0,0.0
50%,5000694.0,50.0,10.0,6.0,12.0,0.0
75%,7477502.0,65.0,15.0,9.0,13.0,0.0
max,9999938.0,79.0,20.0,14.0,14.0,1.0


In [4]:
train_data['risk_flag'].value_counts()/252000

0    0.877
1    0.123
Name: risk_flag, dtype: float64

# Dataset Analysis

#### Insight 
We know that we are dealing with a heavily Imbalanced data set of about 7:1 ratio of non-defaulters to defaulters\
1) First step is selective Undersampling \
2) second Step is Using Cost-Sensitive Algorithms using costcla package\
3) Do Cross val

In [5]:
test_data["profession"] = test_data["profession"].apply(lambda x: x.replace(" ", "_"))
test_data["state"] = test_data["state"].apply(lambda x: x.replace(" ", "_"))
test_data["city"] = test_data["city"].apply(lambda x: x.replace(" ", "_"))
test_data['state'].value_counts()

Uttar_Pradesh        3110
Andhra_Pradesh       2957
Maharashtra          2830
West_Bengal          2593
Bihar                2300
Tamil_Nadu           1926
Madhya_Pradesh       1559
Karnataka            1279
Gujarat              1263
Rajasthan            1041
Jharkhand             970
Haryana               840
Telangana             835
Assam                 763
Kerala                574
Delhi                 564
Punjab                503
Odisha                490
Chhattisgarh          436
Uttarakhand           197
Jammu_and_Kashmir     196
Puducherry            173
Mizoram               115
Manipur               103
Himachal_Pradesh       86
Tripura                84
Uttar_Pradesh[5]       76
Sikkim                 70
Chandigarh             67
Name: state, dtype: int64

In [6]:
for i in train_data.iteritems():
    if str(i[-1].dtype) == 'object':
        print('\033[1m'+i[0]+'\033[0m')
        print(train_data[str(i[0])].describe())
        print(train_data[str(i[0])].value_counts())
        print()

[1mmarried[0m
count     252000
unique         2
top       single
freq      226272
Name: married, dtype: object
single     226272
married     25728
Name: married, dtype: int64

[1mhouse_ownership[0m
count     252000
unique         3
top       rented
freq      231898
Name: house_ownership, dtype: object
rented          231898
owned            12918
norent_noown      7184
Name: house_ownership, dtype: int64

[1mcar_ownership[0m
count     252000
unique         2
top           no
freq      176000
Name: car_ownership, dtype: object
no     176000
yes     76000
Name: car_ownership, dtype: int64

[1mprofession[0m
count        252000
unique           51
top       Physician
freq           5957
Name: profession, dtype: object
Physician                     5957
Statistician                  5806
Web_designer                  5397
Psychologist                  5390
Computer_hardware_engineer    5372
Drafter                       5359
Magistrate                    5357
Fashion_Designer       

In [7]:
train_data['combination'] = train_data['house_ownership'].astype('str')+'_'+train_data['car_ownership'].astype('str')
test_data['combination'] = test_data['house_ownership'].astype('str')+'_'+test_data['car_ownership'].astype('str')

In [8]:
#Stages of life
train_data['free_period'] = train_data['age']-train_data['experience']
train_data['youth'] = (train_data['age'].apply(lambda x:x>=24))*1
train_data['adult'] = (train_data['age'].apply(lambda x: x in range(25,60)))*1
train_data['old'] = (train_data['age'].apply(lambda x: x>60))
train_data['late_marriage'] = (train_data['age'].apply(lambda x: x>=35)).eq(train_data['married'].apply(lambda x: x=='single'))

In [9]:
#Stages of life
test_data['free_period'] = test_data['age']-test_data['experience']
test_data['youth'] = (test_data['age'].apply(lambda x:x>=24))*1
test_data['adult'] = (test_data['age'].apply(lambda x: x in range(25,60)))*1
test_data['old'] = (test_data['age'].apply(lambda x: x>60))
test_data['late_marriage'] = (test_data['age'].apply(lambda x: x>=35)).eq(test_data['married'].apply(lambda x: x=='single'))

#### States

In [10]:
#Cleaning city
for i in range(1,len(train_data['city'])):
    if '[' in str(train_data['city'][i]):
        train_data['city'][i] = train_data['city'][i].split('[')[0]

for i in range(1,len(test_data['city'])):
    if '[' in str(test_data['city'][i]):
        test_data['city'][i] = test_data['city'][i].split('[')[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
#Cleanup the states a bit
train_data.loc[train_data['state']=='Uttar_Pradesh[5]','state'] = 'Uttar_Pradesh'
test_data.loc[test_data['state']=='Uttar_Pradesh[5]','state'] = 'Uttar_Pradesh'

In [12]:
#Define per_capita income by state:
per_capita_income = { 'Goa': '466585' , 'Sikkim' : '425656', 'Delhi' : '376143', 'Chandigarh': '350000', 'Haryana': '247207','Telangana': '225756',
 'Karnataka':'223246','Kerala':'221904', 'Puducherry': '220949', 'Andaman_and_Nicobar_Islands': '219842', 'Tamil_Nadu':'218599',
 'Gujarat':'216329', 'Mizoram': '204018', 'Uttarakhand':'202895', 'Maharashtra': '202130', 'Himachal_Pradesh': '190255',
 'Andhra_Pradesh': '168480','Arunachal_Pradesh': '164615', 'Punjab': '161083', 'Nagaland': '130282', 'Tripura': '125630',
 'Rajasthan': '115492', 'West_Bengal': '115348', 'Odisha':'98896', 'Chhattisgarh':'105281', 'Jammu_and_Kashmir':'102882',
 'Madhya_Pradesh': '103288', 'Meghalaya':'92174', 'Assam':'90758', 'Manipur': '84746', 'Jharkhand':'79873', 'Uttar_Pradesh':'65704','Bihar':'46664'}


#Addition of per capita income in dataset
l=[]
for i in range(1,len(train_data)+1):
    l.append(int(per_capita_income[train_data['state'][i]]))
    
train_data['per_capita_income'] = l

l=[]
for i in range(1,len(test_data)+1):
    l.append(int(per_capita_income[test_data['state'][i]]))

test_data['per_capita_income'] = l



In [13]:
#State Risk
ris = []
risky_states =['Delhi','Haryana','Arunachal_Pradesh','Punjab','Tamil_Nadu']
safest_states = ['Gujrat','Rajasthan','West_Bengal','Telangana']
mediocre_states = ['Madhya_Pradesh','Maharashtra']


for i in range(1,len(train_data)+1):
    if train_data['state'][i] in risky_states:
        ris.append(1)
        continue
    if train_data['state'][i] in safest_states:
        ris.append(0)
        continue
    if train_data['state'][i] in mediocre_states:
        ris.append(0.4)
        continue
    else:
        ris.append(0.5)
        continue
train_data['state_risk'] = ris

ris = []
for i in range(1,len(test_data)+1):
    if test_data['state'][i] in risky_states:
        ris.append(1)
        continue
    if test_data['state'][i] in safest_states:
        ris.append(0)
        continue
    if test_data['state'][i] in mediocre_states:
        ris.append(0.4)
        continue
    else:
        ris.append(0.5)
        continue
test_data['state_risk'] = ris

In [14]:
#State Capitals
state_capitals = { 'Goa': 'Panaji' , 'Sikkim' : 'Gangtok', 'Delhi' : 'New_Delhi', 'Chandigarh': 'Chandigarh', 'Haryana': 'Chandigarh','Telangana': 'Hyderabad',
 'Karnataka':'Bengaluru','Kerala':'Thiruvananthapuram', 'Puducherry': 'Puducherry', 'Andaman_and_Nicobar_Islands': 'Port Blair', 'Tamil_Nadu':'Chennai',
 'Gujarat':'Gandhinagar', 'Mizoram': 'Aizawl', 'Uttarakhand':'Dehradun', 'Maharashtra': 'Mumbai', 'Himachal_Pradesh': 'Shimla',
 'Andhra_Pradesh': 'Amaravati','Arunachal_Pradesh': 'Itanagar', 'Punjab': 'Chandigarh', 'Nagaland': 'Kohima', 'Tripura': 'Agartala',
 'Rajasthan': 'Jaipur', 'West_Bengal': 'Kolkata', 'Odisha':'Bhubaneswar', 'Chhattisgarh':'Raipur', 'Jammu_and_Kashmir':'Kashmir',
 'Madhya_Pradesh': 'Bhopal', 'Meghalaya':'Shillong', 'Assam':'Dispur', 'Manipur': 'Imphal', 'Jharkhand':'Ranchi', 'Uttar_Pradesh':'Lucknow','Bihar':'Patna'}

cap = []
l=[]
for i in range(1,len(train_data)+1):
    l.append(state_capitals[train_data["state"][i]])

for i in range(1,len(train_data)+1):
    if train_data['city'][i] == l[i-1]:
        cap.append(1)
    else:
        cap.append(0)
        
train_data['is_capital'] = cap

cap = []
l=[]
for i in range(1,len(test_data)+1):
    l.append(state_capitals[test_data["state"][i]])

for i in range(1,len(test_data)+1):
    if test_data['city'][i] == l[i-1]:
        cap.append(1)
    else:
        cap.append(0)
test_data['is_capital'] =cap

In [15]:
#Direction of states in N,E,W,S=(1,2,3,4)
direc_state = { 'Goa': 4 , 'Sikkim' : 2, 'Delhi' : 1, 'Chandigarh': 1, 'Haryana': 1,'Telangana': 4,
 'Karnataka':4,'Kerala':4, 'Puducherry': 4, 'Tamil_Nadu':4,
 'Gujarat':3, 'Mizoram': 2, 'Uttarakhand':1, 'Maharashtra': 3, 'Himachal_Pradesh': 1,
 'Andhra_Pradesh': 4, 'Punjab': 1, 'Tripura': 2,
 'Rajasthan': 3, 'West_Bengal': 2, 'Odisha':3, 'Chhattisgarh':2, 'Jammu_and_Kashmir':1,
 'Madhya_Pradesh':2, 'Assam':2, 'Manipur': 2, 'Jharkhand':2, 'Uttar_Pradesh':1,'Bihar':1}

l=[]
for i in range(1,len(train_data)+1):
    l.append(direc_state[train_data['state'][i]])

train_data['Direc_state'] = l

l=[]
for i in range(1,len(test_data)+1):
    l.append(direc_state[test_data['state'][i]])
    
test_data['Direc_state'] = l

#### Profession

In [16]:
govt_job = { 'Mechanical_engineer':'0.1','Software_Developer':'0','Technical_writer':'0','Civil_servant':'1','Librarian':'0','Economist':'0.1',
 'Flight_attendant':'0.05', 'Architect':'0.1','Designer':'0','Physician':'0.2','Financial_Analyst':'0','Air_traffic_controller':'0.3',
 'Politician':'1','Police_officer':'1','Artist':'0','Surveyor':'0.4','Design_Engineer':'0.2','Chemical_engineer':'0.1','Hotel_Manager':'0',
 'Dentist':'0.3','Comedian':'0','Biomedical_Engineer':'0.1','Graphic_Designer':'0','Computer_hardware_engineer':'0','Petroleum_Engineer':'0.2',
 'Secretary':'0.4','Computer_operator':'0','Chartered_Accountant':'0','Technician':'0','Microbiologist':'0','Fashion_Designer':'0',
 'Aviator':'0.1','Psychologist':'0','Magistrate':'1','Lawyer':'0.3','Firefighter':'0.5','Engineer':'0.2','Official':'0.8','Analyst':'0',
 'Geologist':'0.1','Drafter':'0.1','Statistician':'0.1','Web_designer':'0','Consultant':'0','Chef':'0','Army_officer':'1','Surgeon':'0.5',
 'Scientist':'0.4', 'Civil_engineer':'0.2','Industrial_Engineer':'0.1','Technology_specialist':'0'}


l=[]
for i in range(1,len(train_data)+1):
    l.append(float(govt_job[train_data['profession'][i]]))

train_data['govt_job'] = l

l=[]
for i in range(1,len(test_data)+1):
    l.append(float(govt_job[test_data['profession'][i]]))

test_data['govt_job'] = l

In [17]:
#Encoding Professions
def divide(x):
    tech  = ['Analyst','Computer_hardware_engineer', 'Graphic_Designer', 'Design_Engineer','Computer_operator', 'Software_Developer', 'Technical_writer', 'Technology_specialist', 'Web_designer']
    art= ['Architect' ,'Artist', 'Designer', 'Fashion_Designer']
    aero = ['Air_traffic_controller', 'Aviator', 'Flight_attendant']
    engineer = ['Biomedical_Engineer', 'Drafter''Technician', 'Petroleum_Engineer', 'Chemical_engineer', 'Civil_engineer', 'Engineer', 'Industrial_Engineer', 'Mechanical_engineer']
    research = ['Statistician', 'Surveyor', 'Microbiologist', 'Scientist', 'Psychologist', 'Economist', 'Geologist']
    govt = ['Secretary','Firefighter', 'Politician', 'Civil_servant', 'Army_officer', 'Magistrate', 'Police_officer', 'Official']
    health = ['Surgeon', 'Physician', 'Dentist']
    business = ['Chartered_Accountant', 'Financial_Analyst', 'Consultant']
    remain = ['Chef','Comedian','Hotel_Manager', 'Lawyer', 'Librarian']
    if x in tech:
        return 'tech'
    elif x in art:
        return 'art'
    elif x in aero:
        return 'aero'
    elif x in engineer:
        return 'engineer'
    elif x in research:
        return 'research'
    elif x in govt:
        return 'govt'
    elif x in health:
        return 'health'
    elif x in business:
        return 'business'
    else:
        return 'remain'
    return x

l=[]
for i in range(1,len(train_data)+1):
    l.append(divide(train_data['profession'][i]))

train_data['encoded_profession'] = l

l=[]
for i in range(1,len(test_data)+1):
    l.append(divide(train_data['profession'][i]))

test_data['encoded_profession'] = l

In [18]:
def get_frequency_features(df, cols):
    df[[c + '_freq' for c in cols]] = df[cols].apply(lambda x: x.map(x.value_counts()))
    return df

In [19]:
train_data = get_frequency_features(train_data, ['profession'])
test_data = get_frequency_features(test_data, ['profession'])

#### Gender ratio

In [20]:
#Gender ratio of women in each profession
gender_ratio = { 'Mechanical_engineer':0.064,'Software_Developer':0.2,'Technical_writer':0.557,'Civil_servant':0.20,'Librarian':0.838,'Economist':0.258,
 'Flight_attendant':0.758, 'Architect':0.26,'Designer':0.4,'Physician':0.548,'Financial_Analyst':0.369,'Air_traffic_controller':0.33,
 'Politician':0.2,'Police_officer':0.14,'Artist':0.566,'Surveyor':0.079,'Design_Engineer':0.122,'Chemical_engineer':0.201,'Hotel_Manager':0.344,
 'Dentist':0.344,'Comedian':0.118,'Biomedical_Engineer':0.434,'Graphic_Designer':0.5,'Computer_hardware_engineer':0.247,'Petroleum_Engineer':0.203,
 'Secretary':0.90,'Computer_operator':0.508,'Chartered_Accountant':0.613,'Technician':0.513,'Microbiologist':0.71,'Fashion_Designer':0.548,
 'Aviator':0.15,'Psychologist':0.675,'Magistrate':0.342,'Lawyer':0.375,'Firefighter':0.045,'Engineer':0.122,'Official':0.245,'Analyst':0.451,
 'Geologist':0.6,'Drafter':0.15,'Statistician':0.525,'Web_designer':0.336,'Consultant':0.3,'Chef':0.214,'Army_officer':0.14,'Surgeon':0.382,
 'Scientist':0.3, 'Civil_engineer':0.108,'Industrial_Engineer':0.203,'Technology_specialist':0.2 }

l=[]
for i in range(1,len(train_data)+1):
    l.append(gender_ratio[train_data['profession'][i]])

train_data['gender_ratio'] = l

l=[]
for i in range(1,len(test_data)+1):
    l.append(gender_ratio[test_data['profession'][i]])
    
test_data['gender_ratio'] = l


In [21]:
sex_ratio_states = { 'Goa': 968 , 'Sikkim' : 889, 'Delhi' : 886, 'Chandigarh': 818, 'Haryana': 877,'Telangana': 992,
 'Karnataka':968,'Kerala':1084, 'Puducherry': 1038, 'Tamil_Nadu':995,
 'Gujarat':918, 'Mizoram': 975, 'Uttarakhand':963, 'Maharashtra': 925, 'Himachal_Pradesh': 974,
 'Andhra_Pradesh': 992, 'Punjab': 893, 'Tripura': 961,
 'Rajasthan': 926, 'West_Bengal': 947, 'Odisha':978, 'Chhattisgarh':991, 'Jammu_and_Kashmir':883,
 'Madhya_Pradesh': 930, 'Assam':954, 'Manipur': 987, 'Jharkhand':947, 'Uttar_Pradesh':908,'Bihar':916}

l=[]
for i in range(1,len(train_data)+1):
    l.append(sex_ratio_states[train_data['state'][i]])

train_data['sex_ratio_state'] = l

l=[]
for i in range(1,len(test_data)+1):
    l.append(sex_ratio_states[test_data['state'][i]])
    
test_data['sex_ratio_state'] = l

In [22]:
#Human Development Index
hdi_states = { 'Goa': 0.761 , 'Sikkim' : 0.716, 'Delhi' : 0.746, 'Chandigarh': 0.775, 'Haryana': 0.704,'Telangana': 0.669,
 'Karnataka':0.682,'Kerala':0.779, 'Puducherry': 0.738, 'Tamil_Nadu':0.708,
 'Gujarat':0.667, 'Mizoram': 0.7, 'Uttarakhand':0.684, 'Maharashtra': 0.696, 'Himachal_Pradesh': 0.725,
 'Andhra_Pradesh': 0.65, 'Punjab': 0.65, 'Tripura': 0.663,
 'Rajasthan': 0.629, 'West_Bengal': 0.641, 'Odisha':0.606, 'Chhattisgarh':0.613, 'Jammu_and_Kashmir':0.6,
 'Madhya_Pradesh': 0.606, 'Assam':0.614, 'Manipur': 0.696, 'Jharkhand':0.599, 'Uttar_Pradesh':0.596,'Bihar':0.566}

l=[]
for i in range(1,len(train_data)+1):
    l.append(hdi_states[train_data['state'][i]])

train_data['HDI_state'] = l

l=[]
for i in range(1,len(test_data)+1):
    l.append(hdi_states[test_data['state'][i]])
    
test_data['HDI_state'] = l

#### Encode data

In [23]:
train_data['late_marriage'] = train_data['late_marriage'].astype(int)
train_data['old'] = train_data['old'].astype(int)
train_data = pd.get_dummies(train_data, columns=['married','combination','house_ownership','car_ownership','encoded_profession','Direc_state'])

In [24]:
test_data['late_marriage'] = test_data['late_marriage'].astype(int)
test_data['old'] = test_data['old'].astype(int)
test_data = pd.get_dummies(test_data, columns=['married','combination','house_ownership','car_ownership','encoded_profession','Direc_state'])

In [25]:
ordinalencoder = OrdinalEncoder()
train_data[['city','profession','state']] = ordinalencoder.fit_transform(train_data[['city','profession','state']])
test_data[['city','profession','state']] = ordinalencoder.fit_transform(test_data[['city','profession','state']])

### Data Standardization/Normalization

In [26]:
pd.set_option('display.max_columns', None)
train_data.head(-1)

Unnamed: 0_level_0,income,age,experience,profession,city,state,current_job_years,current_house_years,risk_flag,free_period,youth,adult,old,late_marriage,per_capita_income,state_risk,is_capital,govt_job,profession_freq,gender_ratio,sex_ratio_state,HDI_state,married_married,married_single,combination_norent_noown_no,combination_norent_noown_yes,combination_owned_no,combination_owned_yes,combination_rented_no,combination_rented_yes,house_ownership_norent_noown,house_ownership_owned,house_ownership_rented,car_ownership_no,car_ownership_yes,encoded_profession_aero,encoded_profession_art,encoded_profession_business,encoded_profession_engineer,encoded_profession_govt,encoded_profession_health,encoded_profession_remain,encoded_profession_research,encoded_profession_tech,Direc_state_1,Direc_state_2,Direc_state_3,Direc_state_4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1,1303835,23,3,33.0,250.0,13.0,3,13,0,20,0,0,0,0,103288,0.4,0,0.1,5217,0.064,930,0.606,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,7574516,40,10,43.0,226.0,14.0,9,13,0,30,1,1,0,1,202130,0.4,0,0.0,5053,0.200,925,0.696,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
3,3991815,66,4,47.0,8.0,12.0,4,10,0,62,1,0,1,0,221904,0.5,0,0.0,5195,0.557,1084,0.779,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1
4,6256451,41,2,43.0,53.0,17.0,2,12,1,39,1,1,0,1,98896,0.5,1,0.0,5053,0.200,978,0.606,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
5,5768871,47,11,11.0,295.0,22.0,3,14,1,36,1,1,0,1,218599,1.0,0,1.0,4413,0.200,995,0.708,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,7215678,27,8,5.0,262.0,13.0,8,10,0,19,1,1,0,0,103288,0.4,0,0.1,4758,0.150,930,0.606,0,1,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
251996,8154883,43,13,45.0,161.0,27.0,6,11,0,30,1,1,0,1,115348,0.0,1,0.5,4772,0.382,947,0.641,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
251997,2843572,26,10,3.0,250.0,13.0,6,11,0,16,1,1,0,0,103288,0.4,0,1.0,4661,0.140,930,0.606,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
251998,4522448,46,7,17.0,143.0,14.0,7,12,0,39,1,1,0,1,202130,0.4,0,0.2,4729,0.122,925,0.696,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [27]:
train_data.describe()

Unnamed: 0,income,age,experience,profession,city,state,current_job_years,current_house_years,risk_flag,free_period,youth,adult,old,late_marriage,per_capita_income,state_risk,is_capital,govt_job,profession_freq,gender_ratio,sex_ratio_state,HDI_state,married_married,married_single,combination_norent_noown_no,combination_norent_noown_yes,combination_owned_no,combination_owned_yes,combination_rented_no,combination_rented_yes,house_ownership_norent_noown,house_ownership_owned,house_ownership_rented,car_ownership_no,car_ownership_yes,encoded_profession_aero,encoded_profession_art,encoded_profession_business,encoded_profession_engineer,encoded_profession_govt,encoded_profession_health,encoded_profession_remain,encoded_profession_research,encoded_profession_tech,Direc_state_1,Direc_state_2,Direc_state_3,Direc_state_4
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,4997117.0,49.954071,10.084437,25.276746,157.218536,13.705381,6.333877,11.997794,0.123,39.869635,0.948583,0.592861,0.320683,0.706679,151113.139794,0.473252,0.066448,0.209535,4968.970794,0.355179,944.92804,0.648459,0.102095,0.897905,0.019917,0.008591,0.035567,0.015694,0.642929,0.277302,0.028508,0.051262,0.92023,0.698413,0.301587,0.060187,0.077063,0.057413,0.136921,0.151052,0.061552,0.137861,0.138163,0.17979,0.286373,0.240401,0.201595,0.271631
std,2878311.0,17.063863,6.00259,14.728537,92.073914,9.220625,3.647053,1.399037,0.328438,18.095181,0.220847,0.491302,0.46674,0.455285,73792.213985,0.274014,0.249065,0.305014,371.608515,0.208199,41.383836,0.050451,0.302774,0.302774,0.139714,0.09229,0.185209,0.124291,0.479137,0.447668,0.166419,0.220532,0.270937,0.458948,0.458948,0.237833,0.266693,0.23263,0.343764,0.3581,0.24034,0.344755,0.345071,0.384013,0.452067,0.427328,0.401192,0.444801
min,10310.0,21.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,46664.0,0.0,0.0,0.0,4048.0,0.045,818.0,0.566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2503015.0,35.0,5.0,13.0,77.0,6.0,3.0,11.0,0.0,25.0,1.0,0.0,0.0,0.0,90758.0,0.4,0.0,0.0,4672.0,0.2,916.0,0.606,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000694.0,50.0,10.0,26.0,156.0,14.0,6.0,12.0,0.0,40.0,1.0,1.0,0.0,1.0,161083.0,0.5,0.0,0.1,4990.0,0.336,930.0,0.65,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7477502.0,65.0,15.0,38.0,237.0,22.0,9.0,13.0,0.0,55.0,1.0,1.0,1.0,1.0,216329.0,0.5,0.0,0.3,5217.0,0.513,978.0,0.696,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,9999938.0,79.0,20.0,50.0,315.0,27.0,14.0,14.0,1.0,79.0,1.0,1.0,1.0,1.0,425656.0,1.0,1.0,1.0,5957.0,0.9,1084.0,0.779,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##### Insight
So we know we have to standardize the income, sex_ratio\
And we need to normalize age,experience, profession, city, state, per_capita_income, Direc_state, encoded_profession

In [28]:
from sklearn.preprocessing import StandardScaler
stdscaler = StandardScaler()
#Train_data
train_data[['income','sex_ratio_state']] = stdscaler.fit_transform(train_data[['income','sex_ratio_state']])

#Test_data
test_data[['income','sex_ratio_state']] = stdscaler.fit_transform(test_data[['income','sex_ratio_state']])

In [29]:
#MinMaxscaler() Normalization
from sklearn.preprocessing import MinMaxScaler
minmaxscaler = MinMaxScaler()
minmaxscaler.fit(train_data[['age']])
train_data['age'] = minmaxscaler.transform(train_data[['age']])

#Test Data
minmaxscaler.fit(test_data[['age']])
test_data['age'] = minmaxscaler.transform(test_data[['age']])


#experience, Current-job_years, current_house_years,free_period

feature_array = ['experience','current_job_years','current_house_years','per_capita_income','free_period','city','profession','state']
minmaxscaler.fit(train_data[feature_array])
train_data[feature_array] = minmaxscaler.transform(train_data[feature_array])

#Test Data
minmaxscaler.fit(test_data[feature_array])
test_data[feature_array] = minmaxscaler.transform(test_data[feature_array])


# Convert datatype of selected fields.
Encoding the categorical data


In [30]:
train_data.describe()

Unnamed: 0,income,age,experience,profession,city,state,current_job_years,current_house_years,risk_flag,free_period,youth,adult,old,late_marriage,per_capita_income,state_risk,is_capital,govt_job,profession_freq,gender_ratio,sex_ratio_state,HDI_state,married_married,married_single,combination_norent_noown_no,combination_norent_noown_yes,combination_owned_no,combination_owned_yes,combination_rented_no,combination_rented_yes,house_ownership_norent_noown,house_ownership_owned,house_ownership_rented,car_ownership_no,car_ownership_yes,encoded_profession_aero,encoded_profession_art,encoded_profession_business,encoded_profession_engineer,encoded_profession_govt,encoded_profession_health,encoded_profession_remain,encoded_profession_research,encoded_profession_tech,Direc_state_1,Direc_state_2,Direc_state_3,Direc_state_4
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,-6.544324e-17,0.499208,0.504222,0.505535,0.499106,0.507607,0.45242,0.499448,0.123,0.498329,0.948583,0.592861,0.320683,0.706679,0.275597,0.473252,0.066448,0.209535,4968.970794,0.355179,1.908532e-15,0.648459,0.102095,0.897905,0.019917,0.008591,0.035567,0.015694,0.642929,0.277302,0.028508,0.051262,0.92023,0.698413,0.301587,0.060187,0.077063,0.057413,0.136921,0.151052,0.061552,0.137861,0.138163,0.17979,0.286373,0.240401,0.201595,0.271631
std,1.000002,0.294205,0.300129,0.294571,0.292298,0.341505,0.260504,0.349759,0.328438,0.231989,0.220847,0.491302,0.46674,0.455285,0.194707,0.274014,0.249065,0.305014,371.608515,0.208199,1.000002,0.050451,0.302774,0.302774,0.139714,0.09229,0.185209,0.124291,0.479137,0.447668,0.166419,0.220532,0.270937,0.458948,0.458948,0.237833,0.266693,0.23263,0.343764,0.3581,0.24034,0.344755,0.345071,0.384013,0.452067,0.427328,0.401192,0.444801
min,-1.73255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4048.0,0.045,-3.067098,0.566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.8665174,0.241379,0.25,0.26,0.244444,0.222222,0.214286,0.25,0.0,0.307692,1.0,0.0,0.0,0.0,0.116345,0.4,0.0,0.0,4672.0,0.2,-0.6990192,0.606,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.001243035,0.5,0.5,0.52,0.495238,0.518519,0.428571,0.5,0.0,0.5,1.0,1.0,0.0,1.0,0.301903,0.5,0.0,0.1,4990.0,0.336,-0.3607222,0.65,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.861752,0.758621,0.75,0.76,0.752381,0.814815,0.642857,0.75,0.0,0.692308,1.0,1.0,1.0,1.0,0.447674,0.5,0.0,0.3,5217.0,0.513,0.7991532,0.696,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.738114,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5957.0,0.9,3.360545,0.779,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252000 entries, 1 to 252000
Data columns (total 48 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   income                        252000 non-null  float64
 1   age                           252000 non-null  float64
 2   experience                    252000 non-null  float64
 3   profession                    252000 non-null  float64
 4   city                          252000 non-null  float64
 5   state                         252000 non-null  float64
 6   current_job_years             252000 non-null  float64
 7   current_house_years           252000 non-null  float64
 8   risk_flag                     252000 non-null  int64  
 9   free_period                   252000 non-null  float64
 10  youth                         252000 non-null  int32  
 11  adult                         252000 non-null  int32  
 12  old                           252000 non-nul

In [32]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28000 entries, 1 to 28000
Data columns (total 47 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   income                        28000 non-null  float64
 1   age                           28000 non-null  float64
 2   experience                    28000 non-null  float64
 3   profession                    28000 non-null  float64
 4   city                          28000 non-null  float64
 5   state                         28000 non-null  float64
 6   current_job_years             28000 non-null  float64
 7   current_house_years           28000 non-null  float64
 8   free_period                   28000 non-null  float64
 9   youth                         28000 non-null  int32  
 10  adult                         28000 non-null  int32  
 11  old                           28000 non-null  int32  
 12  late_marriage                 28000 non-null  int32  
 13  p

# Define Variables

In [33]:
#SMOTE Oversampling
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
#Oversampling data
X_oversample, y_oversample = smote.fit_resample(train_data.drop(['risk_flag'], axis=1), train_data['risk_flag'])


print(f'After OverSampling, Counts: \nLabel 1: {sum(y_oversample==1)}\nLabel 0: {sum(y_oversample==0)}')

After OverSampling, Counts: 
Label 1: 221004
Label 0: 221004


In [34]:
#Ensure to drop the Unnamed field from both datasets
#Train variables
xtrain=train_data.drop("risk_flag",axis=1)
ytrain=train_data["risk_flag"]

xtrain,xval,ytrain,yval = train_test_split(X_oversample,y_oversample,train_size=0.9,random_state=0)

#Testing Dataset
xtest = test_data

In [35]:
print(f'Dataset Sizes: \nTrain: X: {xtrain.shape}, y: {ytrain.shape}\nValidation: X: {xval.shape}, y: {yval.shape}\nTest: X: {xtest.shape}')

Dataset Sizes: 
Train: X: (397807, 47), y: (397807,)
Validation: X: (44201, 47), y: (44201,)
Test: X: (28000, 47)


In [36]:
yval.value_counts()

0    22369
1    21832
Name: risk_flag, dtype: int64

# Train models

In [37]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier(random_state=0,
                         iterations = 2000,
                         loss_function = 'CrossEntropy',
                         custom_loss = 'AUC',
                         eval_metric='AUC',
                         learning_rate = 0.1)

cat.fit(xtrain,ytrain,
        eval_set=(xval,yval),
        verbose = 400,
        early_stopping_rounds=200)

0:	test: 0.5708970	best: 0.5708970 (0)	total: 194ms	remaining: 6m 27s
400:	test: 0.9558347	best: 0.9558347 (400)	total: 25.3s	remaining: 1m 40s
800:	test: 0.9615772	best: 0.9615905 (799)	total: 51s	remaining: 1m 16s
1200:	test: 0.9639621	best: 0.9639621 (1200)	total: 1m 17s	remaining: 51.6s
1600:	test: 0.9653934	best: 0.9654041 (1599)	total: 1m 43s	remaining: 25.8s
1999:	test: 0.9664569	best: 0.9664569 (1999)	total: 2m 9s	remaining: 0us

bestTest = 0.9664569032
bestIteration = 1999



<catboost.core.CatBoostClassifier at 0x1cf84f0ca48>

In [38]:
ypred_val = cat.predict(xval)
print(roc_auc_score(yval,ypred_val))

0.9403650748977792


In [39]:
ypred = cat.predict(xtrain)
print(roc_auc_score(ytrain,ypred))

0.9406620245230614


#Generate Sample weights
sample_weights = np.ones(shape=len(ytrain))
sample_weights[ytrain==1] = 5

#Bagging Classifier
from sklearn.ensemble import BaggingRegressor
BGClassifier = BaggingRegressor(random_state=0,
                                max_samples = 0.3,
                                n_estimators = 100)
BGClassifier.fit(xtrain,ytrain)
ypred_val = BGClassifier.predict(xval)
BGscore = roc_auc_score(yval,ypred_val)
print(BGscore)

ypred_val[ypred_val>=0.55] = 1
ypred_val[ypred_val<0.55] = 0
ypred_val = ypred_val.astype(int)
print(np.bincount(ypred_val.astype(int)))

ypred = BGClassifier.predict(xtrain)
BGscore = roc_auc_score(ytrain,ypred)
print(BGscore)

# Evaluation

In [40]:
## Cross validation
X=train_data.drop("risk_flag",axis=1)
y=train_data["risk_flag"]

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

#Define Evaluation Procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats = 3, random_state=0)
#Evaluate model
scores = cross_val_score(cat, train_data.drop("risk_flag",axis=1),train_data["risk_flag"], scoring='roc_auc', cv=cv )

0:	total: 39.7ms	remaining: 1m 19s
1:	total: 75.1ms	remaining: 1m 15s
2:	total: 122ms	remaining: 1m 21s
3:	total: 163ms	remaining: 1m 21s
4:	total: 196ms	remaining: 1m 18s
5:	total: 230ms	remaining: 1m 16s
6:	total: 272ms	remaining: 1m 17s
7:	total: 319ms	remaining: 1m 19s
8:	total: 356ms	remaining: 1m 18s
9:	total: 388ms	remaining: 1m 17s
10:	total: 428ms	remaining: 1m 17s
11:	total: 510ms	remaining: 1m 24s
12:	total: 551ms	remaining: 1m 24s
13:	total: 591ms	remaining: 1m 23s
14:	total: 627ms	remaining: 1m 22s
15:	total: 649ms	remaining: 1m 20s
16:	total: 682ms	remaining: 1m 19s
17:	total: 718ms	remaining: 1m 19s
18:	total: 753ms	remaining: 1m 18s
19:	total: 801ms	remaining: 1m 19s
20:	total: 841ms	remaining: 1m 19s
21:	total: 875ms	remaining: 1m 18s
22:	total: 935ms	remaining: 1m 20s
23:	total: 972ms	remaining: 1m 20s
24:	total: 1.03s	remaining: 1m 21s
25:	total: 1.06s	remaining: 1m 20s
26:	total: 1.1s	remaining: 1m 20s
27:	total: 1.15s	remaining: 1m 21s
28:	total: 1.19s	remaining: 1

In [44]:
np.mean(scores)

0.9128725501618636

In [42]:
#Classification Report
from sklearn.metrics import classification_report
print(f'bincount[0,1] = {np.bincount(ypred_val)}')
print(classification_report(yval,ypred_val))
print()

#Further Scoring Analysis
    #we want fp and fn to fall to near zero while keeping the rest same or increase it slightly
from sklearn.metrics import confusion_matrix
def display_summary(true,pred):
    tn, fp, fn, tp = confusion_matrix(true,pred).ravel()
    print('confusion matrix')
    print(np.array([[tp,fp],[fn,tn]]))
    print('sensitivity is %f',1.*tp/(tp+fn))
    print('specificity is %f',1.*tn/(tn+fp))
    print('accuracy is %f',1.*(tp+tn)/(tp+tn+fp+fn))
    print('balanced accuracy is %',1./2*(1.*tp/(tp+fn)+1.*tn/(tn+fp)))

display_summary(yval,ypred_val)

bincount[0,1] = [21096 23105]
              precision    recall  f1-score   support

           0       0.97      0.91      0.94     22369
           1       0.92      0.97      0.94     21832

    accuracy                           0.94     44201
   macro avg       0.94      0.94      0.94     44201
weighted avg       0.94      0.94      0.94     44201


confusion matrix
[[21143  1962]
 [  689 20407]]
sensitivity is %f 0.9684408208134848
specificity is %f 0.9122893289820734
accuracy is %f 0.9400239813578878
balanced accuracy is % 0.9403650748977791


In [65]:
np.bincount(ypred.astype(int))

array([185036, 212771], dtype=int64)

# Testing

In [46]:
cat.fit(X_oversample,y_oversample)
ypred = cat.predict(xtest)
print(np.bincount(ypred))

0:	total: 61.6ms	remaining: 2m 3s
1:	total: 108ms	remaining: 1m 47s
2:	total: 163ms	remaining: 1m 48s
3:	total: 213ms	remaining: 1m 46s
4:	total: 266ms	remaining: 1m 46s
5:	total: 325ms	remaining: 1m 47s
6:	total: 378ms	remaining: 1m 47s
7:	total: 435ms	remaining: 1m 48s
8:	total: 496ms	remaining: 1m 49s
9:	total: 551ms	remaining: 1m 49s
10:	total: 607ms	remaining: 1m 49s
11:	total: 673ms	remaining: 1m 51s
12:	total: 732ms	remaining: 1m 51s
13:	total: 782ms	remaining: 1m 50s
14:	total: 835ms	remaining: 1m 50s
15:	total: 884ms	remaining: 1m 49s
16:	total: 934ms	remaining: 1m 48s
17:	total: 986ms	remaining: 1m 48s
18:	total: 1.05s	remaining: 1m 49s
19:	total: 1.1s	remaining: 1m 48s
20:	total: 1.16s	remaining: 1m 49s
21:	total: 1.21s	remaining: 1m 48s
22:	total: 1.26s	remaining: 1m 48s
23:	total: 1.31s	remaining: 1m 47s
24:	total: 1.36s	remaining: 1m 47s
25:	total: 1.41s	remaining: 1m 47s
26:	total: 1.46s	remaining: 1m 46s
27:	total: 1.51s	remaining: 1m 46s
28:	total: 1.56s	remaining: 1m 

In [55]:
test_data = pd.read_csv("Test Data.csv")
ss['id'] = test_data['id']
ss['risk_flag'] = ypred

In [57]:
ss.to_csv('submission1.csv',index=False)