## Importing Libraries and Dataset

In [1]:
## Importing the libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [5]:
## Loading dataset

# Show every column when a frame is rendered instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# Two CSV files are read from the working directory: the training rows and the
# test rows (the latter has no 'Response' column).
insurance_test = pd.read_csv("self_test_wo_response.csv")
insurance_train = pd.read_csv("self_train.csv")

## Analyzing and Cleaning Dataset

In [6]:
## Viewing dataset info

# Print the index range, column count, dtype summary and memory use of the
# training frame first, then of the test frame.
for frame in (insurance_train, insurance_test):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41566 entries, 0 to 41565
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 40.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17815 entries, 0 to 17814
Columns: 127 entries, Id to Medical_Keyword_48
dtypes: float64(18), int64(108), object(1)
memory usage: 17.3+ MB


In [7]:
## Checking for null values

# Count of missing entries in each column of the training frame
# (`isnull` is the pandas alias of `isna`).
insurance_train.isnull().sum()

Id                    0
Product_Info_1        0
Product_Info_2        0
Product_Info_3        0
Product_Info_4        0
                     ..
Medical_Keyword_45    0
Medical_Keyword_46    0
Medical_Keyword_47    0
Medical_Keyword_48    0
Response              0
Length: 128, dtype: int64

In [8]:
## Checking for null values

# Count of missing entries in each column of the test frame
# (`isnull` is the pandas alias of `isna`).
insurance_test.isnull().sum()

Id                    0
Product_Info_1        0
Product_Info_2        0
Product_Info_3        0
Product_Info_4        0
                     ..
Medical_Keyword_44    0
Medical_Keyword_45    0
Medical_Keyword_46    0
Medical_Keyword_47    0
Medical_Keyword_48    0
Length: 127, dtype: int64

Filling Missing Values with Mean

In [9]:
## Filling missing values

# Columns whose missing entries are replaced by the column mean.
mean_filled_columns = [
    'Employment_Info_4', 'Insurance_History_5', 'Employment_Info_6',
    'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5',
    'Medical_History_1', 'Medical_History_10', 'Medical_History_15',
    'Medical_History_24', 'Medical_History_32',
]

# Assign the filled columns back to the frame. The previous form,
# `frame[col].fillna(..., inplace=True)`, operates on a temporary Series; under
# pandas Copy-on-Write the parent frame is left unchanged, and the warning
# about it is hidden by `warnings.filterwarnings('ignore')`.
# `fillna` with a Series of means fills each column with its own mean.
insurance_train[mean_filled_columns] = insurance_train[mean_filled_columns].fillna(
    insurance_train[mean_filled_columns].mean()
)


## Encoding Categorical features
# Fit the encoder on the training values of 'Product_Info_2' (the only object
# column reported by `.info()`) and replace them with integer codes.
encode = LabelEncoder()
insurance_train['Product_Info_2'] = encode.fit_transform(insurance_train['Product_Info_2'])


# Drop remaining null values
# axis=1 removes whole columns (not rows) that still contain a missing value.
insurance_train.dropna(axis=1, inplace=True)

In [10]:
## Filling missing values

# Columns whose missing entries are replaced by the column mean.
mean_filled_columns = [
    'Employment_Info_4', 'Insurance_History_5', 'Employment_Info_6',
    'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5',
    'Medical_History_1', 'Medical_History_10', 'Medical_History_15',
    'Medical_History_24', 'Medical_History_32',
]

# Assign the filled columns back to the frame. The previous form,
# `frame[col].fillna(..., inplace=True)`, operates on a temporary Series; under
# pandas Copy-on-Write the parent frame is left unchanged.
# NOTE(review): the gaps are filled with the test set's own column means, not
# the training means — confirm this is intended.
insurance_test[mean_filled_columns] = insurance_test[mean_filled_columns].fillna(
    insurance_test[mean_filled_columns].mean()
)

## Encoding Categorical features
# Reuse the encoder that was fitted on the training data. Calling
# `fit_transform` here would re-learn the label -> integer mapping from the
# test values, so the same category could receive a different code than it had
# during training. `transform` raises ValueError on a label not seen in training.
insurance_test['Product_Info_2'] = encode.transform(insurance_test['Product_Info_2'])

# Drop remaining null values
# axis=1 removes whole columns (not rows) that still contain a missing value.
insurance_test.dropna(axis=1, inplace=True)

#### Separating data into features (x) and target (y)

In [11]:
## Creating x and y data labels

# Target: the 'Response' column of the training frame.
y = insurance_train['Response']

# Features: every training column other than 'Id' and 'Response'.
x = insurance_train.drop(columns=['Id', 'Response'])

In [13]:
## Displaying x variable

# A bare expression as the last line of a cell is rendered as the cell output;
# this shows the feature frame built from the training data.
x

Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,Employment_Info_2,Employment_Info_3,Employment_Info_4,Employment_Info_5,Employment_Info_6,InsuredInfo_1,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_5,Insurance_History_7,Insurance_History_8,Insurance_History_9,Family_Hist_1,Family_Hist_2,Family_Hist_3,Family_Hist_4,Family_Hist_5,Medical_History_1,Medical_History_2,Medical_History_3,Medical_History_4,Medical_History_5,Medical_History_6,Medical_History_7,Medical_History_8,Medical_History_9,Medical_History_10,Medical_History_11,Medical_History_12,Medical_History_13,Medical_History_14,Medical_History_15,Medical_History_16,Medical_History_17,Medical_History_18,Medical_History_19,Medical_History_20,Medical_History_21,Medical_History_22,Medical_History_23,Medical_History_24,Medical_History_25,Medical_History_26,Medical_History_27,Medical_History_28,Medical_History_29,Medical_History_30,Medical_History_31,Medical_History_32,Medical_History_33,Medical_History_34,Medical_History_35,Medical_History_36,Medical_History_37,Medical_History_38,Medical_History_39,Medical_History_40,Medical_History_41,Medical_Keyword_1,Medical_Keyword_2,Medical_Keyword_3,Medical_Keyword_4,Medical_Keyword_5,Medical_Keyword_6,Medical_Keyword_7,Medical_Keyword_8,Medical_Keyword_9,Medical_Keyword_10,Medical_Keyword_11,Medical_Keyword_12,Medical_Keyword_13,Medical_Keyword_14,Medical_Keyword_15,Medical_Keyword_16,Medical_Keyword_17,Medical_Keyword_18,Medical_Keyword_19,Medical_Keyword_20,Medical_Keyword_21,Medical_Keyword_22,Medical_Keyword_23,Medical_Keyword_24,Medical_Keyword_25,Medical_Keyword_26,Medical_Keyword_27,Medical_Keyword_28,Medical_Keyword_29,Medical_Keyword_30,Medical_Keyword_31,Medical_Keyword_32,Medical_Keyword_33,Medical_Keyword_34,Medical_Keyword_35,Medical_Keyword_36,Med
ical_Keyword_37,Medical_Keyword_38,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
0,1,16,10,0.076923,2,1,1,0.641791,0.581818,0.148536,0.323008,12,1,0.000000,3,0.361417,1,2,6,3,1,2,1,1,1,3,1,0.000667,1,1,2,2,0.474152,0.598039,0.444931,0.526786,4.000000,112,2,1,1,3,2,2,1,146.118863,3,2,3,3,240.000000,3,3,1,1,2,1,2,3,50.635409,1,3,3,1,3,2,3,12.703085,1,3,1,2,2,1,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,18,26,0.076923,2,3,1,0.029851,0.745455,0.288703,0.428780,9,1,0.000000,2,0.030000,1,2,8,3,1,1,1,2,1,1,3,0.001723,3,2,3,3,0.304348,0.497679,0.225352,0.484692,10.000000,3,2,2,1,3,2,2,2,146.118863,3,2,3,3,124.213906,1,3,1,1,2,1,2,3,50.635409,2,2,3,1,3,2,3,12.703085,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,17,10,0.487179,2,3,1,0.164179,0.672727,0.205021,0.352438,9,1,0.000000,3,0.200000,2,2,8,3,1,2,1,2,1,1,3,0.001723,3,2,3,3,0.420290,0.497679,0.352113,0.484692,0.000000,350,2,2,1,3,2,2,2,146.118863,3,2,3,3,124.213906,1,3,1,1,2,2,2,3,50.635409,1,3,3,1,3,2,3,12.703085,3,3,1,2,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,15,26,0.230769,2,3,1,0.417910,0.654545,0.234310,0.424046,9,1,0.000000,2,0.050000,1,2,6,3,1,2,1,2,1,1,3,0.001723,3,2,3,2,0.463768,0.497679,0.408451,0.484692,7.942592,162,2,2,1,3,2,2,2,146.118863,3,2,3,3,124.213906,1,3,1,1,2,1,2,3,50.635409,2,2,3,1,3,2,3,12.703085,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,15,26,0.230769,3,1,1,0.507463,0.836364,0.299163,0.364887,15,1,0.000000,2,1.000000,1,2,8,3,1,1,1,2,1,3,2,0.005000,1,3,2,2,0.474152,0.294118,0.507042,0.484692,6.000000,491,2,2,1,3,2,2,2,146.118863,3,2,3,3,124.213906,1,3,2,1,2,2,2,3,50.635409,1,3,3,1,3,2,3,12.703085,3,1,1,2,2,1,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41561,1,17,26,0.307692,2,3,1,0.164179,0.690909,0.288703,0.484658,9,1,0.000000,2,0.020000,1,2,5,2,1,1,1,2,1,1,3,0.001723,3,2,3,3,0.405797,0.497679,0.295775,0.484692,68.000000,491,2,2,1,3,2,2,2,146.118863,3,2,3,3,124.213906,1,3,1,1,2,1,1,3,50.635409,2,2,3,1,3,2,3,12.703085,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41562,1,16,26,0.230769,2,3,1,0.432836,0.800000,0.403766,0.551119,9,1,0.000010,2,0.350000,1,2,3,3,1,1,1,2,1,3,2,0.000267,1,3,2,3,0.565217,0.497679,0.478873,0.484692,24.000000,491,2,2,1,3,2,2,2,146.118863,3,2,3,3,124.213906,1,3,1,1,2,1,2,3,50.635409,2,2,3,1,3,2,3,12.703085,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41563,1,18,26,0.076923,2,3,1,0.104478,0.745455,0.246862,0.360969,9,1,0.000000,2,0.361417,1,2,6,3,1,1,1,2,1,1,3,0.001723,3,2,3,3,0.173913,0.497679,0.126761,0.484692,7.942592,162,2,2,1,3,2,2,2,146.118863,3,2,3,3,124.213906,1,3,1,1,2,1,2,3,50.635409,2,2,3,1,3,2,3,12.703085,3,1,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
41564,1,15,10,0.230769,2,3,1,0.507463,0.690909,0.276151,0.462452,9,1,0.006147,3,0.361417,1,2,3,3,1,2,1,2,1,1,3,0.001723,3,2,3,2,0.474152,0.372549,0.704225,0.484692,0.000000,16,2,1,1,3,2,2,2,146.118863,3,2,1,3,240.000000,1,3,1,1,2,1,2,3,50.635409,1,3,3,1,3,2,3,12.703085,1,3,1,2,2,1,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


#### Feature Selection

In [14]:
## Fitting a variance threshold on the features

from sklearn.feature_selection import VarianceThreshold

# Compute the variance of every column of x against a threshold of 0.07.
# `fit` returns the selector itself, so construction and fitting are chained;
# the trailing bare name keeps the fitted selector as the cell output.
vt = VarianceThreshold(threshold=0.07).fit(x)
vt

In [15]:
# Boolean mask with one entry per fitted column: True where the column's
# variance exceeds the threshold (indices=False is the default mask form).
vt.get_support(indices=False)

array([False,  True,  True,  True, False,  True,  True, False, False,
       False, False,  True,  True, False,  True,  True,  True, False,
        True,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False, False, False, False,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True, False, False,  True, False,  True,  True, False, False,
       False, False,  True, False,  True,  True,  True,  True, False,
       False,  True, False, False,  True,  True,  True, False,  True,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])

In [16]:
# Column labels of the feature frame x.
x.columns

Index(['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_4',
       'Product_Info_5', 'Product_Info_6', 'Product_Info_7', 'Ins_Age', 'Ht',
       'Wt',
       ...
       'Medical_Keyword_39', 'Medical_Keyword_40', 'Medical_Keyword_41',
       'Medical_Keyword_42', 'Medical_Keyword_43', 'Medical_Keyword_44',
       'Medical_Keyword_45', 'Medical_Keyword_46', 'Medical_Keyword_47',
       'Medical_Keyword_48'],
      dtype='object', length=125)

### ML Model and Prediction

Defining the model

In [18]:
## Random forest classifier

RF = RandomForestClassifier(
    n_estimators=5000,      # number of trees in the forest
    min_samples_leaf=100,   # a leaf must hold at least 100 training samples
    criterion='gini',       # split quality measured by Gini impurity
    oob_score=False,        # no out-of-bag score is computed
    random_state=1,         # fixed seed for the forest's randomness
)

Training the model

In [19]:
## Training the model

# Fit the forest on the feature frame x against the target series y.
RF.fit(X=x, y=y)

Testing the model

In [20]:
## Fitting a variance threshold on the test data

from sklearn.feature_selection import VarianceThreshold

# NOTE(review): this rebinds `vt`, replacing the selector fitted on x, and
# insurance_test still contains the 'Id' column at this point. In the cells
# shown, the resulting mask is only displayed and never applied to the data —
# confirm whether this step is needed.
vt = VarianceThreshold(threshold=0.07).fit(insurance_test)
vt

In [21]:
# Boolean mask with one entry per fitted column: True where the column's
# variance exceeds the threshold (indices=False is the default mask form).
vt.get_support(indices=False)

array([ True, False,  True,  True,  True, False,  True,  True, False,
       False, False, False,  True,  True, False,  True,  True,  True,
       False,  True,  True, False,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False, False, False,
       False,  True,  True,  True,  True, False,  True, False,  True,
        True,  True, False, False,  True, False,  True,  True, False,
       False, False, False,  True, False,  True,  True,  True,  True,
       False, False,  True, False, False,  True,  True,  True, False,
        True, False, False,  True, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

Matching the test columns to the model's inputs

In [22]:
## Aligning the test features with the training features

# The model was fitted on the columns of x, i.e. every training column except
# 'Id' and 'Response' — and that set includes 'InsuredInfo_7'. Dropping
# 'InsuredInfo_7' while keeping 'Id' only makes the column *count* match: the
# identifier would be passed to the model as a feature and the remaining
# columns would no longer correspond to the ones used for training.
# Select exactly the training columns, in the training order. A KeyError here
# means the test frame lacks a column the model was trained on.
insurance_test = insurance_test[x.columns]

Predicting the values

In [23]:
## Predicting values

# One predicted class per row of insurance_test, returned as an array.
predicted = RF.predict(X=insurance_test)

Getting Corresponding ID

In [24]:
## Getting ID variable from the dataset

# Fresh read of the test file, untouched by the cleaning steps that were
# applied to insurance_test.
get_ID = pd.read_csv(filepath_or_buffer='self_test_wo_response.csv')

Assigning predicted values

In [25]:
## Assigning the predicted variable

# Attach the predictions as a column named 'response'.
# NOTE(review): the training column is spelled 'Response' (capital R) — confirm
# which header the consumer of the file expects.
get_ID = get_ID.assign(response=predicted)

Exporting submission file

In [26]:
## Exporting csv file from dataframe

# Written with the default options, so the row index and every column read
# from the test file are saved alongside 'response'.
# NOTE(review): confirm the expected submission layout.
get_ID.to_csv(path_or_buf='Submission_File.csv')