# Data Science Nigeria 2019 Challenge #1: Insurance Prediction

## Imports

In [1]:
# import ML libraries
# You may need to install some of these libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from rgf.sklearn import RGFClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier

sns.set()
%matplotlib inline
np.random.seed(23)

# Use fully qualified option names. The short forms are treated as patterns,
# and 'max_columns' can match more than one registered option on pandas
# versions that also define Styler render options, which raises OptionError.
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

In [2]:
# import the data
# Labelled training rows (include the 'Claim' target column).
train = pd.read_csv('data/train_data.csv')
# Unlabelled rows to score for the submission (no 'Claim' column).
test = pd.read_csv('data/test_data.csv')
# Data dictionary: one row per variable with its description.
vd = pd.read_csv('data/VariableDescription.csv')
# Sample submission file with 'Customer Id' and 'Claim' columns.
submit = pd.read_csv('data/sample_submission.csv')

In [3]:
train.head(10)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0
5,H4977,2012,1.0,0,V,N,O,R,535.0,1,1980.0,3,1143,0
6,H7390,2012,1.0,0,N,V,V,U,2830.0,1,1988.0,.,1143,0
7,H14488,2015,1.0,0,N,V,V,U,4952.0,1,1988.0,.,1160,0
8,H19355,2014,1.0,0,V,N,O,R,2735.0,1,2013.0,3,1173,1
9,H18601,2015,1.0,0,V,N,O,R,520.0,1,2011.0,2,1224,0


In [4]:
test.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321


In [5]:
vd.head(50)

Unnamed: 0,Variable,Description
0,Customer Id,Identification number for the Policy holder
1,YearOfObservation,year of observation for the insured policy
2,Insured_Period,"duration of insurance policy in Olusola Insurance. (Ex: Full year insurance, Policy Duration = 1; 6 months = 0.5"
3,Residential,is the building a residential building or not
4,Building_Painted,"is the building painted or not (N-Painted, V-Not Painted)"
5,Building_Fenced,"is the building fence or not (N-Fenced, V-Not Fenced)"
6,Garden,building has garden or not (V-has garden; O-no garden)
7,Settlement,Area where the building is located. (R- rural area; U- urban area)
8,Building Dimension,Size of the insured building in m2
9,Building_Type,"The type of building (Type 1, 2, 3, 4)"


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         7160 non-null   object 
 1   YearOfObservation   7160 non-null   int64  
 2   Insured_Period      7160 non-null   float64
 3   Residential         7160 non-null   int64  
 4   Building_Painted    7160 non-null   object 
 5   Building_Fenced     7160 non-null   object 
 6   Garden              7153 non-null   object 
 7   Settlement          7160 non-null   object 
 8   Building Dimension  7054 non-null   float64
 9   Building_Type       7160 non-null   int64  
 10  Date_of_Occupancy   6652 non-null   float64
 11  NumberOfWindows     7160 non-null   object 
 12  Geo_Code            7058 non-null   object 
 13  Claim               7160 non-null   int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 783.2+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         3069 non-null   object 
 1   YearOfObservation   3069 non-null   int64  
 2   Insured_Period      3069 non-null   float64
 3   Residential         3069 non-null   int64  
 4   Building_Painted    3069 non-null   object 
 5   Building_Fenced     3069 non-null   object 
 6   Garden              3065 non-null   object 
 7   Settlement          3069 non-null   object 
 8   Building Dimension  3056 non-null   float64
 9   Building_Type       3069 non-null   int64  
 10  Date_of_Occupancy   2341 non-null   float64
 11  NumberOfWindows     3069 non-null   object 
 12  Geo_Code            3056 non-null   object 
dtypes: float64(3), int64(3), object(7)
memory usage: 311.8+ KB


In [9]:
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3068 entries, 0 to 3067
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Customer Id  3068 non-null   object
 1   Claim        3068 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 48.1+ KB


## Exploratory Data Analysis

In [8]:
# Check for duplicates
train.duplicated().sum()

0

In [9]:
# check for missing values
train.isna().sum()

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Geo_Code              102
Claim                   0
dtype: int64

In [10]:
test.isna().sum()

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  4
Settlement              0
Building Dimension     13
Building_Type           0
Date_of_Occupancy     728
NumberOfWindows         0
Geo_Code               13
dtype: int64

In [11]:
# fill missing values
# Replace every missing value with the sentinel -1 in both frames.
# `axis` is left at its default: it does not change what a scalar fill
# produces, and passing axis=1 can make pandas transpose the mixed-dtype frame
# (upcasting columns to object on some versions).
# NOTE(review): -1 also lands in the string column 'Garden', where it becomes
# its own category once the frame is one-hot encoded.
train = train.fillna(-1)
test = test.fillna(-1)

In [12]:
train.isna().sum()

Customer Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
Geo_Code              0
Claim                 0
dtype: int64

In [17]:
# NOTE(review): the saved output below lists value counts for several columns
# ('Customer Id', 'YearOfObservation', 'Insured_Period', ...), which this
# single-column call would not produce, and the execution count is out of
# sequence -- re-run the cell or restore the original loop. TODO confirm.
train["Building_Type"].value_counts()

Customer Id


H13164    1
H12617    1
H3093     1
H4666     1
H16353    1
         ..
H17972    1
H15021    1
H15873    1
H6177     1
H17954    1
Name: Customer Id, Length: 7160, dtype: int64


YearOfObservation


2012    1858
2013    1811
2014    1373
2015    1075
2016    1043
Name: YearOfObservation, dtype: int64


Insured_Period


1.000000    5325
0.997268     547
0.000000     165
0.747945      60
0.495890      49
0.832877      28
0.246575      27
0.997260      26
0.994536      24
0.915068      23
0.161644      23
0.084932      22
0.967213      22
0.665753      21
0.983562      17
0.580822      16
0.413699      16
0.328767      13
0.751366      12
0.915301      12
0.748634      12
0.836066      10
0.579235       9
0.248634       9
0.494536       9
0.084699       8
0.251366       8
0.502732       8
0.163934       7
0.167123       6
0.989071       6
0.415301       6
0.334247       6
0.584699       6
0.418033       6
0.986301       6
0.104110       5
0.493151       5
0.333333       5
0

## Feature Engineering

In [13]:
# Remove the customer identifier and the geo code from both frames before
# building features.
unused_columns = ['Customer Id', 'Geo_Code']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [14]:
train['NumberOfWindows'].value_counts()

   .    3551
4        939
3        844
5        639
2        363
6        306
7        211
8        116
1         75
>=10      67
9         49
Name: NumberOfWindows, dtype: int64

In [15]:
# fix number of windows
def fix_windows(x):
    """Convert a raw ``NumberOfWindows`` entry to an integer.

    Parameters
    ----------
    x : str or number
        Raw cell value. Strings may carry surrounding whitespace.

    Returns
    -------
    int
        ``10`` for the open-ended bucket ``'>=10'``, ``-1`` for the dot
        placeholder (``'.'``, however it is padded), otherwise the value
        converted with ``int``.

    Raises
    ------
    ValueError
        If the value is neither a known marker nor convertible by ``int``.
    """
    # Non-string values (e.g. the -1 sentinel written by an earlier fillna)
    # are converted directly, exactly as before.
    if not isinstance(x, str):
        return int(x)

    # Strip padding so the markers match whether the file stores '.' or '   .'.
    token = x.strip()
    if token == '>=10':
        return 10
    if token == '.':
        return -1
    return int(token)

# Normalise the window counts element by element in both frames.
train['NumberOfWindows'] = train['NumberOfWindows'].map(fix_windows)
test['NumberOfWindows'] = test['NumberOfWindows'].map(fix_windows)

In [16]:
train['NumberOfWindows'].value_counts()

-1     3551
 4      939
 3      844
 5      639
 2      363
 6      306
 7      211
 8      116
 1       75
 10      67
 9       49
Name: NumberOfWindows, dtype: int64

In [17]:
# encode categorical variables
# Stack train on top of test so both get an identical one-hot column layout.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and already returns a new frame. The test rows have no 'Claim'
# column, so they receive NaN there.
combined = pd.concat([train, test], ignore_index=True)

In [18]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10229 entries, 0 to 10228
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearOfObservation   10229 non-null  int64  
 1   Insured_Period      10229 non-null  float64
 2   Residential         10229 non-null  int64  
 3   Building_Painted    10229 non-null  object 
 4   Building_Fenced     10229 non-null  object 
 5   Garden              10229 non-null  object 
 6   Settlement          10229 non-null  object 
 7   Building Dimension  10229 non-null  float64
 8   Building_Type       10229 non-null  int64  
 9   Date_of_Occupancy   10229 non-null  float64
 10  NumberOfWindows     10229 non-null  int64  
 11  Claim               7160 non-null   float64
dtypes: float64(4), int64(4), object(4)
memory usage: 959.1+ KB


In [19]:
combined = pd.get_dummies(combined)

In [20]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10229 entries, 0 to 10228
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearOfObservation   10229 non-null  int64  
 1   Insured_Period      10229 non-null  float64
 2   Residential         10229 non-null  int64  
 3   Building Dimension  10229 non-null  float64
 4   Building_Type       10229 non-null  int64  
 5   Date_of_Occupancy   10229 non-null  float64
 6   NumberOfWindows     10229 non-null  int64  
 7   Claim               7160 non-null   float64
 8   Building_Painted_N  10229 non-null  uint8  
 9   Building_Painted_V  10229 non-null  uint8  
 10  Building_Fenced_N   10229 non-null  uint8  
 11  Building_Fenced_V   10229 non-null  uint8  
 12  Garden_-1           10229 non-null  uint8  
 13  Garden_O            10229 non-null  uint8  
 14  Garden_V            10229 non-null  uint8  
 15  Settlement_R        10229 non-null  uint8  
 16  Sett

In [21]:
# Split the encoded frame back into train and test using the label column
# instead of a hard-coded row count: only the rows that came from the training
# file carry a non-null 'Claim', so this stays correct if the data size changes.
has_label = combined['Claim'].notna()
train = combined[has_label].copy()
test = combined[~has_label].copy()

In [22]:
# The test rows have no label; remove the 'Claim' column from them.
test = test.drop(columns='Claim')

In [23]:
# Target vector and feature matrix, each as an independent copy of `train`.
y = train['Claim'].copy()
X = train.drop(columns='Claim').copy()

In [24]:
X.shape, y.shape

((7160, 16), (7160,))

 ## Modelling

The evaluation metric for the competition is ROC AUC, computed here with `roc_auc_score`. Read about it here: [Classification: ROC Curve and AUC](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc)

In [26]:
# Hold out 10% of the labelled rows for local evaluation; stratifying on y
# keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=5,
    stratify=y,
)

### RandomForest

In [29]:
# Baseline random forest with default hyper-parameters, trained on the
# 90% split (fit returns the estimator itself).
rf = RandomForestClassifier(n_jobs=-1, verbose=5).fit(X_train, y_train)

# Hard class labels for the classification report.
predx = rf.predict(X_test)
# Scores for ROC AUC: keep only column index 1 of predict_proba.
pred = list(rf.predict_proba(X_test)[:, 1])

building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100

building tree 5 of 100building tree 6 of 100

building tree 7 of 100
building tree 8 of 100
building tree 9 of 100building tree 10 of 100
building tree 11 of 100building tree 12 of 100


building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100building tree 19 of 100

building tree 20 of 100building tree 21 of 100

building tree 22 of 100building tree 23 of 100

building tree 24 of 100
building tree 25 of 100building tree 26 of 100
building tree 27 of 100

building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100building tree 35 of 100

building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100building tree 40 of 100

building tree 41 of 100
building tree 42 of 100bu

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [None]:
print(roc_auc_score(y_test, pred))
print(classification_report(y_test, predx))

In [30]:
# submission
# Score the competition test rows, keeping column index 1 of predict_proba.
value = list(rf.predict_proba(test)[:, 1])

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [32]:
# Re-read the raw test file to recover 'Customer Id', which was dropped from
# `test` during feature engineering, and attach the predicted scores.
# NOTE(review): test_data.csv loaded with 3069 rows earlier while
# sample_submission.csv has 3068 -- confirm which row set the submission
# is expected to contain.
test2 = pd.read_csv('data/test_data.csv').assign(Claim=value)

submit = test2.loc[:, ['Customer Id', 'Claim']].copy()

In [33]:
submit.to_csv('data/Submission1.csv', index=False)