## Make sure all the updated version of libraries are loaded 
We will update the library versions and load them (to make sure everyone is at the same level) 

<br>

Make sure you restart your kernel after upgrading the libraries and then import. 


In [1]:
#### Uncomment and run if needed

# !pip install --upgrade numpy pandas scikit-learn catboost 

In [4]:
import pandas as pd
import numpy as np
from termcolor import colored
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold,train_test_split
from catboost import CatBoostClassifier
from collections import Counter

## Loading the datasets

We will first load the train and test datasets along with state mapping and see how exactly the data looks like. 

* We will train our models on train data (consider this as the data available to us) 
* The performance of our models will be tested on test data. Consider test data to be a real life simulation of events on which model performance is tested. 

In [5]:
print(colored('---------------------------------------------------------------------------------------\n','blue',attrs=['bold']))
train_df   = pd.read_csv('Train.csv')
print(colored('This is how the train data looks like - ','blue',attrs=['bold']))
display(train_df.sample(10))


print(colored('---------------------------------------------------------------------------------------\n','blue',attrs=['bold']))
print(colored('This is how the test data looks like','blue',attrs=['bold']))
test_df    = pd.read_csv('Test.csv')
display(test_df.sample(10))


[1m[34m---------------------------------------------------------------------------------------
[0m
[1m[34mThis is how the train data looks like - [0m


Unnamed: 0,ID,Policy Start Date,Policy End Date,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
7238,ID_LQOV70F,14-11-2010,13-11-2011,50,14-11-2010,1,,,,Ikeja,Lagos,CarSafe,0
647,ID_213L20T,09-01-2010,08-01-2011,50,09-01-2010,1,Saloon,,Hyundai,,,CarSafe,0
9112,ID_R56ALLB,18-01-2010,16-01-2011,31,18-01-2010,1,Saloon,Grey,TOYOTA,,,Car Plus,1
5420,ID_GBN1W6U,10-09-2010,09-09-2011,54,10-09-2010,1,Saloon,,TOYOTA,Victoria Island,Lagos,Car Classic,0
6398,ID_J1SBIF2,13-09-2010,12-09-2011,54,13-09-2010,2,Saloon,,TOYOTA,Warri,Delta,Car Classic,1
34,ID_035HSJO,21-06-2010,19-06-2011,36,21-06-2010,1,JEEP,,Mercedes,,,Car Plus,0
8801,ID_QC58LMM,22-07-2010,21-07-2011,70,22-07-2010,3,,,Honda,Biase,Benue,CVTP,0
1143,ID_3IVNP0Q,27-09-2010,26-09-2011,37,27-09-2010,1,,,,,,CarSafe,0
1562,ID_4R7ZNWW,22-11-2010,20-11-2011,44,22-11-2010,1,,,,,,CarSafe,0
8994,ID_QSFY00I,27-11-2010,01-09-2011,38,27-11-2010,1,,,,,,Car Classic,0


[1m[34m---------------------------------------------------------------------------------------
[0m
[1m[34mThis is how the test data looks like[0m


Unnamed: 0,ID,Policy Start Date,Policy End Date,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
1082,ID_W0AIE6L,04-04-2010,03-04-2011,2,04-04-2010,1,Saloon,,TOYOTA,,,CarSafe
1036,ID_UOBWYR7,05-12-2010,04-12-2011,35,05-12-2010,2,,,,Lagos Mainland,Lagos,Car Classic
733,ID_LI4B144,05-01-2010,04-01-2011,120,05-01-2010,1,,Black,Mitsubishi,Victoria Island,Lagos,Car Classic
671,ID_JUO7R24,17-05-2010,16-05-2011,30,17-05-2010,1,,As Attached,TOYOTA,Abuja,Abuja,CVTP
215,ID_79L4P0R,18-06-2010,17-06-2011,42,18-06-2010,1,Saloon,Black,TOYOTA,Lekki,Lagos,Car Classic
832,ID_OG1I9YI,23-12-2010,22-12-2011,78,23-12-2010,2,Saloon,Black,TOYOTA,Idemili south,Idemili-south,Car Classic
184,ID_61YHUPM,08-02-2010,07-02-2011,43,08-02-2010,1,Saloon,,Pontiac,Ibadan South West,Ibadan-West,Car Classic
1057,ID_VCES3UB,13-09-2010,12-09-2011,34,13-09-2010,1,,,,,,CarSafe
987,ID_T5UAAOF,27-03-2010,26-03-2011,120,27-03-2010,1,Saloon,Grey,TOYOTA,Victoria Island,Lagos,Customized Motor
471,ID_E3VGMB5,09-01-2010,08-01-2011,37,09-01-2010,1,Saloon,Black,TOYOTA,Port-Harcourt,Port-Harcourt,Car Classic


Let's look into the class imbalance of the data. We can clearly see a class imbalance is there

In [6]:
train_df['target'].value_counts(normalize=True)

0    0.879543
1    0.120457
Name: target, dtype: float64

## Policy Tenure creation

We can see that we have three date columns given-

* Policy Start Date
* Policy End Date
* First Transaction Date

We can create some meaningful features that will answer the questions like below- 
* What was the tenure of policy in months? 
* What were the months, days, etc corresponding to a given Policy Start and End date. 
* If First Transaction date is similar to Policy Start Date, can we drop it ? 

In [7]:
train_df['Policy Start Date'] = pd.to_datetime(train_df['Policy Start Date'])
test_df['Policy Start Date']  = pd.to_datetime(test_df['Policy Start Date'])

train_df['Policy End Date']   = pd.to_datetime(train_df['Policy End Date'])
test_df['Policy End Date']    = pd.to_datetime(test_df['Policy End Date'])


train_df['Policy_Tenure']     = (train_df['Policy End Date']-train_df['Policy Start Date']).dt.days
test_df['Policy_Tenure']      = (test_df['Policy End Date']-test_df['Policy Start Date']).dt.days

train_df.sample(5)

Unnamed: 0,ID,Policy Start Date,Policy End Date,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,Policy_Tenure
8193,ID_OMICC02,2010-10-07,2011-08-03,33,10-07-2010,1,Truck,White,Mercedes,Victoria Island,Lagos,CVTP,0,300
10556,ID_VMF33I1,2010-06-12,2011-05-12,72,06-12-2010,1,,,,,,CarSafe,0,334
1021,ID_3639SJR,2010-10-04,2011-09-04,65,10-04-2010,1,Saloon,,Kia,,,CarSafe,0,335
6551,ID_JJNZ0TR,2010-08-28,2011-08-27,40,28-08-2010,1,,,,,,CarSafe,0,364
1591,ID_4U5LNB6,2010-05-01,2011-04-01,37,05-01-2010,1,Saloon,,TOYOTA,,,CarSafe,0,335


In [8]:
def create_date_features(df,colname):
    '''
        Creates date features like month,day, year etc.

        Feel free to add more features and customize
    '''

    df[colname+'_month']           = df[colname].dt.month
    df[colname+'_year']            = df[colname].dt.year
    df[colname+'_day_of_week']     = df[colname].dt.dayofweek

    df                             = df.drop(colname,axis=1)

    return df

In [9]:
train_df  = create_date_features(train_df,'Policy Start Date')
test_df   = create_date_features(test_df,'Policy Start Date')

train_df  = create_date_features(train_df,'Policy End Date')
test_df   = create_date_features(test_df,'Policy End Date')

print(colored('The train dataframe looks like below after creating date features','blue',attrs=['bold']))
display(train_df.sample(3))

print(colored('The test dataframe looks like below after creating date features','blue',attrs=['bold']))
display(test_df.sample(3))

[1m[34mThe train dataframe looks like below after creating date features[0m


Unnamed: 0,ID,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week
11684,ID_YVL4AFH,63,06-05-2010,2,JEEP,Black,TOYOTA,Eti-Osa,Eti-Osa,Car Classic,0,334,6,2010,5,5,2011,3
10161,ID_UDZ92O6,37,16-09-2010,1,,,,,,Car Classic,0,209,9,2010,3,4,2011,2
1739,ID_5BGQH8D,30,11-02-2010,1,Saloon,,Nissan,,,CarSafe,0,334,11,2010,1,10,2011,6


[1m[34mThe test dataframe looks like below after creating date features[0m


Unnamed: 0,ID,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week
361,ID_BHG9VIT,42,28-10-2010,2,JEEP,,Honda,Lagos Mainland,Lagos,Customized Motor,364,10,2010,3,10,2011,3
1100,ID_WOBIOKR,36,21-02-2010,3,Saloon,Grey,TOYOTA,Ikeja,Lagos,Car Classic,364,2,2010,6,2,2011,6
646,ID_J82LB22,40,18-07-2010,1,JEEP,Silver,TOYOTA,Mushin,Lagos,Car Classic,364,7,2010,6,7,2011,6


In [10]:
print(colored('We can clearly drop the feature - First Transaction Date from the data because it was just similar to Policy Start date only','blue',attrs=['bold']))

train_df  = train_df.drop('First Transaction Date',axis=1)
test_df   = test_df.drop('First Transaction Date',axis=1)


print(colored('The train dataframe looks like below after date adjustments','blue',attrs=['bold']))
display(train_df.sample(3))

print(colored('The test dataframe looks like below after date adjustments','blue',attrs=['bold']))
display(test_df.sample(3))

[1m[34mWe can clearly drop the feature - First Transaction Date from the data because it was just similar to Policy Start date only[0m
[1m[34mThe train dataframe looks like below after date adjustments[0m


Unnamed: 0,ID,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week
8913,ID_QKGJRWP,39,1,Saloon,,Nissan,,,CarSafe,0,335,7,2010,1,6,2011,0
7176,ID_LIJHZ9S,29,1,Saloon,Silver,TOYOTA,Victoria Island,Lagos,Car Classic,0,364,5,2010,0,5,2011,0
11671,ID_YUG9XL4,38,1,Saloon,,TOYOTA,Nnewi,Anambra,Car Classic,0,364,3,2010,3,3,2011,3


[1m[34mThe test dataframe looks like below after date adjustments[0m


Unnamed: 0,ID,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week
784,ID_N33CAVY,20,5,JEEP,Green,Mercedes,Victoria Island,Lagos,Car Classic,102,11,2010,0,2,2011,4
1137,ID_XZ628U2,27,1,,,,,,CarSafe,334,6,2010,2,5,2011,0
278,ID_92YSVHG,51,1,Saloon,,TOYOTA,,,CarSafe,334,9,2010,2,8,2011,0


## Dropping ID column from both train and test data. 

We can drop ID column from both train and test data. ID can't be used as a feature in any case. 

In [11]:
train_df      = train_df.drop(['ID'],axis=1)
test_df       = test_df.drop(['ID'],axis=1)
print(colored('The train dataframe looks like below after removal of ID column','blue',attrs=['bold']))
display(train_df.sample(3))

print(colored('The test dataframe looks like below after removal of ID column','blue',attrs=['bold']))
display(test_df.sample(3))

[1m[34mThe train dataframe looks like below after removal of ID column[0m


Unnamed: 0,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week
5689,45,1,Saloon,Blue,TOYOTA,Isolo,Benue,Car Classic,1,451,1,2010,0,3,2011,3
4049,70,1,JEEP,Black,Mercedes,Jos South,Jos-South,Car Classic,0,121,12,2010,5,4,2011,0
6418,50,1,,,,,,CarSafe,0,335,12,2010,5,11,2011,4


[1m[34mThe test dataframe looks like below after removal of ID column[0m


Unnamed: 0,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week
850,71,2,Saloon,Black,TOYOTA,Eti-Osa,Eti-Osa,Car Classic,362,4,2010,4,4,2011,2
1053,40,1,,,,Ikorodu,Lagos,Car Classic,335,12,2010,6,11,2011,5
456,36,1,,,,,,CarFlex,363,10,2010,1,10,2011,0


## Target Column adjustment

We will save the target values in a separate array and then drop the Target feature from train data.

In [12]:
target_y    = train_df['target'].values
train_df    = train_df.drop(['target'],axis=1)

print(colored(f'The number of features in train data is {train_df.shape}','blue',attrs=['bold']))
print(colored(f'The number of features in test data is {test_df.shape}','green',attrs=['bold']))

[1m[34mThe number of features in train data is (12079, 15)[0m
[1m[32mThe number of features in test data is (1202, 15)[0m


## Many Categorical features

We see that apart from Age, all features are categorical. 

Features like Number of policies, day, month etc may look numerical but they are actually Nominal Categorical variables. 


In [14]:
cat_cols = train_df.columns.tolist()
_        = cat_cols.remove('Age')
_        = cat_cols.remove('Policy_Tenure')

for cols in cat_cols:
    train_df[cols] = train_df[cols].astype(str)
    test_df[cols]  = test_df[cols].astype(str)



In [15]:
cate_features_index = np.where(train_df.dtypes == object)[0]
cate_features_index

array([ 1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14])

## Class Imbalance of data and Model development

* We will visualize the class imabalance of the data and we can see that the class imbalance is pretty high 
* We will handle it is two ways - 
    1. Stratified K fold Cross Validation 
    2. Class Imabalance parameter of catboost

In [16]:
print(colored(f'The data imbalance is {dict(Counter(target_y))}','red',attrs=['bold']))

[1m[31mThe data imbalance is {0: 10624, 1: 1455}[0m


In [17]:
train   = train_df.values
test    = test_df.values
train_y = target_y

In [18]:

oof_pred               = np.zeros((len(train),))

y_pred_final           = np.zeros((len(test), ))

n_splits               = 5
thresh                 = 0.45
feat_df_dict           = {}
kf                     = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)

for fold, (tr_ind, val_ind) in enumerate(kf.split(train, train_y)):   
    print(colored(f'-------------------------------Performing analysis for fold {fold+1}--------------------------',attrs=['bold']))
    X_train, X_val     = train[tr_ind], train[val_ind]
    y_train, y_val     = train_y[tr_ind], train_y[val_ind]

    
    model              = CatBoostClassifier(n_estimators=100, random_state=13, 
                                            verbose=False,
                                            scale_pos_weight = np.sqrt(10624/1455))

    _                  = model.fit(X_train,y_train,cat_features = cate_features_index)
    
    val_pred           = model.predict_proba(X_val)[:,1]
    
    F1score            = f1_score(y_val,np.where(val_pred>thresh,1,0))
    
    
    
    feat_df    = pd.DataFrame(columns = ['features'])
    feat_df['features'] = train_df.columns
    feat_df[f'importances_Fold{fold+1}'] = model.feature_importances_
    # display(feat_df.sort_values(by = ['importances'],ascending=False).reset_index(drop=True))
    feat_df_dict[fold] = feat_df
    

    print(colored(f'The classification report on the train data is below '))
    train_predictions = model.predict_proba(X_train)[:,1]
    print(colored(classification_report(y_train, np.where(train_predictions>thresh,1,0) ),'blue',attrs=['bold']))
    print(colored(f'The confusion matrix on the train data is below '))
    print(colored(confusion_matrix(y_train, np.where(train_predictions>thresh,1,0) ),'blue',attrs=['bold']))
    

    
    print(colored(f'The classification report on the test data is below '))
    print(colored(classification_report(y_val,  np.where(val_pred>thresh,1,0)),'green',attrs=['bold']))
    print(colored(f'The confusion matrix on the test data is below '))
    print(colored(confusion_matrix(y_val, np.where(val_pred>thresh,1,0)),'green',attrs=['bold']))
    
    print(colored(f'Fold {fold+1}, Validation F1 Score {F1score}','red',attrs=['bold']))
    oof_pred[val_ind]  = val_pred
    y_pred_final    += model.predict_proba(test)[:,1] / (n_splits)
    print(colored(f'----------------------------------------------------------------------------------------',attrs=['bold']))
print('\n')
print('OOF(Cross-Validation) F1 Score :- ',(f1_score(train_y,np.where(oof_pred>thresh,1,0))))

[1m-------------------------------Performing analysis for fold 1--------------------------[0m
The classification report on the train data is below [0m
[1m[34m              precision    recall  f1-score   support

           0       0.94      0.87      0.91      8499
           1       0.39      0.59      0.47      1164

    accuracy                           0.84      9663
   macro avg       0.66      0.73      0.69      9663
weighted avg       0.87      0.84      0.85      9663
[0m
The confusion matrix on the train data is below [0m
[1m[34m[[7424 1075]
 [ 482  682]][0m
The classification report on the test data is below [0m
[1m[32m              precision    recall  f1-score   support

           0       0.92      0.85      0.89      2125
           1       0.30      0.46      0.37       291

    accuracy                           0.81      2416
   macro avg       0.61      0.66      0.63      2416
weighted avg       0.85      0.81      0.82      2416
[0m
The confusion ma

In [19]:
for folds in list(feat_df_dict.keys()):
    if folds==0:
        feat_df = feat_df_dict[folds]
        
    else:
        feat_df = feat_df.merge(feat_df_dict[folds],on=['features'],how='left')
print(colored('The feature importances accross all the folds is shown below',attrs=['bold']))
feat_df['Average_importance'] = feat_df[[f'importances_Fold{i}' for i in range(1,n_splits+1)]].mean(axis=1)
display(feat_df.sort_values(by=['Average_importance'],ascending=False))


[1mThe feature importances accross all the folds is shown below[0m


Unnamed: 0,features,importances_Fold1,importances_Fold2,importances_Fold3,importances_Fold4,importances_Fold5,Average_importance
7,ProductName,40.351379,41.94824,40.350653,44.696866,44.260276,42.321483
0,Age,8.760662,11.064767,10.067098,9.578266,9.897317,9.873622
4,Subject_Car_Make,6.826275,9.200303,9.697825,8.375784,9.484114,8.71686
5,LGA_Name,4.956166,6.097612,7.702406,6.30358,5.861096,6.184172
3,Subject_Car_Colour,7.361339,5.301833,5.511147,6.896932,4.871541,5.988558
2,Car_Category,5.657667,5.719546,5.918825,5.83766,5.777154,5.782171
6,State,4.558756,4.90661,3.412249,3.658115,4.230739,4.153294
8,Policy_Tenure,2.878589,3.226279,3.45951,2.848632,2.921872,3.066977
11,Policy Start Date_day_of_week,6.144338,2.763581,2.405962,1.261096,2.393971,2.993789
1,No_Pol,3.246664,2.040087,3.208343,2.731871,2.44301,2.733995


In [20]:
final_predictions = np.where(y_pred_final>thresh,1,0)
final_df = pd.read_csv('Test.csv')
final_df['target'] = final_predictions

final_data         = final_df[['ID','target']]
final_data['target'].value_counts(normalize=True)

0    0.841098
1    0.158902
Name: target, dtype: float64

In [21]:
final_data.to_csv('attempt_1.csv',index=False)

## Submission Time

We will download this csv file and submit it and see the results on public leaderboard. 


## Ways to Improve

We need to make sure we do not overfit on private leaderboard and for that we can ensure the following steps - 
1. Have a good validation strategy: While we applied Stratified K fold above, we can either find a better validation strategy than this or make sure we are able to find a validation set which is similar to test data. In short find a way to balance **LB vs CV score**
2. Parameter tuning: We tried CatBoostClassifier over here, we can use parameter tuning to make sure our model fits well on the train data. Some common parameters to tune in catboost only is reg_lambda/reg_alpha (both of which are regularization parameters)
3. Trying out different models: Catboost was one of the models I tried, you can try other models like xgboost, lightgbm, random forest, logistic regression etc. 
4. Creating more features: We can create more and more features like the ones I suggest below and see how the performance increases. 
5. Tune the threshold using roc_curve (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html)
6. Use Internet: Use internet to see how such similar problems were solved. I link the notebooks of the competition previously hosted and you can take hints from the notebooks - 
    * https://deepnote.com/@abid/AutoInland-Vehicle-Insurance-Claim-fd06923d-6c6f-4386-b84b-713b5911f32f
    * https://github.com/harshad317/AutoInland-Vehicle-Insurance-Claim-Challenge
    * https://github.com/damolaoriola/Zindi-AutoInland-Vehicle-Insurance
    

## Some feature engineering techniques that you can use 

## Aggregate feature generation from train data

We give one example of such feature where we try to find out the most Popular product in a specific state. We then use it as a feature in our dataset. 

**Note:** Be very careful while creating the aggregate features. The aggregate features must be created only from train dataset. One has to remember that the test dataset is something that we have not even seen till now. So don't create aggregate features from test dataset because unknowingly it may lead to data leakages and the result may not be useful. 

In [23]:
agg_df = train_df.groupby(['State','ProductName'],dropna=True).agg({'ProductName':'count'}).rename({'ProductName':'POPULAR_PRODUCT_STATE'},axis=1)
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,POPULAR_PRODUCT_STATE
State,ProductName,Unnamed: 2_level_1
ABULE-EGBA,Car Classic,6
AJAO-ESTATE,Car Classic,1
AJAO-ESTATE,Car Plus,1
Aba-North,Car Classic,3
Aba-South,Car Classic,1
...,...,...
,CarFlex,159
,CarSafe,3438
,Customized Motor,23
,Motor Cycle,3


In [24]:
train_df = train_df.merge(agg_df, on=['State','ProductName'],how='left')
test_df  = test_df.merge(agg_df, on=['State','ProductName'],how='left')

print(colored('After adding one aggregate feature, the train data looks like this ','blue',attrs=['bold']))
display(train_df)
print('\n -------------------------------------------------------------------------------------')
print(colored('After adding one aggregate feature, the test data looks like this ','green',attrs=['bold']))
display(test_df)

[1m[34mAfter adding one aggregate feature, the train data looks like this [0m


Unnamed: 0,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week,POPULAR_PRODUCT_STATE
0,30,1,Saloon,Black,TOYOTA,,,Car Classic,364,5,2010,4,5,2011,4,2605
1,79,1,JEEP,Grey,TOYOTA,,,Car Classic,364,11,2010,0,11,2011,0,2605
2,43,1,Saloon,Red,TOYOTA,,,Car Classic,364,3,2010,6,3,2011,6,2605
3,2,1,,,,,,CarSafe,364,8,2010,5,8,2011,5,3438
4,20,3,,,,Lagos,Lagos,Muuve,124,8,2010,6,12,2010,4,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,30,1,,Black,Range Rover,Ibeju/Lekki,Ibeju-Lekki,Car Classic,364,5,2010,1,5,2011,1,104
12075,59,1,,,,,,Car Classic,337,3,2010,2,2,2011,3,2605
12076,34,1,,,,,,CarSafe,304,10,2010,6,8,2011,2,3438
12077,120,2,,White,TOYOTA,Victoria Island,Lagos,CVTP,364,2,2010,5,2,2011,5,315



 -------------------------------------------------------------------------------------
[1m[32mAfter adding one aggregate feature, the test data looks like this [0m


Unnamed: 0,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Policy_Tenure,Policy Start Date_month,Policy Start Date_year,Policy Start Date_day_of_week,Policy End Date_month,Policy End Date_year,Policy End Date_day_of_week,POPULAR_PRODUCT_STATE
0,46,1,,,Ford,Abuja Municipal,Abuja-Municipal,Car Classic,364,10,2010,5,10,2011,5,161.0
1,32,1,,,,Kosofe,Benue,Car Classic,364,10,2010,3,10,2011,3,513.0
2,45,2,Saloon,Black,Honda,Wuse 11,Abuja,Car Classic,364,8,2010,6,8,2011,6,56.0
3,58,1,Saloon,,TOYOTA,,,CarSafe,541,6,2010,6,12,2011,1,3438.0
4,120,1,Saloon,Red,Hyundai,Victoria Island,Lagos,Car Classic,539,1,2010,3,6,2011,3,2075.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,67,2,JEEP,Black,TOYOTA,Victoria Island,Lagos,Car Classic,335,5,2010,2,4,2011,1,2075.0
1198,43,1,Saloon,Silver,Hyundai,Surulere,Lagos,Car Classic,364,1,2010,3,1,2011,3,2075.0
1199,30,1,Truck,White,Iveco,Victoria Island,Lagos,CVTP,364,7,2010,0,7,2011,0,315.0
1200,44,2,Saloon,,Nissan,Aba North,Aba-North,Car Classic,364,2,2010,1,2,2011,1,3.0


## Adding features using Bayes theorem 
Let us visit our high school mathematics classes and create features using Bayes theorem. 
<br>

Let A be the event defined as 

$A:=$ customer will submit a claim. 



Let us now consider a categorical column - ProductName.

We will have prior probabilities of each product like $P(CarSafe)$, $P(Car Classic)$, etc. 

From the given data, we have the $P(Car Classic/A)$, $P(CarSafe/A)$, etc. 


We will now find the probability of A ie., $P(A)$. 

We will just need to evaluate $P(A/Car Classic)$, $P(A/CarSafe)$ and so on. We use Bayes Theorem for this purpose.


In [25]:
def create_features(colname):

    ###### We find the number of observations corresponding to a particular attribute in a given class
    count_dict                    = dict(Counter(train_data[colname]))
    total_values                  = sum(list(count_dict.values()))

    unique_values                 = train_data[colname].unique().tolist()

    prior_probabilities           = {}
    
    likelihood_dict = {}

    for vals in unique_values:
        favourable_events         = train_data[((train_data[colname]==vals) & (train_data['target']==1))].shape[0]
        total_occurances          = train_data[train_data['target']==1].shape[0]
        likelihood_dict[vals]     = favourable_events/total_occurances
        ############# We calculate the P(product/A) here #################
    
        prior_probabilities[vals] = count_dict[vals]/total_values
        ############ We calculate the P(product) here ####################
    
    Prob_A                        = 0
    for vals in unique_values:
        Prob_A += prior_probabilities[vals]*likelihood_dict[vals]
        ######## We will be able to find the P(A/product) and use the mapping as a feature
    

    posterior_prob                = {}
    for vals in unique_values:
        posterior_prob[vals]      = (prior_probabilities[vals]*likelihood_dict[vals])/Prob_A
    
    print(colored(f'The posterior probabilities are given as below - ','green',attrs=['bold']))
    print(posterior_prob)
    return posterior_prob


In [26]:
train_data      = pd.read_csv('Train.csv')
test_data       = pd.read_csv('Test.csv')
posterior_probs = create_features('ProductName')

[1m[32mThe posterior probabilities are given as below - [0m
{'Car Classic': 0.972957772686374, 'CarSafe': 0.007380905782813117, 'Muuve': 0.0008444540740613463, 'CVTP': 0.007862014108878837, 'Car Plus': 0.006992834084261847, 'Motor Cycle': 5.140762601399098e-05, 'Customized Motor': 0.0031976102159246016, 'CarFlex': 0.0007130014216723096, 'Car Vintage': 0.0}


In [None]:
train_data['Bayes_ProductName'] = train_data['ProductName'].map(posterior_probs)
test_data['Bayes_ProductName']  = test_data['ProductName'].map(posterior_probs)
train_data[['ProductName','Bayes_ProductName','target']].sample(10)

Unnamed: 0,ProductName,Bayes_ProductName,target
812,Car Classic,0.972958,0
5938,Car Classic,0.972958,0
1938,Car Classic,0.972958,0
11374,CarSafe,0.007381,0
5715,CarSafe,0.007381,0
171,Car Classic,0.972958,0
7776,Muuve,0.000844,1
543,Car Plus,0.006993,0
5245,Car Classic,0.972958,1
9818,CarSafe,0.007381,0


In [None]:
test_data[['ProductName','Bayes_ProductName']].sample(10)

Unnamed: 0,ProductName,Bayes_ProductName
1144,Car Classic,0.972958
1131,Customized Motor,0.003198
696,CarSafe,0.007381
663,CarSafe,0.007381
70,Customized Motor,0.003198
56,Car Classic,0.972958
170,CarSafe,0.007381
540,CarSafe,0.007381
123,Muuve,0.000844
356,CarSafe,0.007381
