In [1]:
import pandas as pd
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi

In [2]:
# Build the Kaggle API client and authenticate it; the download cell below
# calls methods on this `api` object.
api = KaggleApi()
api.authenticate()

In [3]:
# Download only test.csv from the cross-sell dataset into ../zip.
# The call's return value is the cell output.
api.dataset_download_file(
    dataset='anmolkumar/health-insurance-cross-sell-prediction',
    file_name='test.csv',
    path='../zip',
)

True

In [4]:
# Unpack the downloaded archive into the data directory; the context manager
# closes the zip file once extraction is done.
with zipfile.ZipFile('../zip/test.csv.zip','r') as zipref:
    zipref.extractall(path='../00-Data')

In [5]:
# Load the extracted test.csv; this frame is the input to the pipeline below.
df_teste = pd.read_csv('../00-Data/test.csv')

In [None]:
import inflection
import bisect
import pandas as pd

In [25]:
class insuranse_cross_sell(object):
    """Cleaning and feature-engineering steps for the insurance cross-sell data.

    Both ``data_cleaning`` and ``feature_engineering`` return a new DataFrame
    and leave their input untouched, so a notebook cell that calls them can be
    re-run on the same frame and produce the same result.
    """

    def __init__(self):
        pass

    def data_cleaning(self, df):
        """Return a copy of ``df`` whose column names are converted to snake_case.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame whose column labels are strings.

        Returns
        -------
        pandas.DataFrame
            A copy of ``df`` with every label passed through
            ``inflection.underscore``. The input frame keeps its original labels.
        """
        cleaned = df.copy()

        # snakecase + rename
        cleaned.columns = [inflection.underscore(col) for col in df.columns]

        return cleaned

    def age_stage(self, num, breakpoints=[10, 20, 30, 45, 60, 70, 80, 120], result='01234567'):
        """Map an age to a life-stage label.

        With the default breakpoints and whole-number ages the buckets are:
        up to 10 'Child', 11-20 'Teenager', 21-30 'Young', 31-45 'Adult',
        46-60 'Midlife', 61-70 'Senior', 71-80 'Mature Adulthood' and
        81 or more 'Late Adulthood'.

        Parameters
        ----------
        num : number
            Age to classify.
        breakpoints : sequence of numbers, sorted ascending
            Inclusive upper bound of each bucket. The sequence is only read.
        result : str
            Unused; kept so existing callers that pass it keep working.

        Returns
        -------
        str or None
            The stage label, or ``None`` when ``num`` is missing (None / NaN).
        """
        labels = (
            'Child',
            'Teenager',
            'Young',
            'Adult',
            'Midlife',
            'Senior',
            'Mature Adulthood',
            'Late Adulthood',
        )

        # A missing age cannot be bucketed; propagate it as missing rather than
        # letting it fall through to the last bucket.
        if pd.isna(num):
            return None

        # `num - 1` makes each breakpoint an inclusive upper bound.
        i = bisect.bisect(breakpoints, num - 1)

        # Ages past the last breakpoint used to raise KeyError; they now fall
        # into the final stage.
        return labels[min(i, len(labels) - 1)]

    @staticmethod
    def _add_ratio_to_group_mean(df, value_col, group_col):
        """Append ``mean_<value>_by_<group>`` and ``<value>_mean_<value>_by_<group>``.

        The first column is the mean of ``value_col`` within each ``group_col``
        value; the second is the row's ``value_col`` divided by that mean.
        The left merge keeps the row order and yields a fresh RangeIndex.
        """
        mean_col = 'mean_{}_by_{}'.format(value_col, group_col)
        group_means = df.groupby(group_col)[value_col].mean().rename(mean_col).reset_index()

        df = pd.merge(df, group_means, how='left', on=group_col)
        df['{}_{}'.format(value_col, mean_col)] = df[value_col] / df[mean_col]
        return df

    def feature_engineering(self, df):
        """Derive the model features from a cleaned (snake_case) frame.

        Reads the columns ``age``, ``vehicle_age``, ``vehicle_damage``,
        ``driving_license``, ``region_code``, ``policy_sales_channel``,
        ``annual_premium`` and ``vintage``.

        Parameters
        ----------
        df : pandas.DataFrame
            Cleaned input frame. It is not modified.

        Returns
        -------
        pandas.DataFrame
            A new frame (RangeIndex, same row order) with ``vehicle_age`` and
            ``vehicle_damage`` integer-encoded and these columns appended:
            ``age_stage``, ``vehicle_damage_license``,
            ``vehicle_damage_vehicle_age``, the four group-mean columns with
            their ratio columns, and ``age_vintage``.
        """
        # Work on a copy. Assigning into the caller's frame meant a second run
        # on the same input re-mapped the already-encoded columns to NaN.
        df = df.copy()

        # Age Stage
        df['age_stage'] = df['age'].apply(self.age_stage)

        # vehicle age
        vehicle_age_mapping = {'< 1 Year':0,
                               '1-2 Year':1,
                               '> 2 Years':2}

        df['vehicle_age'] = df['vehicle_age'].map(vehicle_age_mapping)

        # vehicle damage versus license:
        # (damage + license)^2, negated when there is no driving license
        vehicle_damage_mapping = {'No':0,
                                  'Yes':1}

        df['vehicle_damage'] = df['vehicle_damage'].map(vehicle_damage_mapping)
        damage_license_sq = (df['vehicle_damage'] + df['driving_license']) ** 2
        df['vehicle_damage_license'] = damage_license_sq.where(df['driving_license'] != 0,
                                                               -damage_license_sq)

        # vehicle_damage_vehicle_age:
        # (damage + vehicle age)^2, negated when the vehicle was damaged
        damage_age_sq = (df['vehicle_damage'] + df['vehicle_age']) ** 2
        df['vehicle_damage_vehicle_age'] = damage_age_sq.where(df['vehicle_damage'] != 1,
                                                               -damage_age_sq)

        # value / mean value within group, in the column order callers expect
        ratio_specs = (
            ('age', 'region_code'),
            ('age', 'policy_sales_channel'),
            ('annual_premium', 'region_code'),
            ('annual_premium', 'policy_sales_channel'),
        )
        for value_col, group_col in ratio_specs:
            df = self._add_ratio_to_group_mean(df, value_col, group_col)

        # age (in days) / vintage; a vintage of 0 yields inf
        df['age_vintage'] = (df['age']*365) / df['vintage']

        return df

In [26]:
# Instantiate the pipeline object whose methods are applied in the next cell.
pipeline = insuranse_cross_sell()

In [29]:
# Convert the column names to snake_case, then derive the engineered features.
# feature_engineering reads the snake_case names, so the order matters.
df01 = pipeline.data_cleaning(df_teste)
df02 = pipeline.feature_engineering(df01)

In [30]:
# Preview the first rows of the engineered frame.
df02.head()

Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,...,vehicle_damage_vehicle_age,mean_age_by_region_code,age_mean_age_by_region_code,mean_age_by_policy_sales_channel,age_mean_age_by_policy_sales_channel,mean_annual_premium_by_region_code,annual_premium_mean_annual_premium_by_region_code,mean_annual_premium_by_policy_sales_channel,annual_premium_mean_annual_premium_by_policy_sales_channel,age_vintage
0,381110,Male,25,1,11.0,1,0,0,35786.0,152.0,...,0,33.820736,0.739191,26.203022,0.954088,27798.016796,1.287358,30927.284874,1.157101,172.169811
1,381111,Male,40,1,28.0,0,1,1,33762.0,7.0,...,-4,46.123726,0.867233,55.296959,0.723367,38593.000028,0.874822,37980.703041,0.888925,131.531532
2,381112,Male,47,1,28.0,0,1,1,40050.0,124.0,...,-4,46.123726,1.018998,46.806081,1.004143,38593.000028,1.037753,31588.901745,1.26785,86.20603
3,381113,Male,24,1,27.0,1,0,1,37356.0,152.0,...,-1,30.045045,0.798801,26.203022,0.915925,22209.755631,1.681964,30927.284874,1.207865,46.84492
4,381114,Male,27,1,28.0,1,0,0,59097.0,152.0,...,0,46.123726,0.585382,26.203022,1.030416,38593.000028,1.531288,30927.284874,1.910837,33.181818


In [28]:
import inflection
import bisect