In [22]:
import os
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder, FunctionTransformer

In [23]:
def transform_column(column, transformation_dict):
    '''
    Map each value of ``column`` through ``transformation_dict`` and return
    the mapped values as a single-column 2-D array (shape ``(n, 1)``).

    ``column`` must expose ``.map`` (a pandas Series in practice); with a
    Series, values absent from the dictionary come back as NaN.
    '''
    mapped = column.map(transformation_dict)
    as_array = mapped.values
    return as_array.reshape(-1, 1)

In [24]:
# Load the raw cross-sell dataset and order the rows by customer id
df = (
    pd.read_csv('../data/raw/health_insurance_cross_sell.csv')
    .sort_values('id')
)

In [25]:
class HealthInsuranceCrossSell:
    '''
    Inference pipeline for the health-insurance cross-sell model.

    The stages are meant to be called in order on the same instance:
    ``data_cleaning`` -> ``feature_engineering`` -> ``data_preparation`` ->
    ``get_prediction``. ``data_cleaning`` stores the ids of the rows it keeps
    and ``get_prediction`` relies on them, so one instance should score one
    dataset at a time.
    '''

    def __init__(self, home_path='../src/features/'):
        '''
        Load the fitted encoders and scalers from ``home_path``.

        home_path: directory holding ``encoders.pkl`` and ``scalers.pkl``.
        '''
        self.home_path = home_path
        # NOTE: unpickling executes arbitrary code; only load trusted artifacts.
        self.encoders = self._load_pickle('encoders.pkl')
        self.scalers = self._load_pickle('scalers.pkl')
        # Columns handed to the model, in this order
        self.final_features = [
            'age', 'gender', 'famous_region', 'vehicle_damage', 'vehicle_age',
            'policy_sales_channel2_124', 'policy_sales_channel2_152',
            'policy_sales_channel2_26'
        ]
        # Ids of the rows kept by data_cleaning (filled in by that method)
        self.id = []

    def _load_pickle(self, filename):
        '''Unpickle ``filename`` from ``self.home_path`` and close the file afterwards.'''
        with open(os.path.join(self.home_path, filename), 'rb') as artifact:
            return pickle.load(artifact)

    def data_cleaning(self, df):
        '''
        Keep only the rows with ``driving_license == 1`` and drop that column.

        Side effect: stores the ``id`` values of the kept rows in ``self.id``.
        Returns a new DataFrame; the input is not modified.
        '''
        # Filter who doesn't have driving license
        df = df[df['driving_license'] == 1].drop('driving_license', axis=1)

        # Saving id (get_prediction pairs the model output with these ids)
        self.id = df['id'].values

        return df

    def feature_engineering(self, df):
        '''
        Add the derived columns ``famous_region``, ``policy_sales_channel2``,
        ``vehicle_age2``, ``famous_policy_sales_channel`` and
        ``hi_customer_profitability``.

        Returns a new DataFrame; the caller's frame is left untouched.
        '''
        # Work on a copy so the columns added below do not leak into the input
        df = df.copy()

        # Famous region: 1 when region_code is 28, else 0
        df['famous_region'] = (df['region_code'] == 28).astype('int64')

        # Policy sales channel 2: keep channels 152, 26 and 124 as strings,
        # collapse every other channel into 'others'
        top_channel = df['policy_sales_channel'].isin([152, 26, 124])
        df['policy_sales_channel2'] = df['policy_sales_channel'].astype('int64').astype(str)
        df.loc[~top_channel, 'policy_sales_channel2'] = 'others'

        # Vehicle age 2: 1 for vehicles reported as '1-2 Year' or '> 2 Years'
        df['vehicle_age2'] = (
            df['vehicle_age'].isin(['1-2 Year', '> 2 Years']).astype('int64')
        )

        # Famous policy sales channel: 1 for channels 152, 26 and 124
        df['famous_policy_sales_channel'] = top_channel.astype('int64')

        # Health insurance customer profitability: annual_premium / vintage,
        # with NaN results replaced by 0.
        # NOTE(review): a zero vintage with a non-zero premium yields inf, which
        # fillna leaves in place - confirm vintage is never 0 in the input.
        df['hi_customer_profitability'] = df['annual_premium'].div(df['vintage']).fillna(0)

        return df

    def data_preparation(self, df):
        '''
        Apply the loaded encoders and scalers and return the model inputs.

        ``self.encoders`` and ``self.scalers`` must expose ``transform`` and
        ``get_feature_names_out`` (presumably fitted sklearn ColumnTransformers -
        verify against the training code). The result holds only the columns
        listed in ``self.final_features``.
        '''
        # Gender, vehicle damage, vehicle age and policy sales channel 2
        df = self.encoders.transform(df)
        df = pd.DataFrame(df, columns=self.encoders.get_feature_names_out())

        # Keep only the columns passed on to the scalers, in this order.
        # 'response' is part of the list, so the input frame must contain it.
        scaler_vars = [
            'id', 'age', 'vehicle_damage', 'annual_premium', 'vintage',
            'famous_region', 'vehicle_age', 'vehicle_age2',
            'hi_customer_profitability', 'famous_policy_sales_channel',
            'policy_sales_channel2_124', 'policy_sales_channel2_152',
            'policy_sales_channel2_26', 'gender', 'response'
        ]

        df = df[scaler_vars]

        # Scaling
        df = self.scalers.transform(df)
        df = pd.DataFrame(df, columns=self.scalers.get_feature_names_out())

        # Selecting features
        df = df[self.final_features]
        return df

    def get_prediction(self, model, original_data, test_data):
        '''
        Score ``test_data`` with ``model`` and attach the results to
        ``original_data``.

        model: object exposing ``predict`` and ``predict_proba``.
        original_data: frame with an ``id`` column; it is not modified.
        test_data: model inputs whose rows line up with ``self.id`` (the ids
            stored by the last ``data_cleaning`` call).

        Returns ``original_data`` left-merged on ``id`` with the new columns
        ``prediction`` and ``probability`` (probability of the second class).
        '''
        y_pred = model.predict(test_data)
        y_predict_proba = model.predict_proba(test_data)[:, 1]

        pred_df = pd.DataFrame(self.id, columns=['id'])
        pred_df['prediction'] = y_pred
        pred_df['probability'] = y_predict_proba

        original_data = original_data.merge(pred_df, on='id', how='left')

        # Rows whose id is not in self.id (those removed by the driving-license
        # filter in data_cleaning) have no model output; they receive fixed values.
        # NOTE(review): the defaults mark such customers as positive with
        # probability 0.9999 - confirm this matches the business rule.
        original_data['prediction'] = original_data['prediction'].fillna(1)
        original_data['probability'] = original_data['probability'].fillna(0.9999)

        return original_data

In [26]:
# Work on a copy so that `df` keeps the data exactly as loaded
df_aux = df.copy()

# Load the trained model; the context manager closes the file once it is read.
# NOTE: unpickling executes arbitrary code - only load trusted model files.
with open('../src/model/best-model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [27]:
# Build the pipeline (loads the pickled encoders and scalers)
pipeline = HealthInsuranceCrossSell()

# Run the stages in order: clean -> engineer features -> encode/scale -> score
df1 = pipeline.data_cleaning(df=df_aux)
df2 = pipeline.feature_engineering(df=df1)
df3 = pipeline.data_preparation(df=df2)
df4 = pipeline.get_prediction(model=model, original_data=df_aux, test_data=df3)

In [28]:
# Display the scored dataset: the original columns plus `prediction` and `probability`
df4

Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,response,prediction,probability
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1,1.0,0.844178
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0,0.0,0.005304
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1,1.0,0.849236
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0,0.0,0.019185
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0,0.0,0.018057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508141,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56,0,0.0,0.015344
508142,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165,0,1.0,0.771266
508143,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74,0,0.0,0.019185
508144,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265,0,0.0,0.009722
