# 1. IMPORTS
---

## 1.1. Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder, FunctionTransformer
import pickle

## 1.2. Functions

In [2]:
def transform_column(column, transformation_dict):
    '''
    Map every value of `column` through `transformation_dict` and return
    the result as a 2-D array with a single column.

    column: object exposing `.map` (e.g. a pandas Series).
    transformation_dict: mapping from original values to encoded values.
    '''
    mapped = column.map(transformation_dict)
    # reshape(-1, 1) turns the flat values into shape (n_rows, 1)
    as_column_vector = mapped.values.reshape(-1, 1)
    return as_column_vector

# 2. DATA PREPARATION
---

In [3]:
# Load the dataset stored in the interim data folder
interim_csv_path = '../data/interim/hi_cs_processed.csv'
dfp = pd.read_csv(interim_csv_path)

## 2.1. Creating features

In [4]:
# Policy sales channel 2: keep the channel code (as a string) for channels
# 152, 26 and 124, and label every other channel as 'others'
kept_channels = [152, 26, 124]
channel_codes = dfp['policy_sales_channel']
dfp['policy_sales_channel2'] = (
    channel_codes.astype('int64').astype(str)
    .where(channel_codes.isin(kept_channels), 'others')
)

# One hot encoder of policy_sales_channel2 ('others' is the dropped category)
psc2_ohe = OneHotEncoder(drop=['others'], handle_unknown='ignore', dtype='int64')

## 2.2. Encoders

In [5]:
# Value mappings used by the dictionary-based encoders
vd_dict = {'Yes': 1, 'No': 0}                              # vehicle damage
g_dict = {'Male': 0, 'Female': 1}                          # gender (0 = Male, 1 = Female)
va_dict = {'< 1 Year': 1, '1-2 Year': 2, '> 2 Years': 3}   # vehicle age

# Keyword arguments handed to transform_column by each transformer
vd_args = {'transformation_dict': vd_dict}
g_args = {'transformation_dict': g_dict}
va_args = {'transformation_dict': va_dict}

# Settings shared by the three transformers: same function, and output
# feature names equal to the input feature names
shared_ft_options = {'func': transform_column, 'feature_names_out': 'one-to-one'}

vd_transformer = FunctionTransformer(kw_args=vd_args, **shared_ft_options)
g_transformer = FunctionTransformer(kw_args=g_args, **shared_ft_options)
va_transformer = FunctionTransformer(kw_args=va_args, **shared_ft_options)

In [6]:
# Encoding steps as (step name, transformer, column selector) triples
transformers = [
    ('psc2_ohe', psc2_ohe, ['policy_sales_channel2']),
    ('vd_transformer', vd_transformer, 'vehicle_damage'),
    ('g_transformer', g_transformer, 'gender'),
    ('va_transformer', va_transformer, 'vehicle_age'),
]

# Column transformer: columns without an encoder are passed through
# unchanged, and output names are not prefixed with the step name
encoders = ColumnTransformer(
    transformers=transformers,
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Fit and apply the encoders, then rebuild a DataFrame with the output names
encoded_values = encoders.fit_transform(dfp)
encoded_columns = encoders.get_feature_names_out()
dfp = pd.DataFrame(encoded_values, columns=encoded_columns)

In [7]:
# Saving the encoders with pickle.
# The context manager closes the file handle (and flushes the write) even if
# pickle.dump raises; the bare open() call used before never closed it.
# NOTE(review): the FunctionTransformers inside `encoders` wrap
# `transform_column`, which is defined in this notebook. pickle stores
# functions by reference, so whatever loads encoders.pkl presumably needs an
# importable `transform_column` - verify against the loading code.
with open('../src/features/encoders.pkl', 'wb') as encoders_file:
    pickle.dump(encoders, encoders_file)

I'll test both `vehicle_age` and `vehicle_age2` with feature importance methods, so I'll keep both for now and drop one of them afterwards.

## 2.3. Dropping some redundant features

In [8]:
# Columns to keep, in the desired order; columns not listed here
# (e.g. region_code and previously_insured) are dropped
wanted_vars = [
    'id',
    'age',
    'vehicle_damage',
    'annual_premium',
    'vintage',
    'famous_region',
    'vehicle_age',
    'vehicle_age2',
    'hi_customer_profitability',
    'famous_policy_sales_channel',
    'policy_sales_channel2_124',
    'policy_sales_channel2_152',
    'policy_sales_channel2_26',
    'gender',
    'response',
]

dfp = dfp.loc[:, wanted_vars]

There are 4 continuous and **10 categorical features**. Maybe **CatBoost** would be a good choice of algorithm!

In [9]:
# Saving the data before the train/test division (row index is not written)
pre_training_csv_path = '../data/interim/hi_cs_pre_training.csv'
dfp.to_csv(pre_training_csv_path, index=False)

## 2.4. Train and test datasets

In [10]:
# X: every column except the target; y: the target column
target_column = 'response'
X = dfp.drop(columns=target_column)
y = dfp[target_column]

In [11]:
# Train and test datasets: 20% of the rows go to test, the split is
# stratified on y and seeded for reproducibility
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Saving X_train, y_train, X_test and y_test (row index is not written)
split_outputs = [
    (X_train, '../data/processed/X_train.csv'),
    (y_train, '../data/processed/y_train.csv'),
    (X_test, '../data/processed/X_test.csv'),
    (y_test, '../data/processed/y_test.csv'),
]
for split_data, csv_path in split_outputs:
    split_data.to_csv(csv_path, index=False)

## 2.5. Rescaling

I'll use two methods to rescale the data: **Min Max Scaler** and **Robust Scaler**. Apparently, none of the continuous variables has a Normal distribution. If a variable has a lot of outliers, Robust Scaler will be the better choice; otherwise, I'll use Min Max.

**SCALERS DEFINED:**

- **Min Max Scaler:**
    - `age`
    - `vintage`
- **Robust Scaler:**
    - `annual_premium`
    - `hi_customer_profitability`

In [13]:
# Define the scalers and the columns they should be applied to
scaler_specs = [
    (MinMaxScaler(), ['age', 'vintage']),
    (RobustScaler(), ['annual_premium', 'hi_customer_profitability'])
]

# Define the column transformer with specified scalers.
# Each step is named after its scaler class in lower case, which gives the
# (name, transformer, columns) triples that ColumnTransformer expects.
transformers = [(scaler.__class__.__name__.lower(), scaler, cols) for scaler, cols in scaler_specs]
scalers = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Apply rescaling to columns: fit on the training set only and reuse the
# fitted scalers on the test set
X_train_rescaled = scalers.fit_transform(X_train)
X_test_rescaled = scalers.transform(X_test)

# Get feature names. The transformer prefixes each output name with
# "<step name>__"; split only once so that a column whose own name contains
# "__" is not truncated (a plain split('__')[1] would cut it short).
X_train_cols = [name.split('__', 1)[1] for name in scalers.get_feature_names_out()]

In [14]:
# Saving the scalers with pickle.
# The context manager closes the file handle (and flushes the write) even if
# pickle.dump raises; the bare open() call used before never closed it.
with open('../src/features/scalers.pkl', 'wb') as scalers_file:
    pickle.dump(scalers, scalers_file)

In [15]:
# Saving X_train_rescaled and X_test_rescaled: both arrays get the column
# names in X_train_cols and are written without the row index
rescaled_outputs = [
    (X_train_rescaled, '../data/processed/X_train_rescaled.csv'),
    (X_test_rescaled, '../data/processed/X_test_rescaled.csv'),
]
for rescaled_values, csv_path in rescaled_outputs:
    rescaled_frame = pd.DataFrame(rescaled_values, columns=X_train_cols)
    rescaled_frame.to_csv(csv_path, index=False)