# Autores
Guillermo Luigui Ubaldo Nieto Angarita \
Jairo Antonio Viteri Rojas

## Librerías

In [3]:
import os
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer


## Cargue de datos

In [4]:
train_data_path = os.path.join('data', 'train.csv')
test_data_path = os.path.join('data', 'test.csv')

In [6]:
train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)


## Transformacion de datos

In [7]:
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [8]:
def clean_data(data):
  data.drop(['id','SalesChannelID','VehicleAge','DaysSinceCreated'], axis=1, inplace=True)

  data['AnnualPremium'] = data['AnnualPremium'].str.replace('£', '').str.replace(',', '').astype(float)

  for col in ['Gender', 'RegionID']:
       data[col] = imputer.fit_transform(data[[col]]).flatten()

  data['Age'] = data['Age'].fillna(data['Age'].median())
  data['HasDrivingLicense']= data['HasDrivingLicense'].fillna(1)
  data['Switch'] = data['Switch'].fillna(-1)
  data['PastAccident'] = data['PastAccident'].fillna("Unknown", inplace=False)

  Q1 = data['AnnualPremium'].quantile(0.25)
  Q3 = data['AnnualPremium'].quantile(0.75)
  IQR = Q3 - Q1
  upper_bound = Q3 + 1.5 * IQR
  data = data[data['AnnualPremium'] <= upper_bound]

  return data

In [9]:
train_data = clean_data(train)
test_data = clean_data(test)


In [10]:
train_data.head()

Unnamed: 0,Gender,Age,HasDrivingLicense,RegionID,Switch,PastAccident,AnnualPremium,Result
0,Female,26.0,1.0,40.0,1.0,Yes,1416.5,0
1,Male,22.0,1.0,12.0,1.0,Unknown,1534.05,0
2,Female,41.0,1.0,28.0,-1.0,Unknown,1680.85,0
3,Male,25.0,1.0,10.0,1.0,No,1222.05,0
4,Male,58.0,1.0,28.0,-1.0,Unknown,2109.0,0


## Partición entre Train y Test

In [11]:
def feature_target_separator(data):
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return X, y


In [12]:
X_train, y_train = feature_target_separator(train_data)
X_test, y_test = feature_target_separator(test_data)

## Entrenamiento

En esta función se crea un Pipeline de modelamiento; una función capaz de procesar los datos y guardar un modelo con la mejor versión de entrenamiento.

Previamente deben haber evaluado distintos modelos con PYCARET y haber seleccionado el mejor, al seleccionarlo deben extraer sus hiperparámetros para incluirlos dentro de la función, aquí se esta tomando un RandomForestClassifier con sus hiperparámetros básicos, la idea es que prueben con otros modelos y otras combinaciones.

*Sugerencia: Hacer un Randomized Search sobre el modelo seleccionado por pycaret para escoger los mejores hiperparámetros*

In [13]:
def create_pipeline():
  preprocessor = ColumnTransformer(transformers=[
      ('minmax', MinMaxScaler(), ['AnnualPremium']),
      ('standardize', StandardScaler(), ['Age','RegionID']),
      ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Gender', 'PastAccident']),
  ])

  smote = SMOTE(sampling_strategy=1.0)

  model_map = {
      'RFClassifier': RandomForestClassifier,
      'DecisionTreeClassifier': DecisionTreeClassifier,
      'GradientBoostingClassifier': GradientBoostingClassifier}

  model_class = model_map['RFClassifier']
  model = model_class(
      n_estimators=50,
      max_depth=10,
      min_samples_split=2,
      min_samples_leaf=1,
      random_state=42
  )
  pipeline = Pipeline([
      ('preprocessor', preprocessor),
      ('smote', smote),
      ('model', model)
  ])

  return pipeline

In [14]:
pipeline = create_pipeline()

In [15]:
model = pipeline.fit(X_train, y_train)

In [16]:
dir(model)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_is_fitted__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_can_fit_resample',
 '_can_fit_transform',
 '_can_inverse_transform',
 '_can_transform',
 '_check_feature_names',
 '_check_method_params',
 '_check_n_features',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_params',
 '_get_tags',
 '_iter',
 '_log_

## Guardado del modelo

In [18]:
joblib.dump(model, 'models/model_RF.pkl')

['models/model_RF.pkl']

## Evaluación del modelo

In [19]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [21]:
def evaluate_model(X_test, y_test):
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  class_report = classification_report(y_test, y_pred)
  roc_auc = roc_auc_score(y_test, y_pred)
  return accuracy, class_report, roc_auc


In [22]:
accuracy, class_report, roc_auc_score = evaluate_model(X_test, y_test)

In [23]:
print(f"Accuracy Score: {accuracy:.4f}, ROC AUC Score: {roc_auc_score:.4f}")
print(f"\n{class_report}")


Accuracy Score: 0.6285, ROC AUC Score: 0.7251

              precision    recall  f1-score   support

           0       0.97      0.60      0.74     85582
           1       0.23      0.85      0.36     11752

    accuracy                           0.63     97334
   macro avg       0.60      0.73      0.55     97334
weighted avg       0.88      0.63      0.69     97334

