# Insurance Company — Caravan Prediction (Refactor)



**EQUIPO 10**

**ALUMNOS:**

Gabriela Lizeth Moreno Hernández - A01796677

Alejandro Chávez Campos  -  A01374974

Carlos Armando Alcántar Sánchez - A01311747

Luis Fernando Caporal Montes de Oca  -  A01795898

Yamil Abraham Nechar Reyes - A01795726

In [1]:
# Imports principales
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report, confusion_matrix

import joblib
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# pandas / scikit-learn that may be relevant (deprecations, convergence
# problems); consider narrowing it to specific categories — confirm intent.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print('Libraries loaded')

Libraries loaded


In [2]:
# Load the data
# The CSV path is relative, so it is resolved against the current working directory.
data_path = Path('insurance_company_modified.csv')
# Fail early with an explicit message instead of letting read_csv raise later.
if not data_path.exists():
    raise FileNotFoundError(f"Archivo no encontrado en {data_path}. Coloca el CSV en la misma carpeta o actualiza la ruta.")

# Read with pandas defaults: the first row of the file is used as the header.
df = pd.read_csv(data_path)
print('Shape:', df.shape)
# Last expression of the cell: renders the first five rows.
df.head()

Shape: (5937, 87)


Unnamed: 0,33,1,3,2,8,0,5,1.1,3.1,7,...,0.38,0.39,1.13,0.40,0.41,0.42,0.43,0.44,0.45,mixed_type_col
0,37.0,1.0,2.0,2.0,8.0,1.0,4.0,?,4.0,6.0,...,0.0,0.0,1.0,invalid,0.0,0.0,0.0,0.0,0.0,108
1,37.0,1.0,2.0,2.0,8.0,0.0,4.0,2.0,4.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,unknown
2,9.0,1.0,3.0,3.0,3.0,2.0,3.0,2.0,4.0,5.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,904
3,40.0,1.0,4.0,2.0,10.0,1.0,4.0,1.0,4.0,7.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bad
4,23.0,1.0,2.0,1.0,5.0,0.0,5.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,271


In [3]:
# Quick overview of the data
def quick_overview(df, n=5):
    """Show a quick structural summary of a DataFrame.

    Prints the number of columns, renders the first ``n`` rows, prints the
    ``DataFrame.info()`` report and renders the transposed
    ``describe(include='all')`` table.

    ``display`` is expected to be provided by the notebook environment
    (IPython exposes it as a builtin).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    n : int, default 5
        Number of leading rows to show.

    Returns
    -------
    None
    """
    print('Columnas:', df.shape[1])
    display(df.head(n))
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in display() rendered a stray "None".
    df.info()
    display(df.describe(include='all').T)

# Run the overview on the loaded frame, using the default number of preview rows.
quick_overview(df)

Columnas: 87


Unnamed: 0,33,1,3,2,8,0,5,1.1,3.1,7,...,0.38,0.39,1.13,0.40,0.41,0.42,0.43,0.44,0.45,mixed_type_col
0,37.0,1.0,2.0,2.0,8.0,1.0,4.0,?,4.0,6.0,...,0.0,0.0,1.0,invalid,0.0,0.0,0.0,0.0,0.0,108
1,37.0,1.0,2.0,2.0,8.0,0.0,4.0,2.0,4.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,unknown
2,9.0,1.0,3.0,3.0,3.0,2.0,3.0,2.0,4.0,5.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,904
3,40.0,1.0,4.0,2.0,10.0,1.0,4.0,1.0,4.0,7.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,bad
4,23.0,1.0,2.0,1.0,5.0,0.0,5.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,271


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5937 entries, 0 to 5936
Data columns (total 87 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   33              5869 non-null   object
 1   1               5864 non-null   object
 2   3               5873 non-null   object
 3   2               5868 non-null   object
 4   8               5877 non-null   object
 5   0               5863 non-null   object
 6   5               5858 non-null   object
 7   1.1             5874 non-null   object
 8   3.1             5875 non-null   object
 9   7               5876 non-null   object
 10  0.1             5871 non-null   object
 11  2.1             5871 non-null   object
 12  1.2             5871 non-null   object
 13  2.2             5852 non-null   object
 14  6               5877 non-null   object
 15  1.3             5866 non-null   object
 16  2.3             5874 non-null   object
 17  7.1             5874 non-null   object
 18  1.4     

None

Unnamed: 0,count,unique,top,freq
33,5869,154,33.0,768
1,5864,81,1.0,4953
3,5873,75,3.0,2513
2,5868,85,3.0,2806
8,5877,79,8.0,1478
...,...,...,...,...
0.42,5889,42,0.0,5415
0.43,5876,40,0.0,5523
0.44,5875,39,0.0,5444
0.45,5858,31,0.0,5173


In [7]:
# Identify missing values
def identificar_valores_nulos(df):
    """Print a report of missing values per column.

    First prints the null count of every column, then one line per column
    that has at least one null, with its count and its percentage over the
    total number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    conteo_nulos = df.isna().sum()
    n_filas = len(df)

    print("Valores nulos por columna:")
    print(conteo_nulos)
    print("\n")

    # Only columns with at least one null get a detail line.
    columnas_con_nulos = conteo_nulos[conteo_nulos > 0]
    for nombre, cantidad in columnas_con_nulos.items():
        pct = (cantidad / n_filas) * 100
        print(f"Columna '{nombre}': {cantidad} nulos ({pct:.2f}%)")
# Report the missing values of the loaded frame.
identificar_valores_nulos(df)

Valores nulos por columna:
33                 68
1                  73
3                  64
2                  69
8                  60
                 ... 
0.42               48
0.43               61
0.44               62
0.45               79
mixed_type_col    593
Length: 87, dtype: int64


Columna '33': 68 nulos (1.15%)
Columna '1': 73 nulos (1.23%)
Columna '3': 64 nulos (1.08%)
Columna '2': 69 nulos (1.16%)
Columna '8': 60 nulos (1.01%)
Columna '0': 74 nulos (1.25%)
Columna '5': 79 nulos (1.33%)
Columna '1.1': 63 nulos (1.06%)
Columna '3.1': 62 nulos (1.04%)
Columna '7': 61 nulos (1.03%)
Columna '0.1': 66 nulos (1.11%)
Columna '2.1': 66 nulos (1.11%)
Columna '1.2': 66 nulos (1.11%)
Columna '2.2': 85 nulos (1.43%)
Columna '6': 60 nulos (1.01%)
Columna '1.3': 71 nulos (1.20%)
Columna '2.3': 63 nulos (1.06%)
Columna '7.1': 63 nulos (1.06%)
Columna '1.4': 82 nulos (1.38%)
Columna '0.2': 67 nulos (1.13%)
Columna '1.5': 78 nulos (1.31%)
Columna '2.4': 85 nulos (1.43%)
Columna '5.1': 52

In [8]:
# Data cleaning
def quitar_nulos(df):
    """Return a new DataFrame without the rows that contain missing values.

    ``DataFrame.dropna()`` does not modify the frame in place: it returns a
    new frame. Rebinding the local name ``df`` inside the function has no
    effect on the caller, so the cleaned frame must be returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` keeping only the rows with no null values.
    """
    return df.dropna()

# Bind the result to a new name so the raw frame stays available and the cell
# is idempotent on re-run.
# NOTE(review): later cells that are meant to work on cleaned data should read
# `df_sin_nulos` instead of `df`.
df_sin_nulos = quitar_nulos(df)
df_sin_nulos.shape
    