<a href="https://colab.research.google.com/github/DanielCastilloRdz/Codigo_machine_learning/blob/main/modelos_supervisados%5B1%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import warnings
import numpy as np
from sklearn.model_selection import LeaveOneOut, GridSearchCV, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,mean_squared_error, r2_score

# Install a global "ignore" filter so no warning is shown for the rest of the session.
# NOTE(review): this may also hide scikit-learn convergence and failed-fit warnings
# raised by the grid searches below — consider narrowing the filter.
warnings.filterwarnings("ignore")

def verificar_missing(dataframe):
    """
    Summarise the missing values of a DataFrame, column by column.

    A one-line message saying whether any missing value was found is printed
    as a side effect.

    Args:
        dataframe (pd.DataFrame): The DataFrame to inspect.

    Returns:
        pd.DataFrame: One row per column that has at least one missing value,
        holding its count ('Total de Missings') and its share of the rows
        ('Porcentaje (%)'), ordered from most to fewest missing values.
    """
    conteo = dataframe.isnull().sum()
    resumen = pd.DataFrame({
        'Total de Missings': conteo,
        'Porcentaje (%)': 100 * conteo / len(dataframe),
    })

    # Keep only the columns that actually contain gaps, worst first.
    con_faltantes = resumen['Total de Missings'] > 0
    resumen = resumen[con_faltantes].sort_values(by='Total de Missings', ascending=False)

    mensaje = " No hay valores faltantes en el DataFrame." if resumen.empty else "Hay valores faltantes:"
    print(mensaje)

    return resumen


In [None]:
# Read the claims file treating only empty fields as missing; pandas' default
# NA tokens are disabled, so any other text is kept as an ordinary string.
insurance = pd.read_csv(
    "insurance_claims.csv",
    header=0,
    keep_default_na=False,
    na_values=[''],
)

# Summarise which columns contain missing values.
verificar_missing(insurance)

Hay valores faltantes:


Unnamed: 0,Total de Missings,Porcentaje (%)
_c39,1000,100.0


In [None]:

# Columns kept for modelling, in the order the later positional slices rely on:
# the incident date first, then the predictors, then the two targets.
columnas_modelos = [
    'incident_date',
    # categorical predictors
    'insured_sex', 'incident_type', 'collision_type', 'incident_severity',
    'property_damage', 'police_report_available', 'insured_education_level',
    # numeric predictors
    'months_as_customer', 'age', 'policy_deductable', 'policy_annual_premium',
    'umbrella_limit', 'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
    'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'auto_year',
    # targets
    'total_claim_amount', 'fraud_reported',
]

categorical_columns = [
    'insured_sex', 'insured_education_level', 'incident_type', 'collision_type',
    'incident_severity', 'property_damage', 'police_report_available',
]

# Keep only the modelling columns and order the rows by incident date.
# The categorical columns are not cast to the 'category' dtype here.
insurance = (
    insurance[columnas_modelos]
    .sort_values(by='incident_date')
    .reset_index(drop=True)
)

In [None]:
# Class balance of the fraud target.
insurance["fraud_reported"].value_counts()

fraud_reported
N    753
Y    247
Name: count, dtype: int64

In [None]:
# Positional split of the modelling table: column 0 is the incident date,
# columns 1-19 are the predictors and the last two columns are the targets.
columnas = insurance.columns
X = insurance.loc[:, columnas[1:20]]
y_claim = insurance[columnas[-2]]  # total_claim_amount (regression target)
y_fraud = insurance[columnas[-1]]  # fraud_reported (classification target)
print(columnas)

Index(['incident_date', 'insured_sex', 'incident_type', 'collision_type',
       'incident_severity', 'property_damage', 'police_report_available',
       'insured_education_level', 'months_as_customer', 'age',
       'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
       'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
       'auto_year', 'total_claim_amount', 'fraud_reported'],
      dtype='object')


In [None]:
# Show the predictor matrix.
X

Unnamed: 0,insured_sex,incident_type,collision_type,incident_severity,property_damage,police_report_available,insured_education_level,months_as_customer,age,policy_deductable,policy_annual_premium,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,auto_year
0,FEMALE,Multi-vehicle Collision,Front Collision,Minor Damage,?,NO,MD,128,35,500,1366.60,0,0,0,10,3,2,1,2008
1,MALE,Single Vehicle Collision,Side Collision,Major Damage,?,?,Associate,116,34,500,1737.66,0,0,-24100,1,1,1,1,2001
2,FEMALE,Multi-vehicle Collision,Rear Collision,Major Damage,NO,YES,PhD,253,41,2000,1312.75,0,81300,0,10,3,2,2,2014
3,MALE,Single Vehicle Collision,Side Collision,Major Damage,YES,YES,College,45,37,1000,1114.23,0,0,0,1,1,0,1,2005
4,MALE,Vehicle Theft,?,Minor Damage,NO,NO,Masters,389,53,2000,791.47,0,0,0,6,1,1,2,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,FEMALE,Multi-vehicle Collision,Front Collision,Major Damage,YES,YES,JD,140,36,2000,979.26,0,0,-67000,2,4,0,2,1998
996,FEMALE,Multi-vehicle Collision,Front Collision,Major Damage,NO,?,High School,118,28,2000,1207.36,0,0,-57000,22,2,1,0,1997
997,FEMALE,Multi-vehicle Collision,Side Collision,Major Damage,YES,NO,Associate,115,31,1000,1051.67,0,0,0,18,3,0,3,2005
998,FEMALE,Multi-vehicle Collision,Side Collision,Total Loss,NO,NO,MD,312,47,1000,1212.07,0,66900,-51800,17,3,2,3,2000


# PROBLEMA DE CLASIFICACIÓN

In [None]:

def evaluar_modelo(modelo, X, y, nombre_modelo="Modelo", pos_label=None):
    """
    Evaluate a binary classifier with leave-one-out cross-validation.

    Out-of-fold class predictions feed accuracy, precision, recall and F1;
    out-of-fold probabilities of the positive class feed ROC AUC. All
    class-specific metrics refer to the same positive label.

    Args:
        modelo: Unfitted scikit-learn compatible classifier (or pipeline).
        X: Feature matrix accepted by ``modelo``.
        y: Binary target labels.
        nombre_modelo (str): Text stored in the "Modelo" column of the result.
        pos_label: Label treated as the positive class. When ``None`` the last
            label in sorted order is used (e.g. 'Y' for {'N', 'Y'}), which is
            the class scikit-learn places in the last ``predict_proba`` column.
            Previously the first label found in ``y`` was used, so the reported
            precision/recall/F1 depended on row order and could describe a
            different class than the ROC AUC.

    Returns:
        pd.DataFrame: A single row with the columns "Modelo", "Accuracy",
        "Precision", "Recall", "F1 Score" and "ROC AUC". "ROC AUC" is NaN when
        the probabilities cannot be obtained or scored.
    """
    loo = LeaveOneOut()

    # Sorted distinct labels; predict_proba columns follow this same order.
    clases = np.unique(y)
    if pos_label is None:
        pos_label = clases[-1]

    # Out-of-fold class predictions
    y_pred = cross_val_predict(modelo, X, y, cv=loo, n_jobs=-1)

    # Out-of-fold probabilities of the positive class (for ROC AUC)
    try:
        columna_pos = int(np.flatnonzero(clases == pos_label)[0])
        y_scores = cross_val_predict(
            modelo, X, y, cv=loo, method="predict_proba", n_jobs=-1
        )[:, columna_pos]
        roc_auc = roc_auc_score(np.asarray(y) == pos_label, y_scores)
    except Exception:
        # Best effort: models without predict_proba (or degenerate scores) get NaN.
        # `Exception` keeps KeyboardInterrupt/SystemExit propagating.
        roc_auc = np.nan

    # Build a one-row DataFrame
    return pd.DataFrame([{
        "Modelo": nombre_modelo,
        "Accuracy": accuracy_score(y, y_pred),
        "Precision": precision_score(y, y_pred, pos_label=pos_label, zero_division=0),
        "Recall": recall_score(y, y_pred, pos_label=pos_label, zero_division=0),
        "F1 Score": f1_score(y, y_pred, pos_label=pos_label, zero_division=0),
        "ROC AUC": roc_auc
    }])


# Classifiers to tune, keyed by display name. Each value is a pair
# (estimator, parameter grid); grid keys carry the 'model__' prefix because the
# estimator is wrapped in a Pipeline step called 'model'.
# Every stochastic estimator gets a fixed random_state so that the grid search
# and the two leave-one-out passes of evaluar_modelo (labels, then
# probabilities) are reproducible and refer to the same fitted models.
modelos_y_parametros = {
    'LogisticRegression': (
        LogisticRegression(max_iter=3000, solver='liblinear', random_state=42),
        {
            'model__penalty': ['l1', 'l2'],
            'model__C': [0.01, 0.1, 1, 10, 100, 1000, 10000, 1e-5],
            'model__class_weight': [None, 'balanced'],
            'model__fit_intercept': [True, False]
        }
    ),
    'DecisionTree': (
        DecisionTreeClassifier(random_state=42),
        {
            'model__criterion': ['gini', 'entropy', 'log_loss'],
            'model__max_depth': [None, 3, 5, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10, 20],
            'model__min_samples_leaf': [1, 2, 4, 6]
        }
    ),
    'KNN': (
        KNeighborsClassifier(),
        {
            'model__n_neighbors': [1, 3, 5, 7, 9, 11, 15, 21],
            'model__weights': ['uniform', 'distance'],
            'model__p': [1, 2, 3]
        }
    ),
    'SVM': (
        # probability=True is required so that predict_proba exists for ROC AUC.
        SVC(probability=True, kernel='rbf', random_state=42),
        {
            'model__C': [0.01, 0.1, 1, 10, 100, 1000],
            'model__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
        }
    ),
    'RandomForest': (
        RandomForestClassifier(random_state=42),
        {
            'model__n_estimators': [50, 100, 200, 300, 500],
            'model__max_depth': [None, 5, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
            'model__max_features': ['sqrt', 'log2'],
            'model__class_weight': [None, 'balanced']
        }
    ),
    'GradientBoosting': (
        GradientBoostingClassifier(random_state=42),
        {
            'model__n_estimators': [50, 100, 150, 200, 300],
            'model__learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
            'model__max_depth': [2, 3, 4, 5, 6],
            'model__subsample': [0.6, 0.8, 1.0]
        }
    ),
    'AdaBoost': (
        AdaBoostClassifier(random_state=42),
        {
            'model__n_estimators': [50, 100, 150, 200, 300],
            'model__learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0]
        }
    ),
    'MLPClassifier': (
        MLPClassifier(max_iter=500, random_state=42),
        {
            'model__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
            'model__activation': ['relu', 'tanh', 'logistic'],
            'model__solver': ['adam', 'lbfgs'],
            'model__alpha': [0.0001, 0.001, 0.01],
            # NOTE(review): scikit-learn documents learning_rate as used only with
            # solver='sgd'; with the solvers listed above this axis looks redundant.
            'model__learning_rate': ['constant', 'invscaling', 'adaptive']
        }
    )
}


def correr_todos(X, y, modelos=None, cv=3, scoring=None):
    """
    Tune every classifier with a grid search and evaluate the winners.

    For each entry the estimator is wrapped in a single-step Pipeline (step
    name 'model'), tuned with GridSearchCV, and the best estimator is then
    evaluated with ``evaluar_modelo`` (leave-one-out).

    Args:
        X: Feature matrix.
        y: Classification target.
        modelos (dict | None): Mapping ``name -> (estimator, param_grid)``.
            Defaults to the module-level ``modelos_y_parametros``; pass a
            subset to run only some models.
        cv: Cross-validation setting forwarded to GridSearchCV (default 3).
        scoring: Scoring forwarded to GridSearchCV. ``None`` keeps the
            estimator's default score.

    Returns:
        pd.DataFrame: One row per model with the metrics returned by
        ``evaluar_modelo`` plus a "Best Params" column, sorted by "F1 Score"
        in descending order.

    Raises:
        ValueError: If ``modelos`` is empty.
    """
    if modelos is None:
        modelos = modelos_y_parametros
    if not modelos:
        raise ValueError("No hay modelos para evaluar.")

    resultados = []

    for nombre_modelo, (modelo, parametros) in modelos.items():
        print(f"🔍 Buscando mejores hiperparámetros para: {nombre_modelo}")

        pipeline = Pipeline([
            ('model', modelo)
        ])

        grid = GridSearchCV(pipeline, parametros, cv=cv, scoring=scoring, n_jobs=-1)
        grid.fit(X, y)

        mejor_modelo = grid.best_estimator_
        mejores_parametros = grid.best_params_

        # Evaluate with leave-one-out.
        # NOTE(review): the hyper-parameters were chosen on the same rows that are
        # scored here, so these metrics are likely optimistic.
        df_metricas = evaluar_modelo(mejor_modelo, X, y, nombre_modelo=nombre_modelo)
        df_metricas["Best Params"] = [mejores_parametros]

        resultados.append(df_metricas)

    # Stack the one-row frames and rank the models by F1.
    df_resultados = pd.concat(resultados, ignore_index=True)
    df_resultados = df_resultados.sort_values(by="F1 Score", ascending=False)
    return df_resultados


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Split the predictors by position: the first seven columns are categorical,
# the remaining ones are numeric.
cat_cols, num_cols = X.columns[:7], X.columns[7:]

# One-hot encode the categorical columns (dropping the first level of each)
# and pass the numeric columns through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_cols),
        ('num', 'passthrough', num_cols),
    ]
)

# Fit on the full predictor matrix and build the encoded array.
X_procesado = preprocessor.fit_transform(X)

In [None]:
# Tune and evaluate every classifier on the fraud target.
res = correr_todos(X_procesado, y_fraud)

🔍 Buscando mejores hiperparámetros para: LogisticRegression
🔍 Buscando mejores hiperparámetros para: DecisionTree
🔍 Buscando mejores hiperparámetros para: KNN
🔍 Buscando mejores hiperparámetros para: SVM
🔍 Buscando mejores hiperparámetros para: RandomForest
🔍 Buscando mejores hiperparámetros para: GradientBoosting
🔍 Buscando mejores hiperparámetros para: AdaBoost
🔍 Buscando mejores hiperparámetros para: MLPClassifier


In [None]:
# Show the classification results table.
# NOTE(review): the saved output of this cell is a NameError, i.e. it was last run
# in a kernel where `res` did not exist — re-run the cell that assigns `res` first.
res

NameError: name 'res' is not defined

### Problema de regresión: variable continua `total_claim_amount`

In [None]:
def evaluar_modelo_continuas(modelo, X, y, nombre_modelo="Modelo"):
    """
    Evaluate a regressor with leave-one-out cross-validation.

    Args:
        modelo: Unfitted scikit-learn compatible regressor (or pipeline).
        X: Feature matrix accepted by ``modelo``.
        y: Continuous target.
        nombre_modelo (str): Text stored in the "Modelo" column of the result.

    Returns:
        pd.DataFrame: A single row with the columns "Modelo", "MSE", "MAPE"
        (in percent) and "R2". MAPE is averaged over the observations whose
        true value is non-zero, because the percentage error is undefined at
        zero; it is NaN when every true value is zero.
    """
    loo = LeaveOneOut()

    # Out-of-fold predictions
    y_pred = cross_val_predict(modelo, X, y, cv=loo, n_jobs=-1)

    # Regression metrics
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)

    # MAPE in percent, skipping zero targets to avoid dividing by zero.
    y_real = np.asarray(y, dtype=float)
    y_estimado = np.asarray(y_pred, dtype=float)
    no_cero = y_real != 0
    if no_cero.any():
        errores_relativos = (y_real[no_cero] - y_estimado[no_cero]) / y_real[no_cero]
        mape = np.mean(np.abs(errores_relativos)) * 100
    else:
        mape = np.nan

    # Build a one-row DataFrame
    return pd.DataFrame([{
        "Modelo": nombre_modelo,
        "MSE": mse,
        "MAPE": mape,
        "R2": r2
    }])



# Regressors to tune, keyed by display name. Each value is a pair
# (estimator, parameter grid); grid keys carry the 'model__' prefix because the
# estimator is wrapped in a Pipeline step called 'model'.
# All stochastic estimators use random_state=42 so the runs are reproducible.
modelos_y_parametros_continua = {

    'Ridge': (
        # random_state matters for the stochastic 'sag' / 'saga' solvers.
        Ridge(random_state=42),
        {
            # A second 'model__alpha' key used to follow this one and silently
            # reduced the search to [0.01]; the full range is searched now.
            'model__alpha': [0.01, 0.1, 1, 10, 100],
            'model__fit_intercept': [True, False],
            'model__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        }
    ),

    'Lasso': (
        # random_state matters when selection='random'.
        Lasso(max_iter=3000, random_state=42),
        {
            'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'model__fit_intercept': [True, False],
            'model__selection': ['cyclic', 'random']
        }
    ),

    'MLPRegressor': (
        MLPRegressor(max_iter=500, random_state=42),
        {
            'model__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
            'model__activation': ['relu', 'tanh'],
            'model__solver': ['adam', 'lbfgs'],
            'model__alpha': [0.0001, 0.001, 0.01],
            # NOTE(review): scikit-learn documents learning_rate as used only with
            # solver='sgd'; with the solvers listed above this axis looks redundant.
            'model__learning_rate': ['constant', 'invscaling', 'adaptive']
        }
    ),

    'AdaBoostRegressor': (
        AdaBoostRegressor(random_state=42),
        {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 1.0],
            'model__loss': ['linear', 'square', 'exponential'],
        }
    ),

    'GradientBoostingRegressor': (
        GradientBoostingRegressor(random_state=42),
        {
            'model__n_estimators': [100, 200, 300],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7],
            'model__subsample': [0.6, 0.8, 1.0],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 5],
            # 'auto' is no longer accepted by current scikit-learn; it meant
            # "all features", which None already covers.
            'model__max_features': ['sqrt', 'log2', None]
        }
    ),

    'RandomForestRegressor': (
        RandomForestRegressor(random_state=42),
        {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [None, 5, 10, 20],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
            # 1.0 (all features) replaces the removed 'auto' option.
            'model__max_features': [1.0, 'sqrt', 'log2'],
            'model__bootstrap': [True, False]
        }
    )

}

def correr_todos_continuas(X, y, modelos=None, cv=3, scoring=None):
    """
    Tune every regressor with a grid search and evaluate the winners.

    For each entry the estimator is wrapped in a single-step Pipeline (step
    name 'model'), tuned with GridSearchCV, and the best estimator is then
    evaluated with ``evaluar_modelo_continuas`` (leave-one-out).

    Args:
        X: Feature matrix.
        y: Continuous target.
        modelos (dict | None): Mapping ``name -> (estimator, param_grid)``.
            Defaults to the module-level ``modelos_y_parametros_continua``;
            pass a subset to run only some models.
        cv: Cross-validation setting forwarded to GridSearchCV (default 3).
        scoring: Scoring forwarded to GridSearchCV. ``None`` keeps the
            estimator's default score.

    Returns:
        pd.DataFrame: One row per model with the metrics returned by
        ``evaluar_modelo_continuas`` plus a "Best Params" column, sorted by
        "R2" in descending order.

    Raises:
        ValueError: If ``modelos`` is empty.
    """
    if modelos is None:
        modelos = modelos_y_parametros_continua
    if not modelos:
        raise ValueError("No hay modelos para evaluar.")

    resultados = []

    for nombre_modelo, (modelo, parametros) in modelos.items():
        print(f"🔍 Buscando mejores hiperparámetros para: {nombre_modelo}")

        pipeline = Pipeline([
            ('model', modelo)
        ])

        grid = GridSearchCV(pipeline, parametros, cv=cv, scoring=scoring, n_jobs=-1)
        grid.fit(X, y)

        mejor_modelo = grid.best_estimator_
        mejores_parametros = grid.best_params_

        # Evaluate with leave-one-out.
        # NOTE(review): the hyper-parameters were chosen on the same rows that are
        # scored here, so these metrics are likely optimistic.
        df_metricas = evaluar_modelo_continuas(mejor_modelo, X, y, nombre_modelo=nombre_modelo)
        df_metricas["Best Params"] = [mejores_parametros]

        resultados.append(df_metricas)

    # Stack the one-row frames and rank the models by R2.
    df_resultados = pd.concat(resultados, ignore_index=True)
    df_resultados = df_resultados.sort_values(by="R2", ascending=False)
    return df_resultados

In [None]:
# Tune and evaluate every regressor on the claim-amount target.
res2 = correr_todos_continuas(X_procesado, y_claim)

🔍 Buscando mejores hiperparámetros para: Ridge
🔍 Buscando mejores hiperparámetros para: Lasso
🔍 Buscando mejores hiperparámetros para: MLPRegressor
🔍 Buscando mejores hiperparámetros para: AdaBoostRegressor
🔍 Buscando mejores hiperparámetros para: GradientBoostingRegressor
🔍 Buscando mejores hiperparámetros para: RandomForestRegressor


In [None]:
# Show the regression results table (sorted by R2, best first).
res2

Unnamed: 0,Modelo,MSE,MAPE,R2,Best Params
3,AdaBoostRegressor,211112200.0,29.273049,0.696828,"{'model__learning_rate': 0.01, 'model__loss': ..."
1,Lasso,212963300.0,37.506181,0.694169,"{'model__alpha': 10, 'model__fit_intercept': F..."
0,Ridge,213069700.0,37.618254,0.694017,"{'model__alpha': 0.01, 'model__fit_intercept':..."
5,RandomForestRegressor,214413500.0,35.71604,0.692087,"{'model__bootstrap': True, 'model__max_depth':..."
4,GradientBoostingRegressor,214606900.0,39.592009,0.691809,"{'model__learning_rate': 0.01, 'model__max_dep..."
2,MLPRegressor,709420100.0,245.280869,-0.018778,"{'model__activation': 'tanh', 'model__alpha': ..."


In [None]:
# Add the root of the MSE as a new 'RMSE' column.
res2['RMSE'] = np.sqrt(res2['MSE'])

In [None]:
# Show the regression results again, now including the RMSE column.
res2


Unnamed: 0,Modelo,MSE,MAPE,R2,Best Params,RMSE
3,AdaBoostRegressor,211112200.0,29.273049,0.696828,"{'model__learning_rate': 0.01, 'model__loss': ...",14529.700041
1,Lasso,212963300.0,37.506181,0.694169,"{'model__alpha': 10, 'model__fit_intercept': F...",14593.261984
0,Ridge,213069700.0,37.618254,0.694017,"{'model__alpha': 0.01, 'model__fit_intercept':...",14596.905985
5,RandomForestRegressor,214413500.0,35.71604,0.692087,"{'model__bootstrap': True, 'model__max_depth':...",14642.864978
4,GradientBoostingRegressor,214606900.0,39.592009,0.691809,"{'model__learning_rate': 0.01, 'model__max_dep...",14649.468108
2,MLPRegressor,709420100.0,245.280869,-0.018778,"{'model__activation': 'tanh', 'model__alpha': ...",26634.94111
