## 2. Predicción del Costo del Seguro Médico con Regresión Lineal Múltiple

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = pd.read_csv('../csvs/insurance_v2.csv')

#Primero categorizamos las variables no numéricas
encoder = LabelEncoder()
df['sex'] = encoder.fit_transform(df['sex'])
df['smoker'] = encoder.fit_transform(df['smoker'])
df['region'] = encoder.fit_transform(df['region'])
X = df.drop(columns=['charges'])
# Aplicar transformación logarítmica a la variable objetivo
df['charges_log'] = np.log(df['charges'])  
y = df['charges_log']  # Usamos la nueva variable transformada

#y = df['charges']

# Escalar las variables independientes con StandardScaler()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # Devuelve un array NumPy


# Calcular la correlación con la variable objetivo
df_scaled = pd.DataFrame(X_scaled, columns=X.columns) 
df_scaled["charges"] = y  # Agregamos la variable objetivo para calcular correlación
correlaciones = df_scaled.corr()["charges"].sort_values(ascending=False)
print(correlaciones)

# Dividir el dataset en entrenamiento (80%) y prueba (20%) con train_test_split()
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Entrenar el modelo de Regresión Lineal con LinearRegression()
modelo = LinearRegression()
modelo.fit(X_train, y_train)

# Obtener e interpretar los coeficientes y el intercepto del modelo
coeficientes = pd.Series(modelo.coef_, index=X.columns) 
print("Coeficientes del modelo:\n", coeficientes) # Cuando entrenamos un modelo de Regresión Lineal Múltiple, este calcula los coeficientes (b1, b2, b3...bn) que indican la importancia de cada variable en la predicción de la variable objetivo.
print("Intercepto:", modelo.intercept_) # Es el valor de Y cuando todas las variables X son cero.

# Calcular VIF para cada variable
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled, i) for i in range(X_scaled.shape[1])]

print(vif_data)

# Realizar predicciones con modelo.predict(X_test)
y_pred = modelo.predict(X_test)

# Evaluar el modelo 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Mostrar resultados
print(f"MAE (Error Absoluto Medio): {mae:.4f}")
print(f"MSE (Error Cuadrático Medio): {mse:.4f}")
print(f"RMSE (Raíz del Error Cuadrático Medio): {rmse:.4f}")
print(f"R² (Coeficiente de Determinación): {r2:.4f}")

df

charges     1.000000
smoker      0.665506
age         0.527834
children    0.161336
bmi         0.132669
sex         0.005632
region     -0.042690
Name: charges, dtype: float64
Coeficientes del modelo:
 age         0.481793
sex        -0.037004
bmi         0.078217
children    0.111682
smoker      0.625120
region     -0.048985
dtype: float64
Intercepto: 9.107247086300768
   Variable       VIF
0       age  1.015394
1       sex  1.008889
2       bmi  1.040608
3  children  1.002482
4    smoker  1.006466
5    region  1.025966
MAE (Error Absoluto Medio): 0.2708
MSE (Error Cuadrático Medio): 0.1776
RMSE (Raíz del Error Cuadrático Medio): 0.4214
R² (Coeficiente de Determinación): 0.8025


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,charges_log
0,19,0,27.900,0,1,3,16884.92400,9.734176
1,18,1,33.770,1,0,2,1725.55230,7.453302
2,28,1,33.000,3,0,2,4449.46200,8.400538
3,33,1,22.705,0,0,1,21984.47061,9.998092
4,32,1,28.880,0,0,1,3866.85520,8.260197
...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830,9.268661
1334,18,0,31.920,0,0,0,2205.98080,7.698927
1335,18,0,36.850,0,0,2,1629.83350,7.396233
1336,21,0,25.800,0,0,3,2007.94500,7.604867


## 5. Regresión logística binaria para predecir compras

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

df = pd.read_csv("../csvs/ejercicio5.csv")
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])
X = df.drop(columns=["Purchased", "User ID"])
y = df['Purchased']

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

df


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0
...,...,...,...,...,...
395,15691863,0,46,41000,1
396,15706071,1,51,23000,1
397,15654296,0,50,20000,1
398,15755018,1,36,33000,0
