Paso 4: Análisis de correlación
Revisa la correlación entre variables numéricas y con la variable objetivo (charges).

Pendiente: sacar todos los datasets.


Paso 5: Preparación de datos para regresión
Codifica variables categóricas (one-hot encoding).
Separa variables predictoras (X) y variable objetivo (y).
Divide en conjunto de entrenamiento y prueba.
python
Copy Code
from sklearn.model_selection import train_test_split

# One-hot encode the categorical variables; drop_first=True omits the first
# level of each variable so the dummy columns are not perfectly collinear.
data_encoded = pd.get_dummies(total_data, drop_first=True)

# Predictors (X): every encoded column except the target 'charges'.
X = data_encoded.drop('charges', axis=1)
# Target (y): the 'charges' column.
y = data_encoded['charges']

# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Paso 6: Entrenamiento y optimización de regresión lineal
Usa GridSearchCV para optimizar hiperparámetros. La regresión lineal simple apenas tiene hiperparámetros que ajustar, así que puedes probar regularización con Ridge o Lasso y optimizar su parámetro `alpha`.

python
Copy Code
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Ridge regression: linear regression whose penalty strength is set by `alpha`.
ridge = Ridge()

# Candidate regularization strengths, spanning several orders of magnitude.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search. scikit-learn scorers are maximized, which is
# why the MSE criterion is expressed in its negated form.
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Mejor alpha:", grid_search.best_params_)

# Estimator refitted on the whole training set with the best alpha found.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# RMSE is computed as sqrt(MSE). The former `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly works on both old and current releases.
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)
print("R2:", r2_score(y_test, y_pred))

Paso 7: Interpretación y conclusiones
Analiza los coeficientes del modelo para entender la influencia de cada variable.
Evalúa el desempeño con métricas (RMSE, R2).
Considera posibles mejoras (transformaciones, variables nuevas, modelos más complejos). 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import math

In [2]:
# Load the medical insurance cost CSV (hosted on GitHub) into a DataFrame.
url = "https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv"
total_data = pd.read_csv(url)


def process_dataframes_shapes(total_data, charges):
    """Build every preprocessed variant of the dataset and report their shapes.

    Starting from the raw data, the following variants are derived: outliers
    capped (winsorized), categorical columns factorized, and Standard / MinMax
    scaled versions of the factorized data (with and without outlier capping).
    The target column is excluded from both outlier capping and scaling.

    Args:
        total_data: pandas DataFrame holding the raw dataset. It is not
            modified; all work is done on copies.
        charges: Name of the target column (for this dataset, ``'charges'``).

    Returns:
        dict mapping each variant name to its ``(n_rows, n_columns)`` shape.

    Raises:
        KeyError: If ``charges`` is not a column of ``total_data``.
    """
    if charges not in total_data.columns:
        raise KeyError(f"target column {charges!r} not found in total_data")

    numeric_dtypes = ['float64', 'int64']
    categorical_dtypes = ['object', 'category']

    # 1. total_data: original dataset (copied so the caller's frame is untouched)
    total_data = total_data.copy()

    def feature_columns(data):
        """Return the float64/int64 columns of *data*, leaving out the target."""
        numeric = data.select_dtypes(include=numeric_dtypes).columns
        # errors='ignore': the target may not be numeric, in which case it is
        # simply not among the selected columns.
        return numeric.drop(charges, errors='ignore')

    def adjust_outliers(data):
        """Cap each numeric feature to [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
        capped = data.copy()
        for col in feature_columns(capped):
            q1 = capped[col].quantile(0.25)
            q3 = capped[col].quantile(0.75)
            iqr = q3 - q1
            capped[col] = capped[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
        return capped

    def factorize(data):
        """Replace every object/category column by its integer category codes."""
        encoded = data.copy()
        for col in encoded.select_dtypes(include=categorical_dtypes).columns:
            encoded[col] = encoded[col].astype('category').cat.codes
        return encoded

    def scale_data(data, scaler):
        """Fit *scaler* on the numeric features of *data* and return a scaled copy."""
        scaled = data.copy()
        cols = feature_columns(scaled)
        # NOTE(review): category codes usually get a small integer dtype (e.g.
        # int8), which falls outside the float64/int64 selection, so factorized
        # columns are presumably left unscaled -- confirm that this is intended.
        if len(cols):  # scalers reject an empty feature matrix
            scaled[cols] = scaler.fit_transform(scaled[cols])
        return scaled

    # 2. Outliers capped in the numeric features (winsorizing)
    total_data_no_outliers = adjust_outliers(total_data)

    # 3-4. Categorical columns factorized, with and without outlier capping
    total_data_factorized = factorize(total_data)
    total_data_no_outliers_factorized = factorize(total_data_no_outliers)

    # 5-6. Standard scaling of the factorized variants
    total_data_standard = scale_data(total_data_factorized, StandardScaler())
    total_data_no_outliers_standard = scale_data(total_data_no_outliers_factorized, StandardScaler())

    # 9-10. MinMax scaling of the factorized variants
    total_data_minmax = scale_data(total_data_factorized, MinMaxScaler())
    total_data_no_outliers_minmax = scale_data(total_data_no_outliers_factorized, MinMaxScaler())

    # 7-8 and 11-12: the "*_factorized_*" names are aliases of the scaled
    # variants above, because scaling is always applied to factorized data.
    variants = {
        'total_data': total_data,
        'total_data_no_outliers': total_data_no_outliers,
        'total_data_factorized': total_data_factorized,
        'total_data_no_outliers_factorized': total_data_no_outliers_factorized,
        'total_data_standard': total_data_standard,
        'total_data_no_outliers_standard': total_data_no_outliers_standard,
        'total_data_factorized_standard': total_data_standard,
        'total_data_no_outliers_factorized_standard': total_data_no_outliers_standard,
        'total_data_minmax': total_data_minmax,
        'total_data_no_outliers_minmax': total_data_no_outliers_minmax,
        'total_data_factorized_minmax': total_data_minmax,
        'total_data_no_outliers_factorized_minmax': total_data_no_outliers_minmax,
    }
    return {name: frame.shape for name, frame in variants.items()}


In [3]:
def plot_numerical_data(dataframe):
    """Show a histogram-over-boxplot figure for each numeric column.

    One figure is drawn per float64/int64 column of ``dataframe``: a histogram
    with KDE on top and a box plot underneath. Both panels carry dashed
    vertical lines at the mean (red), the median (orange) and the mean plus /
    minus one standard deviation (green). Each figure is titled with the
    column name and displayed right away.

    Args:
        dataframe: pandas DataFrame whose numeric columns are plotted.
    """
    dashed = {'linestyle': 'dashed', 'linewidth': 1}

    for column in dataframe.select_dtypes(include=['float64', 'int64']).columns:
        # Tall panel for the histogram, thin strip for the box plot.
        fig, (hist_ax, box_ax) = plt.subplots(
            2, 1, figsize=(8, 4), gridspec_kw={'height_ratios': [6, 1]}
        )

        # Summary statistics used for the reference lines
        values = dataframe[column]
        center = np.mean(values)
        middle = np.median(values)
        spread = np.std(values)
        upper_band = center + spread
        lower_band = center - spread

        # Histogram panel: this is the only panel that gets a legend.
        hist_plot = sns.histplot(ax=hist_ax, data=dataframe, kde=True, x=column)
        hist_plot.set(xlabel=None)
        hist_ax.axvline(center, color='red', label='Mean', **dashed)
        hist_ax.axvline(middle, color='orange', label='Median', **dashed)
        hist_ax.axvline(upper_band, color='green', label='Standard Deviation', **dashed)
        hist_ax.axvline(lower_band, color='green', **dashed)

        # Box plot panel with the same reference lines
        box_plot = sns.boxplot(ax=box_ax, data=dataframe, x=column, width=0.6)
        box_plot.set(xlabel=None)
        box_ax.axvline(center, color='red', label='Mean', **dashed)
        box_ax.axvline(middle, color='orange', label='Median', **dashed)
        box_ax.axvline(upper_band, color='green', **dashed)
        box_ax.axvline(lower_band, color='green', **dashed)

        hist_ax.legend()
        fig.suptitle(column)

        # Tidy the spacing, then render the figure.
        plt.tight_layout()
        plt.show()

