In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

: 

In [None]:
# Load the raw bike-buyers CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw/bike_buyers.csv')

# Cell output: (n_rows, n_columns) of the loaded frame.
df.shape

In [None]:
# Preview the first 10 rows of the raw frame.
df.head(10)

In [None]:
# Print pandas' concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Summary statistics of the frame with the "ID" column removed.
df.drop(columns="ID").describe()

In [None]:
# Frequency table (absolute, relative, percentage) for every object-dtype column.
cat_cols = df.select_dtypes(include="object")

# Iterating a DataFrame yields its column labels.
for col in cat_cols:
    df_var = df[col].value_counts().to_frame("freq_abs")

    # Shares are computed over all rows of df, so rows where this column is
    # missing still count in the denominator.
    df_var["freq_rel"] = df_var["freq_abs"] / len(df)
    df_var["freq_rel_%"] = df_var["freq_rel"] * 100

    display(df_var)

In [None]:
# Box plot of every int64/float64 column except "ID", laid out on a 2-column grid.
num_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .drop(columns="ID")
    .columns
    .tolist()
)

n = len(num_cols)
ncols = 2
nrows = -(-n // ncols)  # ceiling division: enough rows to hold all n plots

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(4 * ncols, 4 * nrows))

# Normalise to a flat sequence of Axes so it can be zipped with the column names.
axes = [axes] if not isinstance(axes, (list, np.ndarray)) else axes.flatten()

for col, ax in zip(num_cols, axes):
    df[col].plot.box(ax=ax)
    ax.set_title(f"Distribución de la variable {col}")

# Hide the grid cells that did not receive a plot.
for ax in axes[n:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:

# Stacked bar chart of each object-dtype column, split by the target variable.
target_var = "Purchased Bike"
cat_cols = df.select_dtypes(include=['object']).drop(columns=target_var).columns.tolist()

# Size the grid: 2 columns and as many rows as needed for all categorical columns.
n = len(cat_cols)
ncols = 2
nrows = -(-n // ncols)  # ceiling division
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 4 * nrows))

# Normalise to a flat sequence of Axes so it can be zipped with the column names.
axes = [axes] if not isinstance(axes, (list, np.ndarray)) else axes.flatten()

for col, ax in zip(cat_cols, axes):
    # Rows: categories of `col`; columns: values of the target; cells: counts.
    crosstab_data = pd.crosstab(index=df[col], columns=df[target_var])
    crosstab_data.plot.bar(stacked=True, ax=ax)
    ax.set(
        title=f'Distribucion de {col} según {target_var}',
        xlabel=col,
        ylabel='Cantidad',
    )
    ax.legend(title=target_var)

# Hide the grid cells that did not receive a plot.
for ax in axes[n:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Count the missing values in each column of the raw frame.
df.isna().sum(axis=0)

In [None]:
# Work on a copy so the raw frame `df` is not modified by the imputation.
df_clean = df.copy()

# Column groups, split by dtype.
# NOTE(review): no column is excluded here, so the numeric "ID" column and the
# object-dtype "Purchased Bike" target also go through the imputers — confirm
# that this is intended.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include="object").columns

# Numeric gaps are filled with the column median, categorical gaps with the
# most frequent value of the column.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    df_clean[cols] = imputer.fit_transform(df_clean[cols])

# Report the remaining missing values per column after imputation.
print("\nValores faltantes luego de la imputación", df_clean.isna().sum(), sep="\n")

In [None]:
# IQR-based outlier report for every numeric column of df_clean except "ID".
num_cols = df_clean.select_dtypes(include=['int64', 'float64']).drop(columns="ID").columns.tolist()

rule = "=" * 50

print(rule)
print("ANÁLISIS DE OUTLIERS (Método IQR)")
print(rule)

for col in num_cols:
    values = df_clean[col]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Fences: values further than 1.5 * IQR below Q1 or above Q3 count as outliers.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers = values[is_outlier]

    print(f"\n{rule}")
    print(f"Columna: {col}")
    print(f"{rule}")
    print(f"Q1 (25%): {Q1}")
    print(f"Q3 (75%): {Q3}")
    print(f"IQR: {IQR}")
    print(f"Límite inferior: {lower_bound}")
    print(f"Límite superior: {upper_bound}")
    print(f"Cantidad de outliers: {len(outliers)} ({len(outliers)/len(df_clean)*100:.2f}%)")

print(rule)

In [None]:
# Winsorize df_clean in place: clip each numeric column (all int64/float64
# columns except "ID" and "Cars") to its IQR fences.
# Note: each column is overwritten, so re-running this cell computes the fences
# from the already-clipped values.
def _count_outside(series, low, high):
    """Return how many values of `series` are strictly below `low` or above `high`."""
    return ((series < low) | (series > high)).sum()


num_cols = df_clean.select_dtypes(include=['int64', 'float64']).drop(columns=["ID", "Cars"]).columns.tolist()

print("Tratamiento de outliers con Winsorización (método IQR):")
print("="*60)

for col in num_cols:
    Q1, Q3 = df_clean[col].quantile(0.25), df_clean[col].quantile(0.75)
    IQR = Q3 - Q1

    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    outliers_before = _count_outside(df_clean[col], lower, upper)

    # Values beyond the fences are replaced by the nearest fence value.
    df_clean[col] = np.clip(df_clean[col], lower, upper)

    # Counted again after clipping as a sanity check.
    outliers_after = _count_outside(df_clean[col], lower, upper)

    print(f"{col}:")
    print(f"  Outliers antes: {outliers_before}")
    print(f"  Outliers después: {outliers_after}")
    print(f"  Límites: [{lower:.2f}, {upper:.2f}]")
    print()

print("="*60)

## Construcción de datos

In [None]:
# Bucket buyers into income tiers. Intervals are left-closed (right=False):
# [0, 30000) -> Low, [30000, 80000) -> Medium, [80000, inf) -> High.
# NOTE(review): this reads the raw `df`, not the imputed/winsorized `df_clean` —
# confirm that this is intended.
bins = [0, 30000, 80000, np.inf]
labels = ['Low', 'Medium', 'High']
df['Income_Group'] = pd.cut(
    x=df['Income'],
    bins=bins,
    labels=labels,
    right=False,
)

In [None]:
# Dependency ratio: number of children divided by (number of cars + 1).
# The +1 keeps the denominator non-zero when Cars is 0.
cars_plus_one = df['Cars'] + 1
df['Dependency_Ratio'] = df['Children'] / cars_plus_one

In [None]:
# Bin ages into groups. Intervals are left-closed (right=False):
# [0, 35) -> Young Adult, [35, 55) -> Middle-Aged, [55, inf) -> Senior.
bins_age = [0, 35, 55, np.inf]
labels_age = ['Young Adult', 'Middle-Aged', 'Senior']
df['Age_Group'] = pd.cut(
    x=df['Age'],
    bins=bins_age,
    labels=labels_age,
    right=False,
)

In [None]:
# Replace each commute-distance bucket with a single numeric value in miles.
# Closed ranges use their midpoint; the open-ended '10+ Miles' bucket is assigned 12.0.
# Labels that are not keys of this dict end up as NaN in the new column.
commute_map = {
    '0-1 Miles': 0.5,
    '1-2 Miles': 1.5,
    '2-5 Miles': 3.5,
    '5-10 Miles': 7.5,
    '10+ Miles': 12.0,
}
commute_midpoints = df['Commute Distance'].map(commute_map)
df['Commute_Midpoint_Miles'] = commute_midpoints

In [None]:
# Binarize the two Yes/No columns as 1/0, overwriting them in `df`.
#
# The mapping is applied only while a column is still non-numeric. Without this
# guard, re-running the cell would push the already-encoded 1/0 values through
# the dict again; they are not keys of it, so every value would become NaN.
yes_no_map = {'Yes': 1, 'No': 0}
for binary_col in ['Purchased Bike', 'Home Owner']:
    if not pd.api.types.is_numeric_dtype(df[binary_col]):
        df[binary_col] = df[binary_col].map(yes_no_map)

In [None]:
# Encode the education level as an ordinal integer (1 = lowest, 6 = highest)
# so it can be used as a numeric feature when training models later.
education_mapping = {
    'Partial High School': 1,
    'High School': 2,
    'Partial College': 3,
    'Bachelors': 4,
    'Graduate': 5,
    # NOTE(review): this dataset may label this level 'Graduate Degree' rather
    # than 'Graduate'; the alias keeps such rows from being mapped to NaN.
    # Confirm against df['Education'].unique().
    'Graduate Degree': 5,
    'Post-Graduate': 6
}

# Stored in a new column; the original 'Education' text column is kept for now.
df['Education_Rank'] = df['Education'].map(education_mapping)

# Series.map yields NaN for any label that is not a key of the dict, so report
# such labels instead of letting them silently become missing ranks.
unmapped_levels = df.loc[
    df['Education'].notna() & df['Education_Rank'].isna(), 'Education'
].unique()
if len(unmapped_levels) > 0:
    print(f"Niveles de educación sin mapear (Education_Rank = NaN): {list(unmapped_levels)}")

# Quick check of the encoding.
print(df[['Education', 'Education_Rank']].head())