# Notebook Titanic con datos de Kaggle

## Celda 1 — imports y ajustes

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): with no category/module arguments this filter hides every
# warning for the rest of the session (including deprecation notices from
# pandas/seaborn). Consider narrowing it while developing.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer

## Celda 2 — cargar el CSV de Kaggle (normalizamos los nombres de columnas)

In [None]:
# Read the raw training file; `train` keeps every column found in the CSV.
train = pd.read_csv("data/train.csv")

# Original CSV header -> lower-case name used in the rest of the notebook.
cols_map = dict(
    Survived="survived",
    Pclass="pclass",
    Sex="sex",
    Age="age",
    SibSp="sibsp",
    Parch="parch",
    Fare="fare",
    Embarked="embarked",
)

# Keep only the mapped columns that are present in the file (mapping order),
# then rename them in one step.
existing = [name for name in cols_map if name in train.columns]
df = train.loc[:, existing].rename(columns=cols_map)

# Recommended dtypes, cast only for the columns that made it into `df`.
df = df.astype(
    {
        col: dtype
        for col, dtype in (
            ("survived", int),
            ("sex", "category"),
            ("embarked", "category"),
        )
        if col in df.columns
    }
)

# Quick look at the result: first rows, dimensions and dtypes.
display(df.head())
print("Shape:", df.shape)
print(df.dtypes)


## Celda 3 — inspección inicial

In [None]:
# Missing values per column.
print("Nulos por columna:")
display(df.isna().sum())

# Summary statistics of the numeric columns.
print("\nEstadísticas (numéricas):")
display(df.describe())

# Summary statistics of the text/categorical columns, when there are any.
cat_cols = df.select_dtypes(include=["object", "category"]).columns
if cat_cols.empty:
    print("\n(No hay columnas 'object'/'category' para describir)")
else:
    print("\nEstadísticas (categóricas):")
    display(df[cat_cols].describe())

# Number of distinct values per categorical column (a dash when none exist).
print("\nCardinalidad categóricas:")
if cat_cols.empty:
    display("—")
else:
    display(df[cat_cols].nunique())


## Celda 4 — limpieza (duplicados, imputación, variable derivada `family_size`)

In [None]:
# Cleaning: remove duplicated records, impute missing values, derive family_size.

# `df` is only a projection of `train` (the columns listed in `cols_map`), so
# calling df.drop_duplicates() would also delete *distinct* raw rows that
# merely share the same values in the kept columns (for the Kaggle file the
# dropped columns presumably include identifiers such as PassengerId/Name).
# Duplicates are therefore detected on the complete raw rows. `df` was built
# from `train` without resetting the index, so the mask lines up row by row.
before = len(df)
is_dup = train.loc[df.index].duplicated().to_numpy()
# .copy() makes `df` an independent frame so the column assignments below
# never target a slice of another frame.
df = df.loc[~is_dup].copy()
print("Duplicados eliminados:", before - len(df))

# Safe imputation: fit_transform returns a 2-D array, ravel() flattens it to
# the 1-D shape a column assignment expects.
if "age" in df.columns:
    df["age"] = SimpleImputer(strategy="median").fit_transform(df[["age"]]).ravel()

if "embarked" in df.columns:
    # Cast to object for the imputer, then restore the category dtype.
    df["embarked"] = df["embarked"].astype(object)
    df["embarked"] = SimpleImputer(strategy="most_frequent").fit_transform(df[["embarked"]]).ravel()
    df["embarked"] = df["embarked"].astype("category")

if {"sibsp", "parch"}.issubset(df.columns):
    # A missing count is treated as "no relatives aboard".
    df[["sibsp", "parch"]] = df[["sibsp", "parch"]].fillna(0)
    # Siblings/spouses + parents/children + the passenger.
    df["family_size"] = df["sibsp"] + df["parch"] + 1

# Confirm that the imputed columns no longer contain nulls.
cols_verif = [c for c in ["age", "embarked"] if c in df.columns]
if cols_verif:
    print("\nNulos tras imputación:")
    display(df[cols_verif].isna().sum())


## Celda 5 — univariado

In [None]:
# Univariate views. The loading cell keeps only the mapped columns that exist
# in the CSV, so each plot is skipped (instead of raising KeyError) when its
# column is absent.
if "age" in df.columns:
    df["age"].dropna().plot(kind="hist", bins=30)
    plt.title("Distribución de edad")
    plt.xlabel("Edad")
    plt.ylabel("Frecuencia")
    plt.show()

if "survived" in df.columns:
    sns.countplot(x="survived", data=df)
    plt.title("Conteo: sobrevivió (0/1)")
    plt.show()


## Celda 6 — bivariado

In [None]:
# Bivariate views against the target. Every plot needs `survived` as well as
# its own column, so both are checked before plotting.
if {"fare", "survived"}.issubset(df.columns):
    sns.boxplot(x="survived", y="fare", data=df)
    plt.title("Fare por sobrevivencia")
    plt.show()

if {"sex", "survived"}.issubset(df.columns):
    sns.countplot(x="sex", hue="survived", data=df)
    plt.title("Sobrevivencia por sexo")
    plt.show()


## Celda 7 — correlación y outliers

In [None]:
# Pairwise correlations (pandas' default method) between the numeric columns,
# drawn as an annotated heatmap.
num_df = df.select_dtypes(include="number")
corr = num_df.corr(numeric_only=True)

sns.heatmap(data=corr, annot=True, fmt=".2f", linewidths=0.5)
plt.title("Matriz de correlación")
plt.show()

def outlier_mask_iqr(series, k=1.5):
    """Flag the values that fall outside the IQR fences.

    The fences are ``Q1 - k*IQR`` and ``Q3 + k*IQR``, with the quartiles
    computed on the non-missing values only.

    Parameters
    ----------
    series : pandas.Series or 1-D array-like of numbers
        Data to inspect. Anything that is not already a Series is wrapped
        in one.
    k : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    mask : pandas.Series of bool
        True where the value is strictly below the lower fence or strictly
        above the upper fence. Missing values are never flagged.
    (low, high) : tuple of float
        The two fences. Both are NaN when there is no non-missing data, in
        which case the mask is all False.
    """
    values = series if isinstance(series, pd.Series) else pd.Series(series)
    # Series.quantile ignores NaN and yields NaN (rather than failing) for
    # empty or all-NaN input, so no explicit dropna()/emptiness check is needed.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    low, high = q1 - k * iqr, q3 + k * iqr
    # Comparisons against NaN are False, so NaN values/fences flag nothing.
    return (values < low) | (values > high), (low, high)

# Report the IQR outliers of each numeric column of interest and draw its boxplot.
for col in ["age", "fare"]:
    if col in df.columns:
        mask, (lo, hi) = outlier_mask_iqr(df[col])
        print(f"Outliers en {col}: {int(mask.sum())} (límites: {lo:.2f}, {hi:.2f})")
        sns.boxplot(x=df[col])
        plt.title(f"Boxplot {col} (IQR)")
        plt.show()

# The hypothesis tests (t-test on fare, chi-square on sex vs. survived) are
# run once, in the dedicated "tests de hipótesis" cell, and are not repeated
# in this cell.


## Celda 8 — tests de hipótesis

In [None]:
# Two-sample t-test without the equal-variance assumption: fare of survivors
# (g1) versus non-survivors (g0).
if {"fare", "survived"} <= set(df.columns):
    g1 = df.loc[df["survived"].eq(1), "fare"].dropna()
    g0 = df.loc[df["survived"].eq(0), "fare"].dropna()
    t_stat, p_val = stats.ttest_ind(g1, g0, equal_var=False)
    print(f"T-test fare | survived: t={t_stat:.3f}, p={p_val:.3e}")

# Chi-square test of independence on the sex x survived contingency table.
if {"sex", "survived"} <= set(df.columns):
    cont = pd.crosstab(df["sex"], df["survived"])
    chi2, p, dof, _ = stats.chi2_contingency(cont)
    print("Chi-cuadrado sex ~ survived:")
    display(cont)
    print(f"chi2={chi2:.3f}, dof={dof}, p={p:.3e}")



## Celda 9 — exportar CSVs limpios

In [None]:
# Persist the cleaned frame as-is.
df.to_csv("titanic_clean.csv", index=False)

# One-hot encode the categorical columns (dropping the first level of each)
# and persist that version as well.
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.to_csv("titanic_encoded.csv", index=False)

print("Guardados: titanic_clean.csv, titanic_encoded.csv")
