## Preprocesado y limpieza de datos

**Objetivo:** Preparar el dataset para el modelado: tratar los datos faltantes, eliminar duplicados y aplicar las transformaciones necesarias (codificación y estandarización).

In [1]:
import pandas as pd
# Load the raw dataset from the CSV file into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [2]:
import numpy as np
# Fix the global NumPy seed so the simulated gaps are reproducible.
np.random.seed(42)
# Simulate missing data: draw one uniform number per row and flag the rows
# whose draw falls below 0.05 (roughly 5% of the rows).
mask = np.random.rand(df.shape[0]) < 0.05
# Blank out 'MonthlyCharges' on the flagged rows (Series.mask fills with NaN).
df['MonthlyCharges'] = df['MonthlyCharges'].mask(mask)

In [3]:
# Impute the numeric column with its median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['MonthlyCharges']: that chained in-place call
# operates on an intermediate Series, is deprecated in pandas 2.x, and leaves
# df unchanged once Copy-on-Write is active.
df['MonthlyCharges'] = df['MonthlyCharges'].fillna(df['MonthlyCharges'].median())
# Impute the categorical column with its most frequent value (first mode).
df['Contract'] = df['Contract'].fillna(df['Contract'].mode()[0])

In [4]:
# Remove rows that are exact duplicates of an earlier row (the first
# occurrence is kept) and report how many rows remain.
# NOTE(review): 'customerID' has not been dropped at this point; if it is
# unique per row, no row can be an exact duplicate and this step is a no-op
# -- confirm whether duplicates should be checked without that column.
df = df.drop_duplicates()
print(f"Instancias tras eliminar duplicados: {len(df)}")

Instancias tras eliminar duplicados: 7043


In [5]:
from sklearn.preprocessing import LabelEncoder
# Encode the target column 'Churn' as integer class labels; the fitted
# encoder is kept in `le` so the mapping can be inspected or inverted later.
le = LabelEncoder()
df = df.assign(Churn=le.fit_transform(df['Churn']))
# One-hot encode the listed categorical columns, dropping the first level of
# each one (drop_first=True).
df = pd.get_dummies(
    df,
    columns=['Contract', 'PaymentMethod'],
    drop_first=True,
)

In [6]:
from sklearn.preprocessing import StandardScaler
# Standardize 'MonthlyCharges' and 'tenure' with a StandardScaler fitted on
# the current contents of df; the fitted scaler stays available in `scaler`.
scaler = StandardScaler()
to_scale = df[['MonthlyCharges', 'tenure']]
df[list(to_scale.columns)] = scaler.fit(to_scale).transform(to_scale)

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the original dataset. This re-reads the raw CSV, so any in-memory
# changes previously made to df are discarded.
df = pd.read_csv('dataset.csv')

# Drop the customerID column.
df = df.drop(columns='customerID')

# Convert TotalCharges to numeric; values that cannot be parsed become NaN
# (errors='coerce') and are then imputed with the column median.
# The imputed Series is assigned back instead of calling
# fillna(inplace=True) on df['TotalCharges']: that chained in-place call
# operates on an intermediate Series, is deprecated in pandas 2.x, and leaves
# df unchanged once Copy-on-Write is active.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Encode the target variable (Churn) as integer labels. This runs before the
# object-dtype selection below, so Churn is not one-hot encoded.
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])

# One-hot encode every remaining object-dtype column, dropping the first
# level of each one.
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Standardize the numeric variables.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Save the preprocessed dataset (without the index column).
df.to_csv('dataset_preprocesado.csv', index=False)