# Trabajo Final - David Zapata

## 02- preprocesado

In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [20]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

df_sim = df.copy()
total_cells = df_sim.shape[0] * df_sim.shape[1]
target_missing = int(total_cells * 0.05)
columns_for_missing = [col for col in df_sim.columns if col not in ['customerID', 'Churn']]

np.random.seed(42)
inserted = 0
while inserted < target_missing:
    rand_row = np.random.randint(0, df_sim.shape[0])
    rand_col = np.random.choice(columns_for_missing)
    if pd.notnull(df_sim.at[rand_row, rand_col]):
        df_sim.at[rand_row, rand_col] = np.nan
        inserted += 1

df_sim.drop(columns=['customerID'], inplace=True)

cat_cols = df_sim.select_dtypes(include='object').columns.tolist()
cat_cols.remove('Churn') 
num_cols = df_sim.select_dtypes(include=['int64', 'float64']).columns.tolist()

missing_before = df_sim.isnull().sum().sum()
print(f" Valores faltantes antes del preprocesamiento: {missing_before}")

num_imputer = SimpleImputer(strategy='mean')
df_sim[num_cols] = num_imputer.fit_transform(df_sim[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
df_sim[cat_cols] = cat_imputer.fit_transform(df_sim[cat_cols])

missing_after = df_sim.isnull().sum().sum()
print(f" Valores faltantes después del preprocesamiento: {missing_after}")

df_encoded = pd.get_dummies(df_sim, columns=cat_cols)

df_encoded['Churn'] = df_encoded['Churn'].map({'Yes': 1, 'No': 0})

scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

display(df_encoded.head())

df_encoded.to_csv("dataset_preprocesado.csv", index=False)
print(" Dataset preprocesado guardado como 'dataset_preprocesado.csv'")

 Valores faltantes antes del preprocesamiento: 7395
 Valores faltantes después del preprocesamiento: 0


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,-0.452484,-1.31822,-1.200621,0,True,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,-0.452484,0.061711,-0.272205,0,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,-0.452484,-1.276404,0.0,1,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,-0.452484,0.521688,-0.774097,0,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,-0.452484,-1.276404,0.198855,1,True,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


 Dataset preprocesado guardado como 'dataset_preprocesado.csv'
