In [11]:
import os

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

In [12]:
def create_model(data_path="../data/WA_Fn-UseC_-Telco-Customer-Churn.csv"):
    """Load the churn data and build the (unfitted) preprocessor and model search.

    Nothing is fitted here: the function only reads the CSV, cleans it, and
    constructs the scikit-learn objects that the caller is expected to fit.
    No resampling is applied in this function.

    Args:
        data_path: Path of the CSV file to read. Defaults to the Telco
            customer-churn file this notebook was written for.

    Returns:
        tuple: ``(preprocessor, model, X, y)`` where

        * ``preprocessor`` is an unfitted ``ColumnTransformer`` that min-max
          scales the int64/float64 columns and one-hot encodes the object
          columns of ``X``;
        * ``model`` is an unfitted ``GridSearchCV`` over a
          ``RandomForestClassifier`` (5-fold CV, balanced accuracy);
        * ``X`` is the feature frame (every column except ``Churn`` and,
          when present, ``customerID``);
        * ``y`` is the ``Churn`` column.

    Raises:
        KeyError: If the file has no ``TotalCharges`` or ``Churn`` column.
    """
    df = pd.read_csv(data_path)

    # Entries of TotalCharges that cannot be parsed as numbers become NaN and
    # are removed, together with any other incomplete row, by dropna().
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df = df.dropna()

    # customerID looks like a per-row identifier (assumed unique per customer).
    # Left in X it would be selected as an object column and one-hot encoded
    # into roughly one column per row, adding width but no signal. Dropping it
    # is tolerant of files that do not contain the column.
    X = df.drop(columns=['Churn']).drop(columns=['customerID'], errors='ignore')
    y = df['Churn']

    numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

    numeric_transformer = MinMaxScaler()
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    # instead of raising at transform time.
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # 3 * 3 * 3 * 3 = 81 candidate settings, each evaluated with 5-fold CV.
    param_grid = {
        'n_estimators': [10, 50, 100],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')
    model = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='balanced_accuracy')

    return preprocessor, model, X, y

In [13]:
def create_pipeline(model_path="../models/rf_pipeline.pkl"):
    """Build, fit and persist the preprocessing + SMOTE + classifier pipeline.

    The pipeline is fitted on a stratified 80% split of the data returned by
    ``create_model()`` and then written to ``model_path`` with joblib.

    Args:
        model_path: Destination of the serialized pipeline. Missing parent
            directories are created. Defaults to the location this notebook
            has always written to.

    Returns:
        The fitted imblearn ``Pipeline`` with steps ``preprocessor``,
        ``smote`` and ``classifier`` (the fitted ``GridSearchCV``).
    """
    preprocessor, model, X, y = create_model()

    # imblearn's Pipeline (not sklearn's) is what allows a sampler such as
    # SMOTE to be used as an intermediate step.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
        ('classifier', model)
    ])

    # NOTE(review): SMOTE runs before the GridSearchCV step, so the search's
    # internal CV folds are drawn from already-resampled data and its scores
    # are likely optimistic. Searching over the whole pipeline instead would
    # avoid that, but changes the layout of the saved object - left as is.
    #
    # The held-out 20% is not used in this function; the split is kept so the
    # model is trained on the same subset as before.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    pipeline.fit(X_train, y_train)

    # joblib.dump does not create directories; make sure the target exists so
    # a long training run is not lost to a FileNotFoundError at the very end.
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
    joblib.dump(pipeline, model_path)
    print(f"Modelo guardado en '{model_path}'")

    return pipeline


In [14]:
# Fit the full pipeline (this runs the whole grid search) and save it to disk.
pipeline = create_pipeline()

Modelo guardado en '../models/rf_pipeline.pkl'
