## Deploy

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import joblib

In [2]:
# Read the original churn dataset from disk.
df = pd.read_csv('dataset-churn.csv')

# Initial preprocessing: coerce TotalCharges to numeric; entries that cannot
# be parsed become NaN (errors='coerce') and are then filled with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Collapse the "no service" variants into a plain "No" across the whole frame,
# then remove the customerID column.
values_to_replace = ["No internet service", "No phone service"]
df = df.replace(values_to_replace, "No").drop(columns='customerID')

# Column groups used by the preprocessing steps below.

# Numerical columns that get scaled.
final_numeric_features = [
    'tenure',
    'MonthlyCharges',
]

# Columns mapped to 0/1 through the binary mapping below.
# Churn (the target) and SeniorCitizen are intentionally not listed here.
label_encode_features = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
]

# Columns expanded with one-hot encoding.
one_hot_encode_features = [
    'InternetService',
    'Contract',
    'PaymentMethod',
]

# Value -> integer mapping applied to the label-encoded columns.
mapeamento_binario = {
    'Yes': 1,
    'No': 0,
    'Male': 1,
    'Female': 0,
}

# Collect, per categorical feature, the list of selectable values.
# Label-encoded features: 'gender' offers Male/Female, every other one No/Yes.
categorical_unique_values = {
    feature: (['Male', 'Female'] if feature == 'gender' else ['No', 'Yes'])
    for feature in label_encode_features
}

# SeniorCitizen is treated as 0/1, but the UI should present 'No'/'Yes'.
categorical_unique_values['SeniorCitizen'] = ['No', 'Yes']

# One-hot encoded features: use the distinct values found in the dataset.
categorical_unique_values.update(
    (feature, df[feature].unique().tolist())
    for feature in one_hot_encode_features
)

# Build the feature frame that is fed to StandardScaler and whose final
# column layout is later captured as 'model_columns'.
# Start from a copy of df without the Churn target column.
X_simulated_for_processing = df.copy().drop(columns=['Churn'])

# Label encoding: map each listed column through the binary mapping.
binary_encoded = {
    feature: X_simulated_for_processing[feature].map(mapeamento_binario)
    for feature in label_encode_features
}
X_simulated_for_processing = X_simulated_for_processing.assign(**binary_encoded)

# One-hot encoding (first level of each feature dropped, integer dummies),
# followed by removal of TotalCharges.
X_simulated_for_processing = pd.get_dummies(
    X_simulated_for_processing,
    columns=one_hot_encode_features,
    drop_first=True,
    dtype=int,
).drop(columns=['TotalCharges'])

# Fit the scaler on the numeric features and write the scaled values back.
# Only final_numeric_features are scaled because TotalCharges is already gone.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(X_simulated_for_processing[final_numeric_features])
X_simulated_for_processing[final_numeric_features] = scaled_numeric

# Train the final model with the best hyperparameters.
# The hyperparameters are gathered in one mapping and unpacked into the
# classifier constructor.
lgbm_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'random_state': 42,
    'scale_pos_weight': 2.7686,
    'learning_rate': 0.05,
    'n_estimators': 100,
    'num_leaves': 20,
    'reg_alpha': 0.5,
    'reg_lambda': 0.5,
}
best_lgbm_model = LGBMClassifier(**lgbm_params)

# Target: Churn passed through the binary mapping.
# Features: a copy of the processed frame.
y = df['Churn'].map(mapeamento_binario)
X = X_simulated_for_processing.copy()

# Fit on the full processed dataset.
best_lgbm_model.fit(X, y)
print("LightGBM model trained with optimized hyperparameters.")

# Persist every component with joblib.
# Derived lists are computed first so the save table below is self-contained.
model_columns = X_simulated_for_processing.columns.tolist()
all_categorical_features_for_ui = label_encode_features + ['SeniorCitizen'] + one_hot_encode_features

# Each entry: (object to dump, destination file, label for the confirmation message).
artifacts_to_save = [
    (best_lgbm_model, 'lgbm_churn_model.pkl', "LightGBM model"),
    (scaler, 'scaler.pkl', "StandardScaler"),
    (model_columns, 'model_columns.pkl', "Model column list"),
    (final_numeric_features, 'numerical_features.pkl', "List of numerical features"),
    (label_encode_features, 'label_encode_features.pkl', "List of Label Encoding features"),
    (one_hot_encode_features, 'one_hot_encode_features.pkl', "List of One-Hot Encoding features"),
    (all_categorical_features_for_ui, 'categorical_features_for_ui.pkl', "List of categorical features for UI"),
    (categorical_unique_values, 'categorical_unique_values.pkl', "Unique values of categorical features for UI"),
    (mapeamento_binario, 'mapeamento_binario.pkl', "Binary mapping"),
]

# Dump in the listed order, confirming each file as it is written.
for artifact, filename, label in artifacts_to_save:
    joblib.dump(artifact, filename)
    print(f"{label} saved as '{filename}'")

print("All components have been successfully saved!")


[LightGBM] [Info] Number of positive: 1869, number of negative: 5174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 7043, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265370 -> initscore=-1.018243
[LightGBM] [Info] Start training from score -1.018243
LightGBM model trained with optimized hyperparameters.
LightGBM model saved as 'lgbm_churn_model.pkl'
StandardScaler saved as 'scaler.pkl'
Model column list saved as 'model_columns.pkl'
List of numerical features saved as 'numerical_features.pkl'
List of Label Encoding features saved as 'label_encode_features.pkl'
List of One-Hot Encoding features saved as 'one_hot_encode_features.pkl'
List of categorical features for UI saved as 'categorical

In [1]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
