In [1]:
%pip install pandas scikit-learn numpy google-cloud-bigquery hopsworks hsfs[python] optuna xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
demography_df = pd.read_csv("https://repo.hops.works/dev/davit/churn/demography.csv")
customer_info_df = pd.read_csv("https://repo.hops.works/dev/davit/churn/customer_info.csv")
subscriptions_df = pd.read_csv("https://repo.hops.works/dev/davit/churn/subscriptions.csv")

In [4]:
def convert_dt(df):   
    df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')
    df['clienteidentifier'] = df['clienteidentifier'].astype(str)
    df['multiplelines'] = df['multiplelines'].astype(str)
    df['internetservice'] = df['internetservice'].astype(str)
    df['onlinesecurity'] = df['onlinesecurity'].astype(str)
    df['onlinebackup'] = df['onlinebackup'].astype(str)
    df['deviceprotection'] = df['deviceprotection'].astype(str)
    df['techsupport'] = df['techsupport'].astype(str)
    df['streamingtv'] = df['streamingtv'].astype(str)
    df['streamingmovies'] = df['streamingmovies'].astype(str)
    df['contract'] = df['contract'].astype(str)
    df['paymentmethod'] = df['paymentmethod'].astype(str)
    df['gender'] = df['gender'].astype(str)
    df['paperlessbilling'] = df['paperlessbilling'].astype(str)
    df['partner'] = df['partner'].astype(str)
    df['dependents'] = df['dependents'].astype(str)
    df['phoneservice'] = df['phoneservice'].astype(str)
    df['seniorcitizen'] = df['seniorcitizen'].astype(str)
    df['monthlycharges'] = df['monthlycharges'].astype(float)
    df['totalcharges'] = df['totalcharges'].astype(float)
    df['tenure'] = df['tenure'].astype(float)
    df['churn'] = df['churn'].astype(str)
    return df
    #df['fecha_ingreso'] = pd.to_datetime(df['fecha_ingreso']).dt.date
    
def reindex_df(df):
    new_order = ['clienteidentifier','multiplelines','internetservice','onlinesecurity','onlinebackup',
             'deviceprotection','techsupport','streamingtv','streamingmovies','contract','paymentmethod',
             'gender','paperlessbilling','partner','dependents','phoneservice','seniorcitizen',
             'monthlycharges','totalcharges','tenure','churn']
    
    data = df.reindex(columns=new_order)
    return data

In [5]:
# Realizar el join por el campo ID
df = customer_info_df.merge(subscriptions_df, on='customerID').merge(demography_df, on='customerID')
df = pd.DataFrame(df)
df.columns = df.columns.str.lower()
df = df.rename(columns={'customerid': 'clienteidentifier'})
# Imprimir el resultado


In [6]:
df = convert_dt(df)
data = reindex_df(df)

In [8]:
## TRANSFORMADOR AUXILIAR
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

def impute_missing_values(data):
    # Variables categóricas
    categorical_cols = data.select_dtypes(include='object').columns
    #categorical_cols = categorical_cols.drop('churn')  # Excluir 'churn'
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

    # Variables numéricas
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    numeric_imputer = SimpleImputer(strategy='mean')
    data[numeric_cols] = numeric_imputer.fit_transform(data[numeric_cols])

    return data

def normalize_numeric_variables(data):
    numeric_cols = ['monthlycharges', 'totalcharges', 'tenure']

    scaler = MinMaxScaler()
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

    return data

def map_categorical_variables(data):
    mapping = {
        'multiplelines': {'No': 0, 'Yes': 0.5, 'No phone service': 1},
        'internetservice': {'Fiber optic': 0, 'DSL': 0.5, 'No': 1},
        'onlinesecurity': {'No': 0, 'Yes': 0.5, 'No internet service': 1},
        'onlinebackup': {'No': 0, 'Yes': 0.5, 'No internet service': 1},
        'deviceprotection': {'No': 0, 'Yes': 0.5, 'No internet service': 1},
        'techsupport': {'No': 0, 'Yes': 0.5, 'No internet service': 1},
        'streamingtv': {'No': 0, 'Yes': 0.5, 'No internet service': 1},
        'streamingmovies': {'No': 0, 'Yes': 0.5, 'No internet service': 1},
        'contract': {'Month-to-month': 0, 'One year': 0.5, 'Two year': 1},
        'gender' : {'Male' : 0, 'Female': 1},
        'paymentmethod': {'Electronic check': 0, 'Mailed check': 0.3, 'Bank transfer (automatic)': 0.6, 'Credit card (automatic)': 1},
        'paperlessbilling' : {'Yes' : 1, 'No': 0},
        'partner' : {'Yes' : 1, 'No': 0},
        'dependents' : {'Yes' : 1, 'No': 0},
        'phoneservice' : {'Yes' : 1, 'No': 0},
        'seniorcitizen' : {'0' : 0, '1': 1},
        'churn' : {'Yes' : 1, 'No': 0},    
    }

    data.replace(mapping, inplace=True)

    return data

def remove_duplicates(data):
    data.drop_duplicates(inplace=True)
    return data

In [9]:
data = impute_missing_values(data)
data = remove_duplicates(data)
data =  normalize_numeric_variables(data)
data = map_categorical_variables(data)
data = data.drop('clienteidentifier', axis=1)

In [11]:
data

Unnamed: 0,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paymentmethod,gender,paperlessbilling,partner,dependents,phoneservice,seniorcitizen,monthlycharges,totalcharges,tenure,churn
0,1.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,0,0,0,0.115423,0.001275,0.013889,0
1,0.0,0.5,0.5,0.0,0.5,0.0,0.0,0.0,0.5,0.3,0,0,0,0,1,0,0.385075,0.215867,0.472222,0
2,0.0,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.3,0,1,0,0,1,0,0.354229,0.010310,0.027778,1
3,1.0,0.5,0.5,0.0,0.5,0.5,0.0,0.0,0.5,0.6,0,0,0,0,0,0,0.239303,0.210241,0.625000,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,1,0,0.521891,0.015330,0.027778,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.3,0,1,1,1,1,0,0.662189,0.227521,0.333333,0
7039,0.5,0.0,0.0,0.5,0.5,0.0,0.5,0.5,0.5,1.0,1,1,1,1,1,0,0.845274,0.847461,1.000000,0
7040,1.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1,0,0,0.112935,0.037809,0.152778,0
7041,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0,1,1,0,1,1,0.558706,0.033210,0.055556,1


## Optimización de Hiperparametros

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBoostClassifier
import optuna

# Función objetivo para la optimización de hiperparámetros con Optuna
def objective(trial):
    params = {
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
        'max_depth' : trial.suggest_int('max_depth', 3, 10),
        'n_estimators' : trial.suggest_int('n_estimators', 100, 1000, step=100),
    }

    # Código para cargar tus datos
    # X contiene las características de entrada y y contiene las etiquetas objetivo
    X = data.drop('churn', axis = 1)
    y = data['churn']

    # Lista para almacenar los resultados de evaluación
    f1_scores = []
    accuracies = []
    confusion_matrices = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Utilizamos 5-fold cross-validation

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train)

        # Realizar predicciones en conjunto de prueba
        y_pred = model.predict(X_test)

        # Calcular métricas de evaluación
        f1 = f1_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        confusion = confusion_matrix(y_test, y_pred)

        # Almacenar los resultados de evaluación
        f1_scores.append(f1)
        accuracies.append(accuracy)
        confusion_matrices.append(confusion)

    return np.mean(f1_scores)  # Puedes ajustar aquí la métrica que deseas optimizar

# Ejecutar la optimización de hiperparámetros con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # Especifica el número de iteraciones que desees

# Imprimir los mejores hiperparámetros encontrados por Optuna
print("Mejores hiperparámetros:")
print(study.best_params)

[I 2023-07-01 21:26:55,210] A new study created in memory with name: no-name-37c46333-bc5d-4549-a354-e85f09d8ee09
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-01 21:27:32,845] Trial 0 finished with value: 0.5432489329684937 and parameters: {'learning_rate': 0.1858288477967429, 'max_depth': 10, 'n_estimators': 800}. Best is trial 0 with value: 0.5432489329684937.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-01 21:27:36,816] Trial 1 finished with value: 0.39731990898194075 and parameters: {'learning_rate': 1.9842558018269472, 'max_depth': 5, 'n_estimators': 400}. Best is trial 0 with value: 0.5432489329684937.
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 3),
[I 2023-07-01 21:27:48,658] Trial 2 finished with value: 0.5460689292256955 and parameters: {'learning_rate': 0.3290163632007756, 'max_depth': 9, 'n_estimators': 300}. Best is trial 2 with value: 0.5460689292256955.
  'learning_rate' : 

Mejores hiperparámetros:
{'learning_rate': 0.02726342379655315, 'max_depth': 4, 'n_estimators': 1000}
Resultados de evaluación:


NameError: name 'f1_scores' is not defined

In [17]:
# Imprimir los mejores hiperparámetros encontrados por Optuna
print("Mejores hiperparámetros:")
print(study.best_params)

Mejores hiperparámetros:
{'learning_rate': 0.02726342379655315, 'max_depth': 4, 'n_estimators': 1000}


## Entrenamiento final del model

In [21]:
import xgboost as xgb
from xgboost import XGBClassifier

#dividir conjunto de datos
X = data.drop('churn', axis = 1)
y = data['churn']

# instancia parametros y modelo
params = study.best_params
xgb_model = xgb.XGBClassifier(**params)

# instancial metricas de evaluación
f1_scores = []
accuracies = []
confusion_matrices = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    xgb_model.fit(X_train, y_train)

    # Realizar predicciones en conjunto de prueba
    y_pred = xgb_model.predict(X_test)

    # Calcular métricas de evaluación
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    # Almacenar los resultados de evaluación
    f1_scores.append(f1)
    accuracies.append(accuracy)
    confusion_matrices.append(confusion)



In [22]:
# Imprimir los resultados
print("Resultados de evaluación en todo el conjunto de datos:")
print("F1 score:", f1)
print("Accuracy:", accuracy)
print("Confusion matrix:", confusion)

Resultados de evaluación en todo el conjunto de datos:
F1 score: 0.5633383010432191
Accuracy: 0.7919034090909091
Confusion matrix: [[926 108]
 [185 189]]


In [23]:
xgb_model

In [28]:
import os
import pickle

# Ruta del directorio actual (ajustar según la ubicación del script en el repositorio)
dir_path = os.path.join(os.getcwd(), "src")

# Ruta de la carpeta "models"
models_dir = os.path.join(dir_path, "..", "models")

# Crear la carpeta "models" si no existe
if not os.path.exists(models_dir):
    os.makedirs(models_dir)

# Ruta del archivo de modelo dentro de la carpeta "models"
model_path = os.path.join(models_dir, "xgb_predictor.pkl")

# Guardar el modelo en disco
with open(model_path, 'wb') as file:
    pickle.dump(xgb_model, file)



In [27]:
# Cargar el modelo desde el archivo
with open(model_path, 'rb') as file:
    loaded_model = pickle.load(file)

In [34]:
!pip install --upgrade hsfs confluent_kafka

Collecting confluent_kafka
  Using cached confluent_kafka-2.1.1-cp39-cp39-win_amd64.whl (3.4 MB)
Installing collected packages: confluent-kafka
  Attempting uninstall: confluent-kafka
    Found existing installation: confluent-kafka 1.9.0
    Uninstalling confluent-kafka-1.9.0:
      Successfully uninstalled confluent-kafka-1.9.0
Successfully installed confluent-kafka-2.1.1


In [35]:
import hopsworks

project = hopsworks.login()
fs = project.get_feature_store()


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/61594
Connected. Call `.close()` to terminate connection gracefully.


In [36]:
query = """
SELECT *
FROM `protean-fabric-386717.ml_datasets.churn_last`
WHERE fecha_ingreso = CURRENT_DATE()
"""

In [37]:
HopsFSConnector = fs.get_storage_connector("churn_synt")

In [38]:
HopsFSConnector.read(query=query, data_format=None)


FileNotFoundError: [Errno 2] No such file or directory: '/apps/hive/warehouse/churn_project_featurestore.db/storage_connector_resources/protean-fabric-386717-d6a21dd66382.json'

In [25]:
#guardar modelo en carperta
import os
import pickle

# Ruta del directorio actual (donde se encuentra el script.py)
dir_path = os.path.dirname(os.path.abspath(__file__))

# Ruta de la carpeta "models"
models_dir = os.path.join(dir_path, "..", "models")

# Crear la carpeta "models" si no existe
if not os.path.exists(models_dir):
    os.makedirs(models_dir)

# Ruta del archivo de modelo dentro de la carpeta "models"
model_path = os.path.join(models_dir, "xgb_predictor.pkl")

# Guardar el modelo en disco
with open(model_path, 'wb') as file:
    pickle.dump(xgb_model, file)

# Cargar el modelo desde el archivo
with open(model_path, 'rb') as file:
    loaded_model = pickle.load(file)

NameError: name '__file__' is not defined