In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the cleaned dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="../data/clean_file.csv")
data.head(n=5)

Unnamed: 0,region,tenure,age,marital,address,income,ed,employ,retire,gender,reside,custcat
0,2,13,44,1,9,64.0,4,5,0,0,2,1
1,3,11,33,1,7,136.0,5,5,0,0,6,4
2,3,68,52,1,24,116.0,1,29,0,1,2,3
3,2,33,33,0,12,33.0,2,0,0,1,1,1
4,2,23,30,1,9,30.0,1,2,0,0,4,3


In [3]:
# Separate the target column ("custcat") from the predictor columns.
y = data["custcat"]
X = data.drop(columns=["custcat"])

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Standardize features

In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Train Models

In [6]:
# Disabled experiment: hyper-parameter grid for a RandomForest grid search.
# param_grid = {
#     'n_estimators': [50, 100, 200],  # Number of trees
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
#     'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
#     'min_samples_leaf': [1, 2, 4],  # Minimum samples required in a leaf
#     'max_features': ['sqrt', 'log2', None],  # Number of features considered when splitting
# }

# model = RandomForestClassifier(class_weight="balanced")
# grid = GridSearchCV(model, param_grid, cv=5)

In [7]:
def print_metrics(y_test, y_pred):
    """Print accuracy and the classification report, and return summary metrics.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same samples.

    Returns:
        dict: keys "accuracy", "precision", "recall" and "f1-score" (in that
        order); the last three are taken from the 'weighted avg' row of
        ``classification_report``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    weighted = classification_report(y_test, y_pred, output_dict=True)['weighted avg']

    print("Accuracy: ", accuracy)
    print("classification_report:\n", classification_report(y_test, y_pred))

    summary = {"accuracy": accuracy}
    for key in ("precision", "recall", "f1-score"):
        summary[key] = weighted[key]
    return summary

In [8]:
def split_data(
    data: pd.DataFrame,
    target: str = "custcat",
    test_size: float = 0.2,
    random_state: int = 42,
):
    """Split a frame into train/test features and labels.

    Args:
        data: Frame containing the predictor columns and the target column.
        target: Name of the label column; every other column is used as a
            feature. Defaults to "custcat".
        test_size: Passed to ``train_test_split``. Defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        The result of ``train_test_split(X, y, ...)``, i.e.
        ``X_train, X_test, y_train, y_test``.

    Raises:
        KeyError: If ``target`` is not a column of ``data``.
    """
    if target not in data.columns:
        # Fail with an explicit message instead of the generic one from drop().
        raise KeyError(f"target column {target!r} not found in data")

    X = data.drop(columns=[target])
    y = data[target]

    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def processing_data(X_train, X_test, y_train, y_test):
    """Standardize the features and integer-encode the labels.

    Both the ``StandardScaler`` and the ``LabelEncoder`` are fitted on the
    training split only and then applied to the training and test splits.

    Returns:
        list: ``[X_train, X_test, y_train, y_test]`` after transformation.
    """
    feature_scaler = StandardScaler().fit(X_train)
    target_encoder = LabelEncoder().fit(y_train)
    return [
        feature_scaler.transform(X_train),
        feature_scaler.transform(X_test),
        target_encoder.transform(y_train),
        target_encoder.transform(y_test),
    ]

In [9]:
import mlflow
import mlflow.artifacts
import mlflow.artifacts

def train_model(model, data: pd.DataFrame, name:str, version:str):
    """Fit ``model`` on ``data`` and record the run in MLflow.

    The frame is split with ``split_data`` and transformed with
    ``processing_data``; the model is fitted on the training split and
    evaluated on the test split. The run is logged to the "Refuerzo_ML"
    experiment under the run name ``"{name}-{version}"``.

    Logged to MLflow:
        * param ``model_type``;
        * the metrics returned by ``print_metrics`` for the test split;
        * metric ``overfitting`` = train accuracy - test accuracy;
        * ``model.best_params_`` when the model exposes it;
        * artifact ``classification_report.txt`` with the text report.

    Args:
        model: Estimator with ``fit``/``predict``. Search wrappers that expose
            ``best_params_`` / ``best_estimator_`` after fitting also get
            those printed and logged.
        data: Frame accepted by ``split_data``.
        name: Model name used in the run name and the ``model_type`` param.
        version: Suffix used in the run name.
    """
    mlflow.set_experiment("Refuerzo_ML")
    with mlflow.start_run(run_name=f"{name}-{version}"):
        mlflow.log_param("model_type", f"Model {name}")
        X_train, X_test, y_train, y_test = split_data(data)
        X_train, X_test, y_train, y_test = processing_data(X_train, X_test, y_train, y_test)
        # NOTE(review): the scaler is fitted on the full training split before
        # any cross-validation done inside `model` — confirm this is acceptable.
        model.fit(X_train, y_train)

        # Plain estimators have no best_params_; only report it when present
        # instead of failing with AttributeError after the fit has finished.
        params = getattr(model, "best_params_", None)
        if params is not None:
            print("best params:", params)
            print(model.best_estimator_)

        y_pred = model.predict(X_test)
        test_metrics = print_metrics(y_test, y_pred)

        y_train_pred = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)

        # Positive values mean the model scores better on data it was fitted on.
        overfitting = train_accuracy - test_metrics["accuracy"]

        for metric_name, metric_value in test_metrics.items():
            mlflow.log_metric(metric_name, metric_value)
        mlflow.log_metric("overfitting", overfitting)
        if params is not None:
            mlflow.log_params(params)

        # Log the report straight to the run so no fixed-name file is left in
        # (or overwritten in) the working directory.
        report = classification_report(y_test, y_pred)
        mlflow.log_text(report, "classification_report.txt")

# train_model()


In [10]:
# train_model(grid, data, "RFC_features_importances", "2")

In [11]:
# best_model = grid.best_estimator_
# best_model

In [12]:
# feature_names = data.drop("custcat", axis=1)
# feature_names = feature_names.columns
# importances = best_model.feature_importances_
# feature_importance_df = pd.DataFrame({
#     'Feature': feature_names,
#     'Importance': importances
# }).sort_values(by='Importance', ascending=False)
# feature_importance_df

In [13]:
# cols_importance = list(feature_importance_df.head(8)["Feature"])
# cols_importance

In [14]:
# Keep a fixed subset of eight predictor columns plus the target.
# NOTE(review): presumably the top-8 features from an earlier importance run — confirm.
cols_importance = [
    "tenure",
    "income",
    "ed",
    "employ",
    "age",
    "address",
    "reside",
    "region",
]
df_importance = data[[*cols_importance, "custcat"]]
df_importance

Unnamed: 0,tenure,income,ed,employ,age,address,reside,region,custcat
0,13,64.0,4,5,44,9,2,2,1
1,11,136.0,5,5,33,7,6,3,4
2,68,116.0,1,29,52,24,2,3,3
3,33,33.0,2,0,33,12,1,2,1
4,23,30.0,1,2,30,9,4,2,3
...,...,...,...,...,...,...,...,...,...
995,10,27.0,3,0,39,0,3,3,1
996,7,22.0,5,5,34,2,1,1,1
997,67,944.0,5,33,59,40,1,3,4
998,70,87.0,2,22,49,18,1,3,3


In [15]:
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': ['scale', 'auto', 0.01, 0.1, 1],
#     'kernel': ['linear', 'rbf', 'poly']
# }

# model = svm.SVC(random_state=42, class_weight='balanced')
# grid = GridSearchCV(model, param_grid, cv=5)
# train_model(grid, df_importance, "svm 7 features", "4")

In [16]:
from xgboost import XGBRFClassifier

# Base estimator and the hyper-parameter grid to search over.
# NOTE(review): XGBRFClassifier is XGBoost's random-forest estimator, not the
# gradient-boosted XGBClassifier — confirm this is the intended model.
xgb_model = XGBRFClassifier(random_state=42)
param_grid = dict(
    n_estimators=[50, 100, 200, 250],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

In [17]:
# Grid-search the XGBoost estimator with 5-fold CV, selecting by macro-averaged
# F1, and log the run to MLflow under the run name "xgboost-2".
grid = GridSearchCV(xgb_model, param_grid, scoring="f1_macro", cv=5)
train_model(grid, data, name="xgboost", version="2")

best params: {'colsample_bytree': 1.0, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=1.0, device=None,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric=None, feature_types=None, gamma=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=3, max_leaves=None,
                min_child_weight=1, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_estimators=200, n_jobs=None,
                num_parallel_tree=None, objective='multi:softprob',
                random_state=42, reg_alpha=None, ...)
Accuracy:  0.415
classification_report:
               precision    recall  f1-score   support



In [18]:
# Rank the predictor columns by the importance scores of the best estimator
# found by the most recent grid search (`grid`).
best_model = grid.best_estimator_
feature_names = data.drop(columns=["custcat"]).columns
importances = best_model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)
feature_importance_df

Unnamed: 0,Feature,Importance
6,ed,0.209357
1,tenure,0.178687
7,employ,0.092741
10,reside,0.088024
5,income,0.082
3,marital,0.065298
4,address,0.063393
2,age,0.062026
9,gender,0.057923
8,retire,0.051002


In [19]:
# Restrict the data to seven predictor columns plus the target.
cols_importance = [
    "ed",
    "tenure",
    "employ",
    "reside",
    "income",
    "marital",
    "address",
]
df_importance = data[[*cols_importance, "custcat"]]
df_importance

Unnamed: 0,ed,tenure,employ,reside,income,marital,address,custcat
0,4,13,5,2,64.0,1,9,1
1,5,11,5,6,136.0,1,7,4
2,1,68,29,2,116.0,1,24,3
3,2,33,0,1,33.0,0,12,1
4,1,23,2,4,30.0,1,9,3
...,...,...,...,...,...,...,...,...
995,3,10,0,3,27.0,0,0,1
996,5,7,5,1,22.0,0,2,1
997,5,67,33,1,944.0,0,40,4
998,2,70,22,1,87.0,0,18,3


In [20]:
# Grid-search an SVM on the reduced feature frame with 5-fold CV and log the
# run to MLflow under the run name "svm 7f-1".
# NOTE(review): no `scoring` is passed here, so the search uses the estimator's
# default score, whereas the XGBoost search selects by "f1_macro" — confirm the
# two searches are meant to use different criteria.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.01, 0.1, 1],
    kernel=['linear', 'rbf', 'poly'],
)

model = svm.SVC(class_weight="balanced", random_state=42)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
train_model(grid, df_importance, "svm 7f", "1")

best params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
SVC(C=1, class_weight='balanced', kernel='linear', random_state=42)
Accuracy:  0.415
classification_report:
               precision    recall  f1-score   support

           0       0.46      0.45      0.45        60
           1       0.31      0.38      0.34        39
           2       0.51      0.45      0.48        55
           3       0.36      0.35      0.36        46

    accuracy                           0.41       200
   macro avg       0.41      0.41      0.41       200
weighted avg       0.42      0.41      0.42       200

