In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the cleaned dataset from its path relative to the notebook and preview it.
DATA_PATH = "../data/clean_file.csv"
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,region,tenure,age,marital,address,income,ed,employ,retire,gender,reside,custcat
0,2,13,44,1,9,64.0,4,5,0,0,2,1
1,3,11,33,1,7,136.0,5,5,0,0,6,4
2,3,68,52,1,24,116.0,1,29,0,1,2,3
3,2,33,33,0,12,33.0,2,0,0,1,1,1
4,2,23,30,1,9,30.0,1,2,0,0,4,3


In [4]:
# Feature matrix: every column except the "custcat" target.
X = data.drop(columns="custcat")
# Target vector: the "custcat" column on its own.
y = data["custcat"]

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Estandarizar

In [6]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Train Models

In [7]:
# Hyper-parameter grid explored for the random forest (3 * 4 * 3 * 3 * 3 = 324 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required in a leaf
    'max_features': ['sqrt', 'log2', None],  # Number of features considered per split
}

# Seed the forest so repeated searches pick the same hyper-parameters and
# report the same scores.
model = RandomForestClassifier(random_state=42)
# n_jobs=-1 runs the cross-validation fits in parallel on all cores; it does
# not change the results.
grid = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

In [8]:
def print_metrics(y_test, y_pred, model_name="SVM"):
    """Print and return the headline classification metrics for a set of predictions.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, aligned with ``y_test``.
        model_name: Label shown in the printed accuracy line. Defaults to
            ``"SVM"`` so existing two-argument calls print exactly what they
            printed before; pass the real model name when evaluating another
            estimator.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1-score"``. The last three are the ``'weighted avg'`` entries of
        ``classification_report``.
    """
    # Compute accuracy once and reuse it for both the returned dict and the printout.
    accuracy = accuracy_score(y_test, y_pred)
    weighted_avg = classification_report(y_test, y_pred, output_dict=True)['weighted avg']
    metrics = {
        "accuracy": accuracy,
        "precision": weighted_avg['precision'],
        "recall": weighted_avg['recall'],
        "f1-score": weighted_avg['f1-score']
    }
    print(f"Accuracy {model_name}: ", accuracy)
    # The text form of the report is only for display; the dict form above feeds `metrics`.
    print("classification_report:\n", classification_report(y_test, y_pred))
    return metrics

In [44]:
def split_data(data: pd.DataFrame):
    """Split a frame into train/test features and targets.

    The "custcat" column is the target and every other column is a feature.
    20% of the rows are held out for testing, with a fixed seed of 42.

    Returns:
        The result of ``train_test_split``: features-train, features-test,
        target-train, target-test, in that order.
    """
    features = data.drop(columns="custcat")
    target = data["custcat"]
    return train_test_split(features, target, test_size=0.2, random_state=42)

def processing_data(X_train, X_test):
    """Standardize both splits with statistics learned from the training split.

    Returns:
        A two-element list: the scaled training features followed by the
        scaled test features.
    """
    # Fit on the training split only, then apply the same transformation to both.
    scaler = StandardScaler().fit(X_train)
    return [scaler.transform(split) for split in (X_train, X_test)]

In [None]:
import mlflow

def train_model(model, name, version="1"):
    """Fit a hyper-parameter search and log the run to MLflow.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test``, so those splits must exist before it is called.

    Args:
        model: Search object that exposes ``best_params_`` and
            ``best_estimator_`` after ``fit`` (for example ``GridSearchCV``).
        name: Model label used in the MLflow run name and in the
            ``model_type`` parameter.
        version: Suffix of the MLflow run name. It defaults to ``"1"`` so
            that two-argument calls keep working.

    Logged to the "Refuerzo_ML" experiment: the ``model_type`` parameter, the
    best hyper-parameters, the test metrics returned by ``print_metrics``,
    the train accuracy, and ``overfitting`` (train accuracy minus test accuracy).
    """
    mlflow.set_experiment("Refuerzo_ML")
    with mlflow.start_run(run_name=f"Test {name}-{version}"):
        mlflow.log_param("model_type", f"Model {name}")

        model.fit(X_train, y_train)
        params = model.best_params_
        print("best params:", params)
        print(model.best_estimator_)

        y_pred = model.predict(X_test)
        test_metrics = print_metrics(y_test, y_pred)

        y_train_pred = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)

        # Gap between train and test accuracy; a large positive value means
        # the model fits the training split much better than unseen data.
        overfitting = train_accuracy - test_metrics["accuracy"]

        for metric_name, metric_value in test_metrics.items():
            mlflow.log_metric(metric_name, metric_value)
        # Log the train accuracy too so the gap can be interpreted from the run alone.
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("overfitting", overfitting)
        mlflow.log_params(params)


In [11]:
# `best_estimator_` only exists after the search has been fitted. Fit it here
# when that has not happened yet, so this cell also works on a fresh kernel.
if not hasattr(grid, "best_estimator_"):
    grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_model

In [19]:
# Number of columns in the loaded frame (features plus target).
data.shape[1]

12

In [24]:
# Column labels of every feature, i.e. all columns except the "custcat" target.
feature_names = data.columns.drop("custcat")
importances = best_model.feature_importances_

# Pair each feature with its importance and list the most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
feature_importance_df

Unnamed: 0,Feature,Importance
1,tenure,0.187468
5,income,0.141356
6,ed,0.138381
7,employ,0.1301
4,address,0.120521
2,age,0.117337
10,reside,0.060339
0,region,0.050171
9,gender,0.025785
3,marital,0.025667


In [None]:
# Names of the six highest-ranked features (the table is already sorted by importance).
cols_importance = feature_importance_df["Feature"].head(6).tolist()
cols_importance

['tenure', 'income', 'ed', 'employ', 'address', 'age']

In [39]:
# Reduced frame: the selected features followed by the "custcat" target.
selected_columns = [*cols_importance, "custcat"]
df_importance = data[selected_columns]
df_importance

Unnamed: 0,tenure,income,ed,employ,address,age,custcat
0,13,64.0,4,5,9,44,1
1,11,136.0,5,5,7,33,4
2,68,116.0,1,29,24,52,3
3,33,33.0,2,0,12,33,1
4,23,30.0,1,2,9,30,3
...,...,...,...,...,...,...,...
995,10,27.0,3,0,0,39,1
996,7,22.0,5,5,2,34,1
997,67,944.0,5,33,40,59,4
998,70,87.0,2,22,18,49,3


In [43]:
# Hyper-parameter grid for the support vector classifier.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'poly']
}

model = svm.SVC(random_state=42)
grid = GridSearchCV(model, param_grid, cv=5)
# Pass the model label and the run version explicitly: train_model takes
# (model, name, version), so the earlier two-argument call did not match it.
train_model(grid, "SVM", "2")

best params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
SVC(C=10, kernel='linear', random_state=42)
Accuracy SVM:  0.405
classification_report:
               precision    recall  f1-score   support

           1       0.44      0.42      0.43        60
           2       0.31      0.28      0.30        39
           3       0.44      0.49      0.46        55
           4       0.39      0.39      0.39        46

    accuracy                           0.41       200
   macro avg       0.39      0.40      0.39       200
weighted avg       0.40      0.41      0.40       200

