In [9]:
import numpy as np
import pandas as pd

X_train = pd.read_csv("C:/Users/Fernando/Desktop/Proyecto_Final_ML/data/X_train_3", index_col=0)
X_test = pd.read_csv("C:/Users/Fernando/Desktop/Proyecto_Final_ML/data/X_test_3", index_col=0)

features_columns = X_train.columns

y_train = pd.read_csv("C:/Users/Fernando/Desktop/Proyecto_Final_ML/data/y_train", index_col=0)
y_test = pd.read_csv("C:/Users/Fernando/Desktop/Proyecto_Final_ML/data/y_test", index_col=0)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [23]:
from sklearn.utils.class_weight import compute_class_weight

# Calcular pesos por clase y creación de diccionario para uso en modelos
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(zip(np.unique(y_train), class_weights))

In [3]:
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")

model_LR.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [4]:
predictions_LR = model_LR.predict(X_test)
predictions_proba_LR = model_LR.predict_proba(X_test)

In [5]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions_LR))

              precision    recall  f1-score   support

           0       0.28      0.75      0.41       815
           1       0.89      0.50      0.64      3194

    accuracy                           0.55      4009
   macro avg       0.58      0.63      0.52      4009
weighted avg       0.77      0.55      0.59      4009



### Gridsearch Regresion logística

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression



# Definir pipeline: escalado + regresión logística
pipe = Pipeline([
    ("logreg", LogisticRegression(max_iter=10000, class_weight="balanced"))
])

# Espacio de búsqueda de hiperparámetros
param_grid = {
    "logreg__penalty": ["l1", "l2"],
    "logreg__C": [0.01, 0.1, 1, 10, 100],
    "logreg__solver": ["liblinear", "saga"]
}

# GridSearch con CV=5
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

# Ajustar
grid.fit(X_train, y_train)

# Resultados
print("Mejores hiperparámetros:", grid.best_params_)
print("Mejor score en CV:", grid.best_score_)
print("Score en test:", grid.score(X_test, y_test))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Mejores hiperparámetros: {'logreg__C': 10, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}
Mejor score en CV: 0.6052012416004604
Score en test: 0.595178911167446


In [19]:
y_pred = grid.best_estimator_.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.28      0.75      0.41       815
           1       0.89      0.50      0.64      3194

    accuracy                           0.55      4009
   macro avg       0.58      0.63      0.53      4009
weighted avg       0.76      0.55      0.60      4009



### GridSearch Random Forest

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier




# Pipeline: (scaler es opcional en RF, pero lo dejamos por consistencia)
pipe = Pipeline([
    ("rf", RandomForestClassifier(random_state=42, class_weight="balanced"))
])

# Espacio de búsqueda de hiperparámetros
param_grid = {
    "rf__n_estimators": [100, 200, 500],     
    "rf__max_depth": [10, 20, 30]         
}

# GridSearch con CV=5
grid_RF = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring="f1_weighted",
    n_jobs=-1,
    verbose=1
)

# Ajustar
grid_RF.fit(X_train, y_train)



Fitting 5 folds for each of 9 candidates, totalling 45 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'rf__max_depth': [10, 20, ...], 'rf__n_estimators': [100, 200, ...]}"
,scoring,'f1_weighted'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,30
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# Resultados
print("Mejores hiperparámetros:", grid_RF.best_params_)
print("Mejor score en CV:", grid_RF.best_score_)
print("Score en test:", grid_RF.score(X_test, y_test))

Mejores hiperparámetros: {'rf__max_depth': 30, 'rf__n_estimators': 200}
Mejor score en CV: 0.7329532331727084
Score en test: 0.7264380623754896


In [22]:
y_pred = grid_RF.best_estimator_.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.32      0.22      0.26       815
           1       0.82      0.88      0.85      3194

    accuracy                           0.74      4009
   macro avg       0.57      0.55      0.55      4009
weighted avg       0.71      0.74      0.73      4009



### GridSearch para SVM


In [10]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC


# Pipeline: escalado + SVM
pipe = Pipeline([
    ("svm", SVC( class_weight='balanced'))
])

# Espacio de búsqueda de hiperparámetros
param_grid = {
    "svm__C": [0.1, 1, 10],            
    "svm__kernel": ["linear", "rbf", "poly"],
    "svm__gamma": ["auto"],
    "svm__degree": [2]                
}

# GridSearch con CV=5
grid_SVM = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring="f1_weighted",   
    n_jobs=-1,
    verbose=1
)

# Ajustar
grid_SVM.fit(X_train, y_train)

# Resultados
print("Mejores hiperparámetros:", grid_SVM.best_params_)
print("Mejor score en CV:", grid_SVM.best_score_)
print("Score en test:", grid_SVM.score(X_test, y_test))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


KeyboardInterrupt: 

In [5]:
from sklearn.metrics import classification_report

y_pred = grid_SVM.best_estimator_.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.27      0.65      0.39       815
           1       0.86      0.56      0.68      3194

    accuracy                           0.58      4009
   macro avg       0.57      0.60      0.53      4009
weighted avg       0.74      0.58      0.62      4009



In [12]:
## MEJORES MODELOS SEGÚN f1-score weighted

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import joblib 

## Regresión logística

modelo_RL = LogisticRegression(C= 10, penalty='l2', solver= 'liblinear', class_weight="balanced", random_state=42)

modelo_RL.fit(X_train, y_train)

y_pred_modelo_LR = modelo_RL.predict(X_test)

print("MODELO DE REGRESION LOGÍSICA")
print(classification_report(y_test, y_pred_modelo_LR))
print(f"AUC: {roc_auc_score(y_test, y_pred_modelo_LR)}")

joblib.dump(modelo_RL,"C:/Users/Fernando/Desktop/Proyecto_Final_ML/models/trained_model_RL.pkl")

print("-"*100)

## Random Forest

modelo_RF = RandomForestClassifier(max_depth= 30, n_estimators= 200, class_weight="balanced", random_state=42)

modelo_RF.fit(X_train, y_train)

y_pred_modelo_RF = modelo_RF.predict(X_test)

print("MODELO RANDOMFOREST")
print(classification_report(y_test, y_pred_modelo_RF))
print(f"AUC: {roc_auc_score(y_test, y_pred_modelo_RF)}")

joblib.dump(modelo_RL,"C:/Users/Fernando/Desktop/Proyecto_Final_ML/models/trained_model_RF.pkl")

print("-"*100)


## SVM

modelo_SVM = SVC(C= 10, degree= 2, gamma= 'auto', kernel= 'rbf', class_weight="balanced", probability=True, random_state=42)

modelo_SVM.fit(X_train, y_train)

y_pred_modelo_SVM = modelo_SVM.predict(X_test)

print("MODELO SVM")
print(classification_report(y_test, y_pred_modelo_SVM))
print(f"AUC:{roc_auc_score(y_test, y_pred_modelo_SVM)}")

joblib.dump(modelo_RL,"C:/Users/Fernando/Desktop/Proyecto_Final_ML/models/trained_model_SVM.pkl")

MODELO DE REGRESION LOGÍSICA
              precision    recall  f1-score   support

           0       0.28      0.75      0.41       815
           1       0.89      0.50      0.64      3194

    accuracy                           0.55      4009
   macro avg       0.58      0.63      0.53      4009
weighted avg       0.76      0.55      0.60      4009

AUC: 0.6285656387935968
----------------------------------------------------------------------------------------------------
MODELO RANDOMFOREST
              precision    recall  f1-score   support

           0       0.32      0.23      0.27       815
           1       0.82      0.88      0.85      3194

    accuracy                           0.75      4009
   macro avg       0.57      0.55      0.56      4009
weighted avg       0.72      0.75      0.73      4009

AUC: 0.5532023233747325
----------------------------------------------------------------------------------------------------
MODELO SVM
              precision    recall  f

['C:/Users/Fernando/Desktop/Proyecto_Final_ML/models/trained_model_SVM.pkl']