# Preprocess, balance, train and test split, fit and predict

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [3]:
df = pd.read_csv("features.csv")

In [4]:
# Preprocesamiento
X = df.drop(['video_id', 'label'], axis=1)  # Incluye las nuevas columnas
y = df['label']
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# Balancear clases con SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_scaled, y)

In [6]:
# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.15, random_state=42)

In [7]:
# Entrenar Random Forest
rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_split': [2, 5]}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='f1')
rf_grid.fit(X_train, y_train)
print("Mejores parámetros RF:", rf_grid.best_params_)
rf_model = rf_grid.best_estimator_

# Entrenar SVM
svm_params = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.001, 0.01], 'kernel': ['rbf']}
svm_grid = GridSearchCV(SVC(probability=True), svm_params, cv=5, scoring='f1')
svm_grid.fit(X_train, y_train)
print("Mejores parámetros SVM:", svm_grid.best_params_)
svm_model = svm_grid.best_estimator_

Mejores parámetros RF: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Mejores parámetros SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [8]:
# Evaluar modelos
print("\nResultados Random Forest:")
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Matriz de confusión RF:\n", confusion_matrix(y_test, y_pred))
print("\nResultados SVM:")
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Matriz de confusión SVM:\n", confusion_matrix(y_test, y_pred))


Resultados Random Forest:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      5238
           1       0.94      0.97      0.96      5199

    accuracy                           0.96     10437
   macro avg       0.96      0.96      0.96     10437
weighted avg       0.96      0.96      0.96     10437

Matriz de confusión RF:
 [[4944  294]
 [ 163 5036]]

Resultados SVM:
              precision    recall  f1-score   support

           0       0.77      0.57      0.66      5238
           1       0.66      0.83      0.74      5199

    accuracy                           0.70     10437
   macro avg       0.72      0.70      0.70     10437
weighted avg       0.72      0.70      0.70     10437

Matriz de confusión SVM:
 [[3001 2237]
 [ 876 4323]]


In [9]:
# Guardar modelos
joblib.dump(rf_model, 'models/rf_model.pkl')
joblib.dump(svm_model, 'models/svm_model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')

['models/scaler.pkl']