In [None]:
import pandas as pd
import os
from glob import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load every landmark CSV found in the dataset directory into one DataFrame.
csv_dir = "../dataset_landmarks"

# glob() yields paths in arbitrary, filesystem-dependent order. Sorting makes
# the concatenated row order (and therefore the seeded train/test split that
# consumes these rows) reproducible across machines and runs.
csv_files = sorted(glob(os.path.join(csv_dir, "*.csv")))

if not csv_files:
    # Without this guard pd.concat([]) fails with the unhelpful
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(
        f"No se encontraron archivos CSV en {os.path.abspath(csv_dir)!r}")

df = pd.concat([pd.read_csv(path) for path in csv_files], ignore_index=True)

# Remove the 'frame' column and every column whose name contains 'vis'
# (presumably per-landmark visibility scores -- confirm against the CSV
# schema) so they are not used as features.
unused_columns = [
    name for name in df.columns
    if name == 'frame' or 'vis' in name
]
df.drop(columns=unused_columns, inplace=True)

# Replace the raw 'label' column with integer class ids. The fitted encoder
# keeps the id -> original label mapping in `le.classes_`.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['label'])
df['label'] = encoded_labels

# Target vector is the encoded label; the feature matrix is everything else.
y = df['label']
X = df.drop(columns=['label'])

# Hold out 20% of the rows for testing, stratified by class. The split is done
# on the raw features so preprocessing can be fitted on the training portion
# only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the scaler on the training rows alone: fitting on the full dataset would
# leak test-set mean/variance into preprocessing (and into the scaler that is
# persisted for inference).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for backward compatibility with code that expects the fully scaled
# feature matrix; it is now derived from the train-fitted scaler.
X_scaled = scaler.transform(X)

# Base estimator; the fixed seed makes the tuning run reproducible.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter candidates explored exhaustively (2 x 3 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
)

# 5-fold cross-validated search scored by weighted F1, using all CPU cores.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Mejores hiperparámetros:", grid.best_params_)
best_rf = grid.best_estimator_

# Evaluate the tuned model on the held-out test split.
y_pred = best_rf.predict(X_test)

# Pin the label set explicitly. Without `labels=`, scikit-learn infers the
# classes from y_test/y_pred, so a class missing from both would make the
# report and the matrix disagree with the names taken from `le.classes_`.
class_ids = list(range(len(le.classes_)))
# classification_report measures len() of each name, so non-string labels
# (e.g. numeric class codes in the CSVs) must be converted to text first.
class_names = [str(name) for name in le.classes_]

print("\nReporte de clasificación:")
print(classification_report(
    y_test, y_pred, labels=class_ids, target_names=class_names))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title("Matriz de Confusión - Random Forest")
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.tight_layout()
# Save before show(): the figure may be released once it has been displayed.
plt.savefig("confusion_matrix_rf.png")
plt.show()

# Rank features by the forest's importance scores and keep the ten largest,
# ordered from most to least important.
importances = best_rf.feature_importances_
descending_order = importances.argsort()[::-1]
top_idx = descending_order[:10]
top_features = X.columns[top_idx].tolist()

# Horizontal bar chart of the selected features.
plt.figure(figsize=(8, 4))
top_scores = importances[top_idx]
sns.barplot(x=top_scores, y=top_features)
plt.title("Top 10 características más importantes")
plt.xlabel("Importancia")
plt.tight_layout()
plt.savefig("importancia_rf.png")
plt.show()

# Persist everything needed to reproduce predictions later: the tuned model,
# the feature scaler, and the label encoder (to map ids back to names).
artifacts = (
    (best_rf, "modelo_rf.pkl"),
    (scaler, "scaler.pkl"),
    (le, "label_encoder.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)