# Preparación de entorno

## Instalación de librerías

In [None]:
!pip install transformers
!pip install -U transformers
!pip install ydata-profiling
!pip install squarify
!pip install wordcloud
!pip install nltk
!pip install transformers torch --quiet
!pip install h2o
!pip install pyngrok



KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import squarify
from PIL import Image
from matplotlib.colors import LinearSegmentedColormap
from datetime import datetime, timezone
from collections import Counter
from math import log2

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score, mean_squared_error, r2_score, recall_score, roc_auc_score, precision_score, make_scorer, mean_absolute_error, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
import multiprocessing
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.base import ClassifierMixin

from itertools import product
from IPython.display import HTML
from ydata_profiling import ProfileReport

import nltk
nltk.download('stopwords')
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
import torch
from torch.utils.data import Dataset
import os

from transformers import pipeline
from typing import List, Union

import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from flask import Flask, request, jsonify
import joblib
from pyngrok import ngrok

## Importación de datos

In [None]:
# Load the raw technographics CSV from the hard-coded /content path into `df`,
# the working DataFrame that every later cell reads and reassigns.
df= pd.read_csv('/content/technographics.csv')

## Normalización de variables

In [None]:
# Funciones para normalizar variables
def normalize_cols(df):
    """Return a copy of *df* whose column names are normalised.

    Each name is stripped of surrounding whitespace, lower-cased, and has
    spaces and hyphens replaced by underscores. The input frame is not
    modified.
    """
    normalized = df.copy()
    names = normalized.columns.str.strip().str.lower()
    # Both separators map to the same replacement character.
    for separator in (" ", "-"):
        names = names.str.replace(separator, "_")
    normalized.columns = names
    return normalized

def parse_dates_inplace(df, cols):
    """Convert the listed columns of *df* to UTC datetimes, in place.

    Names in *cols* that are not columns of *df* are skipped. Values that
    cannot be parsed become NaT (``errors="coerce"``). Returns None.
    """
    present = [name for name in cols if name in df.columns]
    for name in present:
        df[name] = pd.to_datetime(df[name], errors="coerce", utc=True)

def pick_last_date_row(row, candidates):
    """Return the first non-null value of *row* among *candidates*.

    Candidates are tried in the order given; names missing from *row* are
    skipped. If no candidate holds a non-null value, ``pd.NaT`` is returned.
    Note that the result depends on list order only: values are not compared
    to find the most recent one.
    """
    usable = (
        row[name]
        for name in candidates
        if name in row and pd.notna(row[name])
    )
    return next(usable, pd.NaT)

def shannon_entropy_normalized(series):
    """Return the Shannon entropy of *series*, scaled by log2 of its level count.

    Missing values count as their own level (``dropna=False``). The result is
    0.0 for an empty series or a series with a single distinct level.
    """
    freq = series.value_counts(dropna=False).to_numpy()
    n_obs = freq.sum()
    if n_obs == 0:
        return 0.0

    n_levels = (freq > 0).sum()
    if n_levels <= 1:
        return 0.0

    probs = freq / n_obs
    # The small epsilon keeps log2 finite should a zero probability appear.
    entropy = -(probs * np.log2(probs + 1e-12)).sum()
    return float(entropy / np.log2(n_levels))

# Date columns of interest, listed in the priority order in which
# pick_last_date_row tries them.
date_candidates = [
    "last_date_found",
    "last_date_found_source_job_url",
    "last_date_found_source_job_title",
    "last_date_found_source_job_description",
    "last_date_any",
    "first_date_found",
    "first_date_found_source_job_description",
    "first_date_found_source_job_title",
    "first_date_found_source_job_url",
]

# Standardise the column names so they can match the snake_case names above.
df = normalize_cols(df)

# Convert whichever candidate columns exist to UTC datetimes (in place).
parse_dates_inplace(df, date_candidates)

# Per row, keep the first non-null candidate (in list order) as "last_date_any".
df["last_date_any"] = df.apply(pick_last_date_row, axis=1, candidates=date_candidates)

In [None]:
# Date columns to turn into "days since" features and then drop.
# The entries are the same names as the `date_candidates` list of the previous cell.
date_cols = [
    "last_date_found",
    "last_date_found_source_job_url",
    "last_date_found_source_job_title",
    "last_date_found_source_job_description",
    "last_date_any",
    "first_date_found",
    "first_date_found_source_job_description",
    "first_date_found_source_job_title",
    "first_date_found_source_job_url"]

def add_days_since_and_drop_dates(df: pd.DataFrame, date_cols: list) -> pd.DataFrame:
    """Replace date columns with "days elapsed until now" features.

    For every name in *date_cols* that is a column of *df*, a new column
    ``days_since_<name>`` holds the whole number of days between the current
    UTC time and the (UTC-coerced) date. Missing or unparseable dates are
    filled with the column's maximum day count plus one, or with 0 when the
    column has no valid date at all. The original date columns are dropped.

    Args:
        df: Input frame; it is not modified.
        date_cols: Names of the date columns to convert. Names absent from
            *df* are ignored.

    Returns:
        A new DataFrame with the ``days_since_*`` columns added and the
        listed date columns removed.
    """
    df = df.copy()
    now_utc = pd.Timestamp(datetime.now(timezone.utc))

    for col in date_cols:
        if col not in df.columns:
            continue
        parsed = pd.to_datetime(df[col], errors="coerce", utc=True)
        days = (now_utc - parsed).dt.days
        max_days = days.max()
        fill_value = max_days + 1 if pd.notna(max_days) else 0
        # Assign the filled Series back explicitly: `df[col].fillna(..., inplace=True)`
        # acts on a temporary object and leaves the frame untouched under
        # pandas Copy-on-Write.
        df[f"days_since_{col}"] = days.fillna(fill_value)

    # Drop the original date columns; names that never existed are ignored.
    return df.drop(columns=date_cols, errors="ignore")

# Apply the conversion: `df` is rebound to the returned copy, which carries the
# days_since_* columns instead of the original date columns.
df = add_days_since_and_drop_dates(df, date_cols)

## Creación de variable target

In [None]:
# Subcategory slugs associated with ZL.
# Every entry needs its own trailing comma: without it Python silently joins
# adjacent string literals, which previously merged 'email-marketing-platforms'
# and 'code-free-chatbot-builders' into one slug that matched nothing.
zl_tech_subcategories = ['build-automation',
                         'crm-platforms',
                         'customer-satisfaction',
                         'marketing-automation',
                         'appointments-and-scheduling',
                         'customer-data-integration',
                         'business-intelligence-bi',
                         'resource-scheduling',
                         'landing-page',
                         'data-visualization',
                         'email-marketing-platforms',
                         'code-free-chatbot-builders',
                         'platform-as-a-service-paas']

# Row-level flag: True when the row's subcategory_slug is in the ZL list
df['zl_tech_sub'] = df['subcategory_slug'].isin(zl_tech_subcategories)

# Company-level flag: every row receives the maximum of zl_tech_sub within its
# company_id, i.e. True if any row of that company is flagged
df['zl_tech'] = df.groupby('company_id')['zl_tech_sub'].transform('max')

# Distribution of zl_tech (counted per row, not per company)
print("Frequency distribution of zl_tech:")
display(df['zl_tech'].value_counts())

# Análisis exploratorio de datos

In [None]:
# Build a ydata-profiling report for the whole frame and write it to
# 'Technographics.html' in the current working directory.
profile = ProfileReport(df)
profile.to_file(output_file='Technographics.html')

In [None]:
# Technology types: word cloud of the words found in keyword_slug

colores = ["#FFD700", "#FF8C00", "#8B4513"]
colormap_naranja_marron = LinearSegmentedColormap.from_list("naranja_marron", colores)

# Turn each slug ("a-b-c") into its list of words.
palabras_por_slug = (
    df['keyword_slug']
    .dropna()
    .astype(str)
    .str.replace("-", " ")
    .str.split())

# Flatten in a single pass. Summing a Series of lists concatenates them one by
# one, which is quadratic in the number of rows.
palabras = [palabra for lista in palabras_por_slug for palabra in lista]

texto = " ".join(palabras)

# Word cloud
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    background_color='white',
    colormap=colormap_naranja_marron,
    width=800,
    height=400
).generate(texto)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Technologies")
plt.show()

In [None]:
# Style: 2x2 grid of exploratory plots, axes flattened so they can be indexed 0..3
sns.set(style="whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(16, 14))
axes = axes.flatten()

# Shared palette; keys are strings because the plots below use the string
# column zl_tech_str as their grouping variable
palette = {"True": "#FF8C00", "False": "#B0C4DE"}

# 1: Top 20 subcategories by row count, coloured by membership in the ZL list
top_subcats = df['subcategory_slug'].value_counts().head(20)
top_df = top_subcats.reset_index()
top_df.columns = ['subcategory_slug', 'count']
top_df['is_zl_tech'] = top_df['subcategory_slug'].isin(zl_tech_subcategories)

# This plot uses the boolean column directly, hence a palette with boolean keys
sns.barplot(
    data=top_df, x="count", y="subcategory_slug",
    hue="is_zl_tech", dodge=False, palette={True: "#FF8C00", False: "#B0C4DE"}, ax=axes[0])
axes[0].set_title("Top 20 Subcategory Slugs")
axes[0].set_xlabel("Count")
axes[0].set_ylabel("Subcategory Slug")

# Convert zl_tech to string for the box plots (this adds a column to df)
df['zl_tech_str'] = df['zl_tech'].astype(str)

# 2: Box plot of jobs (log scale on the y axis)
sns.boxplot(x='zl_tech_str', y='jobs', data=df, palette=palette, ax=axes[1])
axes[1].set_title("Distribution of Jobs by Tech offered by ZL (Log Scale)")
axes[1].set_xlabel("Tech")
axes[1].set_ylabel("Jobs")
axes[1].set_yscale("log")

# 3: Box plot of relative occurrence (linear scale)
sns.boxplot(x='zl_tech_str', y='relative_occurrence_within_category_source_jobs', data=df, palette=palette, ax=axes[2])
axes[2].set_title("Relative Occurrence within Category by Techs offered by ZL")
axes[2].set_xlabel("Tech")
axes[2].set_ylabel("Relative Occurrence")

# 4: Job counts over the 180 / 30 / 7 day windows
job_metrics = ["jobs_last_180_days", "jobs_last_30_days", "jobs_last_7_days"]
df_jobs = df[['zl_tech_str'] + job_metrics]

# Long format: one row per (zl_tech_str, metric) pair
df_melted = df_jobs.melt(id_vars="zl_tech_str", var_name="Job_Metric", value_name="Count")

# Fix the x-axis order (same sequence as job_metrics)
order = ["jobs_last_180_days", "jobs_last_30_days", "jobs_last_7_days"]
df_melted["Job_Metric"] = pd.Categorical(df_melted["Job_Metric"], categories=order, ordered=True)

sns.lineplot(
    data=df_melted, x="Job_Metric", y="Count", hue="zl_tech_str",
    marker="o", palette=palette, ax=axes[3], estimator="mean")
axes[3].set_title("Evolution of Job Metrics by Tech offered by ZL")
axes[3].set_xlabel("Job Metrics (180 → 30 → 7)")
axes[3].set_ylabel("Average Count")
axes[3].legend(title="Tech")

# Label every point with its mean, recomputed per group; the enumerate index is
# used as the x position of the categorical axis
for tech in df_melted["zl_tech_str"].unique():
    subset = df_melted[df_melted["zl_tech_str"] == tech].groupby("Job_Metric")["Count"].mean()
    for i, (metric, val) in enumerate(subset.items()):
        axes[3].annotate(f"{val:.0f}", (i, val), textcoords="offset points", xytext=(0,5), ha='center', fontsize=9)

plt.tight_layout()
plt.show()

# Búsqueda del mejor modelo

## Comparación de modelos

In [None]:
def comparar_modelos_clasificacion(
    df: pd.DataFrame,
    target_col: str,
    incluir_columnas: Union[List[str], None] = None,
    test_size: float = 0.2,
    random_state: int = 42,
    cv: int = 5
) -> pd.DataFrame:
    """Cross-validate several classifiers and tabulate their mean scores.

    Four models (random forest, logistic regression, XGBoost and an MLP) are
    each wrapped in the same preprocessing pipeline and evaluated with
    cross-validation on accuracy and weighted precision, recall and F1.

    Args:
        df: Data containing the feature columns and the target column.
        target_col: Name of the target column. Object or category targets are
            label-encoded first.
        incluir_columnas: Feature columns to use. Must name at least one column.
        test_size: Unused; kept so existing calls keep working.
        random_state: Seed passed to the models that accept one.
        cv: Cross-validation specification forwarded to scikit-learn.

    Returns:
        A DataFrame with one row per model and one column per metric, each
        value being the mean test score rounded to 4 decimals.

    Raises:
        ValueError: If no feature columns are given.
    """
    # Imported locally so the function does not rely on the file-level imports
    # being extended.
    from sklearn.model_selection import cross_validate

    columnas = list(incluir_columnas) if incluir_columnas is not None else []
    if not columnas:
        raise ValueError("incluir_columnas must contain at least one feature column.")

    # Split features and target
    X = df[columnas]
    y = df[target_col]

    # Encode the target when it is not already numeric/boolean
    if y.dtype == 'object' or y.dtype.name == 'category':
        y = LabelEncoder().fit_transform(y)

    # NOTE(review): every selected column, numeric ones included, goes through
    # most-frequent imputation and one-hot encoding, so numeric features are
    # treated as categorical levels. Confirm this is intended.
    preprocessor = ColumnTransformer(transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), X.columns)])

    # Models with regularised settings to limit overfitting
    modelos = {
        'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=random_state),
        'LogisticRegression': LogisticRegression(max_iter=1000, C=0.5),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                 max_depth=3, subsample=0.8, colsample_bytree=0.8,
                                 random_state=random_state),
        'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, early_stopping=True,
                             random_state=random_state)}

    # Metrics evaluated on each cross-validation fold
    scorers = {
        'Accuracy': make_scorer(accuracy_score),
        'Precision': make_scorer(precision_score, average='weighted'),
        'Recall': make_scorer(recall_score, average='weighted'),
        'F1': make_scorer(f1_score, average='weighted')}

    resultados = {}

    for nombre, modelo in modelos.items():
        # Named modelo_pipeline so it does not shadow the `pipeline` factory
        # imported from transformers.
        modelo_pipeline = Pipeline(steps=[
            ('preprocessing', preprocessor),
            ('classifier', modelo)])

        # A single cross-validation pass scores all metrics, instead of
        # refitting every fold once per metric.
        cv_scores = cross_validate(modelo_pipeline, X, y, cv=cv, scoring=scorers)
        resultados[nombre] = {
            metric_name: round(np.mean(cv_scores[f"test_{metric_name}"]), 4)
            for metric_name in scorers}

    return pd.DataFrame(resultados).T

In [None]:
# Academic setting: compare the models using all available variables.
# The call is the last expression of the cell, so the notebook displays the
# returned score table.

comparar_modelos_clasificacion(df,
                               target_col='zl_tech',
                               incluir_columnas=['days_since_first_date_found', 'days_since_last_date_found_source_job_url','days_since_last_date_found_source_job_title',
                                                 'days_since_last_date_found_source_job_description','days_since_last_date_any','days_since_first_date_found_source_job_description',
                                                 'days_since_first_date_found_source_job_title','days_since_first_date_found_source_job_url', 'keyword_id',
                                                 'confidence', 'is_recruiting_agency', 'jobs','jobs_last_180_days', 'jobs_last_30_days','jobs_last_7_days','jobs_source_description',
                                                 'jobs_source_description_last_180_days','jobs_source_description_last_30_days','jobs_source_description_last_7_days', 'jobs_source_title',
                                                 'jobs_source_title_last_180_days', 'jobs_source_title_last_30_days', 'jobs_source_title_last_7_days','jobs_source_url',
                                                 'jobs_source_url_last_180_days', 'jobs_source_url_last_30_days','jobs_source_url_last_7_days', 'technology_rank_source_jobs',
                                                 'technology_rank_180_days_source_jobs','rank_last_date_found_source_job_url', 'rank_1_tie_source_jobs', 'rank_180_days_tie_source_jobs',
                                                 'relative_occurrence_within_category_source_jobs','relative_occurrence_within_category_180_days_source_jobs'])

In [None]:
# Practical setting: compare the models using only the variables that can be
# obtained for the author's own leads (recruiting-agency flag and recent job counts).
comparar_modelos_clasificacion(df,
                               target_col='zl_tech',
                               incluir_columnas=['is_recruiting_agency',
                                                 'jobs_last_180_days', 'jobs_last_30_days', 'jobs_last_7_days',
                                                 'jobs_source_description_last_180_days',
                                                 'jobs_source_description_last_30_days',
                                                 'jobs_source_description_last_7_days',
                                                 'jobs_source_title_last_180_days',
                                                 'jobs_source_title_last_30_days', 'jobs_source_title_last_7_days',
                                                 'jobs_source_url_last_180_days',
                                                 'jobs_source_url_last_30_days',
                                                 'jobs_source_url_last_7_days'])

## Efecto de cada variable

In [None]:
# Academic setting: Random Forest is used because it gave the best preliminary result

target='zl_tech'
incluir_a = ['days_since_first_date_found', 'days_since_last_date_found_source_job_url',
             'days_since_last_date_found_source_job_title', 'days_since_last_date_found_source_job_description',
             'days_since_last_date_any','days_since_first_date_found_source_job_description',
             'days_since_first_date_found_source_job_title','days_since_first_date_found_source_job_url', 'keyword_id',
             'confidence', 'is_recruiting_agency', 'jobs','jobs_last_180_days', 'jobs_last_30_days',
             'jobs_last_7_days','jobs_source_description', 'jobs_source_description_last_180_days',
             'jobs_source_description_last_30_days','jobs_source_description_last_7_days', 'jobs_source_title',
             'jobs_source_title_last_180_days', 'jobs_source_title_last_30_days', 'jobs_source_title_last_7_days',
             'jobs_source_url', 'jobs_source_url_last_180_days', 'jobs_source_url_last_30_days',
             'jobs_source_url_last_7_days', 'technology_rank_source_jobs','technology_rank_180_days_source_jobs',
             'rank_last_date_found_source_job_url', 'rank_1_tie_source_jobs', 'rank_180_days_tie_source_jobs',
             'relative_occurrence_within_category_source_jobs',
             'relative_occurrence_within_category_180_days_source_jobs']

X_a = df[incluir_a]
y_a = df[target].astype(int)

# One-hot encoding
X_encoded_a = pd.get_dummies(X_a, drop_first=True)

# Train/test split, stratified on the target.
# NOTE(review): zl_tech is computed per company (groupby('company_id') max) but
# the split is per row, so rows of one company can fall in both train and test.
# Consider a group-aware split if that leakage matters.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_encoded_a, y_a, test_size=0.2, random_state=42, stratify=y_a)

# RandomForest model
rf_a = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42)
rf_a.fit(X_train_a, y_train_a)

# Predicted probability of the positive class
y_pred_proba_a = rf_a.predict_proba(X_test_a)[:,1]

# Manually chosen decision threshold
threshold = 0.3
y_pred_adj_a = (y_pred_proba_a >= threshold).astype(int)

# Metrics at that threshold
print("\nMétricas con threshold ajustado:")
print(f"Accuracy : {accuracy_score(y_test_a, y_pred_adj_a):.4f}")
print(f"Precision: {precision_score(y_test_a, y_pred_adj_a):.4f}")
print(f"Recall   : {recall_score(y_test_a, y_pred_adj_a):.4f}")
print(f"F1-score : {f1_score(y_test_a, y_pred_adj_a):.4f}")

# ROC curve to inspect the trade-off
fpr, tpr, thresholds = roc_curve(y_test_a, y_pred_proba_a)

plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label="RandomForest")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("Curva ROC - RandomForest")
plt.legend()
plt.show()

# F1-optimal threshold among the ROC thresholds.
# NOTE(review): it is selected on the test set itself, so the resulting F1 is optimistic.
f1_scores = []
for t in thresholds:
    y_temp_a = (y_pred_proba_a >= t).astype(int)
    f1_scores.append(f1_score(y_test_a, y_temp_a))
opt_threshold_a = thresholds[np.argmax(f1_scores)]
print(f"\n Threshold óptimo (según F1): {opt_threshold_a:.3f}")

# Feature importances
importances_a = rf_a.feature_importances_
features_a = X_encoded_a.columns

feat_importances_a = pd.DataFrame({
    "Variable": features_a,
    "Importancia": importances_a}).sort_values(by="Importancia", ascending=False)

# One slice feeds both the printout and the plot, so they always agree.
top_n = 30
top_features_a = feat_importances_a.head(top_n)

print("\n Variables más influyentes en el modelo:")
print(top_features_a)

# Bar chart. The title reports how many variables are really drawn; it used to
# say "Top 15" while 30 rows were plotted.
plt.figure(figsize=(10,6))
plt.barh(top_features_a["Variable"], top_features_a["Importancia"])
plt.gca().invert_yaxis()
plt.xlabel("Importancia")
plt.title(f"Top {len(top_features_a)} Variables más influyentes - RandomForest")
plt.show()

In [None]:
# Practical setting: Random Forest on the variables available for leads

incluir_p = ['is_recruiting_agency',
             'jobs_last_180_days', 'jobs_last_30_days', 'jobs_last_7_days',
             'jobs_source_description_last_180_days',
             'jobs_source_description_last_30_days',
             'jobs_source_description_last_7_days',
             'jobs_source_title_last_180_days',
             'jobs_source_title_last_30_days', 'jobs_source_title_last_7_days',
             'jobs_source_url_last_180_days',
             'jobs_source_url_last_30_days',
             'jobs_source_url_last_7_days']

X_p = df[incluir_p]
y_p = df[target].astype(int)

# One-hot encoding
X_encoded_p = pd.get_dummies(X_p, drop_first=True)

# Train/test split, stratified on the target.
# NOTE(review): zl_tech is computed per company (groupby('company_id') max) but
# the split is per row, so rows of one company can fall in both train and test.
# Consider a group-aware split if that leakage matters.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_encoded_p, y_p, test_size=0.2, random_state=42, stratify=y_p)

# RandomForest model
rf_p = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42)
rf_p.fit(X_train_p, y_train_p)

# Predicted probability of the positive class
y_pred_proba_p = rf_p.predict_proba(X_test_p)[:,1]

# Manually chosen decision threshold
threshold = 0.3
y_pred_adj_p = (y_pred_proba_p >= threshold).astype(int)

# Metrics at that threshold
print("\nMétricas con threshold ajustado:")
print(f"Accuracy : {accuracy_score(y_test_p, y_pred_adj_p):.4f}")
print(f"Precision: {precision_score(y_test_p, y_pred_adj_p):.4f}")
print(f"Recall   : {recall_score(y_test_p, y_pred_adj_p):.4f}")
print(f"F1-score : {f1_score(y_test_p, y_pred_adj_p):.4f}")

# ROC curve to inspect the trade-off
fpr, tpr, thresholds = roc_curve(y_test_p, y_pred_proba_p)

plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label="RandomForest")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("Curva ROC - RandomForest")
plt.legend()
plt.show()

# F1-optimal threshold among the ROC thresholds.
# NOTE(review): it is selected on the test set itself, so the resulting F1 is optimistic.
f1_scores = []
for t in thresholds:
    y_temp_p = (y_pred_proba_p >= t).astype(int)
    f1_scores.append(f1_score(y_test_p, y_temp_p))
opt_threshold_p = thresholds[np.argmax(f1_scores)]
print(f"\n Threshold óptimo (según F1): {opt_threshold_p:.3f}")

# Feature importances
importances_p = rf_p.feature_importances_
features_p = X_encoded_p.columns

feat_importances_p = pd.DataFrame({
    "Variable": features_p,
    "Importancia": importances_p}).sort_values(by="Importancia", ascending=False)

# One slice feeds both the printout and the plot, so they always agree.
top_n = 30
top_features_p = feat_importances_p.head(top_n)

print("\n Variables más influyentes en el modelo:")
print(top_features_p)

# Bar chart. The title reports how many variables are really drawn; it used to
# say "Top 15" regardless of the number of rows plotted.
plt.figure(figsize=(10,6))
plt.barh(top_features_p["Variable"], top_features_p["Importancia"])
plt.gca().invert_yaxis()
plt.xlabel("Importancia")
plt.title(f"Top {len(top_features_p)} Variables más influyentes - RandomForest")
plt.show()

## Optimizar modelo seleccionando variables

In [None]:
# Academic model refit on a reduced, hand-picked feature list.
# This cell rebinds incluir_a, X_a, y_a, rf_a and the other *_a names of the
# earlier academic cell; `target` must already be defined.
incluir_a = ['days_since_first_date_found',
             'days_since_first_date_found_source_job_description',
             'keyword_id',
             'days_since_last_date_any',
             'days_since_last_date_found_source_job_description',
             'jobs_source_description',
             'relative_occurrence_within_category_source_jobs',
             'jobs',
             'jobs_last_180_days']

X_a = df[incluir_a]
y_a = df[target].astype(int)

# One-hot encoding
X_encoded_a = pd.get_dummies(X_a, drop_first=True)

# Train/test split (80/20, stratified on the target)
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_encoded_a, y_a, test_size=0.2, random_state=42, stratify=y_a)

# RandomForest model
rf_a = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42)
rf_a.fit(X_train_a, y_train_a)

# Predicted probability of the positive class
y_pred_proba_a = rf_a.predict_proba(X_test_a)[:,1]

# Manually chosen decision threshold
threshold = 0.3
y_pred_adj_a = (y_pred_proba_a >= threshold).astype(int)

# Metrics at that threshold
print("\nMétricas con threshold ajustado:")
print(f"Accuracy : {accuracy_score(y_test_a, y_pred_adj_a):.4f}")
print(f"Precision: {precision_score(y_test_a, y_pred_adj_a):.4f}")
print(f"Recall   : {recall_score(y_test_a, y_pred_adj_a):.4f}")
print(f"F1-score : {f1_score(y_test_a, y_pred_adj_a):.4f}")

# ROC curve to inspect the trade-off
fpr, tpr, thresholds = roc_curve(y_test_a, y_pred_proba_a)

plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label="RandomForest")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("Curva ROC - RandomForest")
plt.legend()
plt.show()

# F1-optimal threshold: evaluate F1 at every ROC threshold and keep the best.
# NOTE(review): the search runs on the test set, so the chosen value is optimistic.
f1_scores = []
for t in thresholds:
    y_temp_a = (y_pred_proba_a >= t).astype(int)
    f1_scores.append(f1_score(y_test_a, y_temp_a))
opt_threshold_a = thresholds[np.argmax(f1_scores)]
print(f"\n Threshold óptimo (según F1): {opt_threshold_a:.3f}")

In [None]:
# Optimised practical model: reduced feature list, plus the pipeline that is deployed
incluir_p = ['jobs_source_description_last_180_days',
             'jobs_last_180_days',
             'jobs_last_30_days',
             'jobs_source_description_last_30_days',
             'jobs_source_description_last_7_days',
             'jobs_last_7_days',
             'jobs_source_title_last_180_days']

X_p = df[incluir_p]
y_p = df[target].astype(int)

# One-hot encoding
X_encoded_p = pd.get_dummies(X_p, drop_first=True)

# Train/test split (80/20, stratified on the target)
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_encoded_p, y_p, test_size=0.2, random_state=42, stratify=y_p)

# Hyperparameters shared by the evaluation model and the deployment pipeline
rf_params_p = dict(n_estimators=300, max_depth=None, random_state=42)

# Evaluation model, trained on the training split only
rf_p = RandomForestClassifier(**rf_params_p)
rf_p.fit(X_train_p, y_train_p)

# Predicted probability of the positive class
y_pred_proba_p = rf_p.predict_proba(X_test_p)[:,1]

# Manually chosen decision threshold
threshold = 0.3
y_pred_adj_p = (y_pred_proba_p >= threshold).astype(int)

# Metrics at that threshold
print("\nMétricas con threshold ajustado:")
print(f"Accuracy : {accuracy_score(y_test_p, y_pred_adj_p):.4f}")
print(f"Precision: {precision_score(y_test_p, y_pred_adj_p):.4f}")
print(f"Recall   : {recall_score(y_test_p, y_pred_adj_p):.4f}")
print(f"F1-score : {f1_score(y_test_p, y_pred_adj_p):.4f}")

# ROC curve to inspect the trade-off
fpr, tpr, thresholds = roc_curve(y_test_p, y_pred_proba_p)

plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label="RandomForest")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("Curva ROC - RandomForest")
plt.legend()
plt.show()

# F1-optimal threshold among the ROC thresholds.
# NOTE(review): it is selected on the test set itself, so the resulting F1 is optimistic.
f1_scores = []
for t in thresholds:
    y_temp_p = (y_pred_proba_p >= t).astype(int)
    f1_scores.append(f1_score(y_test_p, y_temp_p))
opt_threshold_p = thresholds[np.argmax(f1_scores)]
print(f"\n Threshold óptimo (según F1): {opt_threshold_p:.3f}")

# Preprocessing for the deployment pipeline: scale numeric columns and one-hot
# encode object columns, both taken from the raw (non-dummified) features
preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), X_p.select_dtypes(include=np.number).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), X_p.select_dtypes(include='object').columns)])

# The pipeline gets its OWN, freshly built classifier. It used to be assembled
# from `rf_p` before this cell defined it, so it reused (and refitted in place)
# the estimator left over from an earlier cell, or failed with NameError on a
# clean kernel.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(**rf_params_p))])

# Train the deployment pipeline on all the data.
# NOTE(review): pipeline.predict applies the default decision rule, not the
# 0.3 / F1-optimal thresholds explored above.
pipeline.fit(X_p, y_p)

# Productivización del modelo (práctico)

In [None]:
# Register the ngrok auth token from the environment instead of hard-coding it.
# SECURITY: a real token used to be embedded in this cell. Treat it as leaked
# and revoke/rotate it in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_authtoken:
    raise RuntimeError(
        "Set the NGROK_AUTHTOKEN environment variable before opening the tunnel.")
ngrok.set_auth_token(ngrok_authtoken)

# Flask application that will serve the prediction endpoint
app = Flask(__name__)

In [None]:
# Save the fitted pipeline to 'modelo_practico_optimizado.pkl' in the working directory
joblib.dump(pipeline, 'modelo_practico_optimizado.pkl')

In [None]:
# Load the trained pipeline.
# NOTE: joblib.load unpickles the file, so only load artefacts produced by this notebook.

try:
    pipeline = joblib.load('modelo_practico_optimizado.pkl')
except FileNotFoundError:
    print("Error: modelo_practico_optimizado.pkl not found.")
    pipeline = None

# Prediction route (registered only when a model could be loaded)
if pipeline is not None:
    @app.route('/predigo', methods=['POST'])
    def predigo():
        """Predict the class for one JSON object of feature values.

        Expects a JSON object mapping feature names to values. Responds with
        ``{"prediction": <int>}``, or with HTTP 400 and an ``error`` message
        when the body is not a JSON object, a feature is missing, or the
        model rejects the values.
        """
        # silent=True turns unparseable bodies into None instead of an exception.
        data = request.get_json(force=True, silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Request body must be a JSON object.'}), 400

        # When the pipeline recorded its training columns, require them all and
        # drop any other key sent by the client.
        expected = getattr(pipeline, 'feature_names_in_', None)
        if expected is not None:
            missing = [str(col) for col in expected if col not in data]
            if missing:
                return jsonify({'error': 'Missing features.', 'missing': missing}), 400
            data = {str(col): data[col] for col in expected}

        df_predict = pd.DataFrame([data])
        try:
            preds = pipeline.predict(df_predict)
        except (ValueError, TypeError) as exc:
            # Bad values are a client error, not a server failure.
            return jsonify({'error': f'Invalid feature values: {exc}'}), 400
        return jsonify({'prediction': int(preds[0])})

In [None]:
# Open a public ngrok tunnel to local port 5000
public_url = ngrok.connect(5000)
print("La API está disponible en:", public_url)

# Start the Flask server on the same port (this call blocks the cell while it runs)
app.run(port=5000)