In [None]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler


# --- Synthetic data set ------------------------------------------------------
# N users are simulated. NumPy's global RNG is seeded so every draw below is
# reproducible; the order of the random calls therefore matters.
N = 1000
np.random.seed(42)
end_date = pd.to_datetime('2025-10-31')

# User table: sequential ids, a membership tier in {1, 2, 3} drawn with
# probabilities 0.6 / 0.3 / 0.1, and a boolean flag that is True with
# probability 0.3.
user_ids = np.arange(1, N + 1)
membership_tiers = np.random.choice([1, 2, 3], N, p=[0.6, 0.3, 0.1])
biometrics_flags = np.random.choice([True, False], N, p=[0.3, 0.7])
users_data = pd.DataFrame({
    'user_id': user_ids,
    'membership_type_id': membership_tiers,
    'has_biometrics': biometrics_flags,
})

# Session table: every session is assigned to a user with probability
# proportional to that user's membership_type_id, starts between 1 and 89
# days before end_date (randint's upper bound is exclusive) and covers
# 1000..14999 metres.
num_sessions = 50000
tier_column = users_data['membership_type_id']
session_owner_weights = tier_column / tier_column.sum()
session_user_ids = np.random.choice(
    users_data['user_id'], num_sessions, p=session_owner_weights
)
days_before_end = np.random.randint(1, 90, num_sessions)
session_distances = np.random.randint(1000, 15000, num_sessions)
sessions_data = pd.DataFrame({
    'user_id': session_user_ids,
    'started_at': pd.to_datetime(end_date - pd.to_timedelta(days_before_end, unit='D')),
    'distance_meters': session_distances,
})


# Per-user activity features computed from the simulated sessions.
# Only built-in reducers run inside the groupby: the random pace is drawn
# afterwards in a single vectorized call, so the simulated values do not
# depend on how (or how many times) pandas invokes a user callback per group.
sessions_agg_ltv = (
    sessions_data.groupby('user_id')
    .agg(
        runs_last_90_days=('started_at', 'count'),
        distance_last_90_days_km=('distance_meters', 'sum'),
    )
    .reset_index()
)
# The aggregated sum is in metres; convert it to kilometres.
sessions_agg_ltv['distance_last_90_days_km'] = (
    sessions_agg_ltv['distance_last_90_days_km'] / 1000
)

# Simulated average pace - lower is better (faster). The mean drops by 0.01
# per recorded run, starting from 6.5, with a standard deviation of 0.5.
sessions_agg_ltv['avg_pace_last_90_days'] = np.random.normal(
    6.5 - sessions_agg_ltv['runs_last_90_days'].to_numpy() * 0.01,
    0.5,
)

# Left-join onto the user table so users without any session are kept; their
# missing aggregates are filled with explicit "inactive" defaults.
df_features_ltv = users_data.merge(sessions_agg_ltv, on='user_id', how='left').fillna({
    'runs_last_90_days': 0,
    'distance_last_90_days_km': 0,
    'avg_pace_last_90_days': 10.0,  # Slow pace for inactive users
})


# Simulated achievement count: a Poisson draw whose rate grows with the number
# of runs (0.1 per run) and with the membership tier (2 per tier level),
# clipped to the range 0..20.
achievement_rate = (
    (df_features_ltv['membership_type_id'] * 2)
    + (df_features_ltv['runs_last_90_days'] * 0.1)
)
raw_achievements = np.random.poisson(achievement_rate)
df_features_ltv['achievement_count'] = np.clip(raw_achievements, 0, 20)

# Columns carried into the LTV modelling frame; .copy() detaches the result
# from df_features_ltv so later column assignments do not touch the source.
ltv_feature_columns = [
    'user_id',
    'membership_type_id',
    'has_biometrics',
    'runs_last_90_days',
    'distance_last_90_days_km',
    'avg_pace_last_90_days',
    'achievement_count',
]
df_ltv = df_features_ltv.loc[:, ltv_feature_columns].copy()

print("--- Gerando Alvo de LTV (Classificação) ---")

# Simulated future distance: recent distance scaled by 2.5 and by half the
# membership tier, plus 10 per achievement, plus Gaussian noise (sd = 100).
# Negative results are floored at 0.
base_distance = df_ltv['distance_last_90_days_km'] * 2.5
tier_multiplier = df_ltv['membership_type_id'] * 0.5
achievement_bonus = df_ltv['achievement_count'] * 10
ltv_potential = (base_distance * tier_multiplier) + achievement_bonus
ltv_noise = np.random.normal(0, 100, N)
df_ltv['future_distance_6_months_km'] = (ltv_potential + ltv_noise).clip(lower=0)

# Bucket the simulated distance into three ordinal classes using left-closed
# intervals: [0, 100) -> 0, [100, 500) -> 1, [500, inf) -> 2.
bins = [0, 100, 500, np.inf]
labels = [0, 1, 2]
ltv_bucket = pd.cut(
    df_ltv['future_distance_6_months_km'],
    bins=bins,
    labels=labels,
    right=False,
)
df_ltv['ltv_target'] = ltv_bucket.astype(int)

# The continuous value was only needed to derive the class label; remove it
# so it cannot be used as a feature.
df_ltv = df_ltv.drop(columns=['future_distance_6_months_km'])

print("Distribuição das Classes de LTV:")
class_shares = df_ltv['ltv_target'].value_counts(normalize=True).sort_index()
print(class_shares)

# Split the modelling frame into target and features. The identifier and the
# target itself are excluded from the features, and the boolean flag is cast
# to an integer column.
non_feature_columns = ['user_id', 'ltv_target']
y_ltv = df_ltv['ltv_target']
X_ltv = df_ltv.drop(columns=non_feature_columns).astype({'has_biometrics': int})

# One-hot encode the membership tier, keeping one indicator per tier.
X_ltv = pd.get_dummies(X_ltv, columns=['membership_type_id'], drop_first=False)

# 80/20 hold-out split, stratified on the target so both partitions keep the
# class proportions; the fixed random_state makes the split reproducible.
split_parts = train_test_split(
    X_ltv,
    y_ltv,
    test_size=0.2,
    random_state=42,
    stratify=y_ltv,
)
X_train_ltv, X_test_ltv, y_train_ltv, y_test_ltv = split_parts

print(f"\nConjunto de Treino LTV: {len(X_train_ltv)} amostras")
print(f"Conjunto de Teste LTV: {len(X_test_ltv)} amostras")

# Fit a 100-tree random forest on the training partition. random_state fixes
# the forest's randomness; n_jobs=-1 uses all available cores.
model_ltv = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model_ltv.fit(X_train_ltv, y_train_ltv)

print("\n--- ✅ Modelo de LTV Treinado com Sucesso ---")

# Evaluate on the held-out partition.
y_pred_ltv = model_ltv.predict(X_test_ltv)
print(f"\nAcurácia no Conjunto de Teste LTV: {accuracy_score(y_test_ltv, y_pred_ltv):.4f}")
print("\nRelatório de Classificação LTV (Teste):")
# labels is passed explicitly so the three target_names always line up with
# classes 0/1/2, even if a rare class is absent from the test fold or from the
# predictions (without it, a missing class makes the name count mismatch and
# classification_report raises). zero_division=0 reports 0.0 for classes with
# no predicted samples instead of emitting an undefined-metric warning.
print(classification_report(
    y_test_ltv,
    y_pred_ltv,
    labels=[0, 1, 2],
    target_names=['0 (Low)', '1 (Medium)', '2 (High)'],
    zero_division=0,
))

# Persist the trained model under ./models as a pickle file.
filename_ltv = 'ltv_model.pkl'
# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('models', exist_ok=True)

with open(os.path.join('models', filename_ltv), 'wb') as file:
    pickle.dump(model_ltv, file)

print(f"\nModelo de LTV exportado para: models/{filename_ltv}")

--- Gerando Alvo de LTV (Classificação) ---
Distribuição das Classes de LTV:
ltv_target
0    0.003
1    0.481
2    0.516
Name: proportion, dtype: float64

Conjunto de Treino LTV: 800 amostras
Conjunto de Teste LTV: 200 amostras

--- ✅ Modelo de LTV Treinado com Sucesso ---

Acurácia no Conjunto de Teste LTV: 0.8700

Relatório de Classificação LTV (Teste):
              precision    recall  f1-score   support

     0 (Low)       0.00      0.00      0.00         1
  1 (Medium)       0.82      0.93      0.87        96
    2 (High)       0.92      0.83      0.87       103

    accuracy                           0.87       200
   macro avg       0.58      0.58      0.58       200
weighted avg       0.87      0.87      0.87       200


✅ Modelo de LTV exportado para: models/ltv_model.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
