In [36]:
import pandas as pd
import numpy as np
import pickle
import os

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [38]:
# Simulation settings shared by the cells below.
np.random.seed(42)  # seed the legacy global NumPy RNG so the synthetic tables are repeatable
end_date = pd.to_datetime("2025-10-31")  # reference date from which every "days since ..." value is measured
N = 1_000  # number of simulated users

In [39]:
# Synthetic USERS table: one row per user.
# The three draws below all consume the global NumPy RNG, so they are kept in
# this exact order (membership, biometrics, account age).
membership_draw = np.random.choice([1, 2, 3], N, p=[0.6, 0.3, 0.1])
biometrics_draw = np.random.choice([True, False], N, p=[0.3, 0.7])
account_age_days = np.random.randint(30, 730, N)  # account age: 30 to 729 days before end_date

users_data = pd.DataFrame({
    'user_id': np.arange(1, N + 1),
    'membership_type_id': membership_draw,
    'has_biometrics': biometrics_draw,
    'created_at': pd.to_datetime(end_date - pd.to_timedelta(account_age_days, unit='D')),
    'active': True,
})

In [40]:
# Synthetic LOGINS table: a single last-login timestamp per user.
# Roughly 15% of users get a stale login (90-179 days ago); the rest logged in
# 1-44 days ago. The draws are made in this order: mask, stale offsets, recent offsets.
stale_login_mask = np.random.rand(N) < 0.15
stale_offset_days = np.random.randint(90, 180, N)
recent_offset_days = np.random.randint(1, 45, N)
days_since_login = np.where(stale_login_mask, stale_offset_days, recent_offset_days)

logins_data = pd.DataFrame({
    'user_id': users_data['user_id'],
    'last_login_at': pd.to_datetime(end_date - pd.to_timedelta(days_since_login, unit='D')),
})

In [41]:
num_sessions = 50000

# Each session is assigned to a user with probability proportional to the
# user's membership_type_id, so higher membership ids receive more sessions.
membership_weights = users_data['membership_type_id'] / users_data['membership_type_id'].sum()

# Draw order is user, start offset, distance (all from the global NumPy RNG).
session_user_ids = np.random.choice(users_data['user_id'], num_sessions, p=membership_weights)
session_age_days = np.random.randint(1, 90, num_sessions)  # sessions start 1-89 days before end_date
session_distances = np.random.randint(1000, 15000, num_sessions)

# Synthetic RUNNING_SESSIONS table: one row per session.
sessions_data = pd.DataFrame({
    'running_session_id': np.arange(1, num_sessions + 1),
    'user_id': session_user_ids,
    'started_at': pd.to_datetime(end_date - pd.to_timedelta(session_age_days, unit='D')),
    'distance_meters': session_distances,
})

In [42]:
# Print the first two rows of each simulated table under its own heading.
print("--- Simulated Raw Data (Sample From Each Table) ---")
for table_heading, table_frame in (
    ("USERS:", users_data),
    ("\nLOGINS:", logins_data),
    ("\nRUNNING_SESSIONS:", sessions_data),
):
    print(table_heading)
    print(table_frame.head(2))

--- Simulated Raw Data (Sample From Each Table) ---
USERS:
   user_id  membership_type_id  has_biometrics created_at  active
0        1                   1            True 2025-02-24    True
1        2                   3           False 2025-04-22    True

LOGINS:
   user_id last_login_at
0        1    2025-10-12
1        2    2025-10-13

RUNNING_SESSIONS:
   running_session_id  user_id started_at  distance_meters
0                   1        9 2025-09-08             3001
1                   2      381 2025-10-12            13918


In [None]:

# --- Feature engineering ---------------------------------------------------
# Start from the user attributes and attach each user's last-login timestamp.
df_features_en = users_data[['user_id', 'membership_type_id', 'has_biometrics', 'created_at']].copy()

df_features_en = df_features_en.merge(logins_data[['user_id', 'last_login_at']], on='user_id', how='left')
df_features_en['days_since_last_login'] = (end_date - df_features_en['last_login_at']).dt.days

# Per-user running activity. The built-in 'count' / 'sum' / 'max' reducers are
# used instead of a per-group Python lambda; metres are converted to kilometres
# once, on the aggregated frame.
# No date filter is applied here: the "last_90_days" names rely on sessions_data
# only holding sessions that started within the 89 days before end_date.
sessions_agg_en = sessions_data.groupby('user_id').agg(
    runs_last_90_days=('running_session_id', 'count'),
    distance_last_90_days_km=('distance_meters', 'sum'),
    last_run_date=('started_at', 'max')
).reset_index()
sessions_agg_en['distance_last_90_days_km'] = sessions_agg_en['distance_last_90_days_km'] / 1000

df_features_en = df_features_en.merge(sessions_agg_en, on='user_id', how='left').fillna({
    'runs_last_90_days': 0,
    'distance_last_90_days_km': 0,
    'last_run_date': end_date - pd.Timedelta(days=365)  # old placeholder date for users with no sessions
})

df_features_en['days_since_last_run'] = (end_date - df_features_en['last_run_date']).dt.days

df_features_en['days_on_platform'] = (end_date - df_features_en['created_at']).dt.days

# Keep only the identifier and the model-ready feature columns.
df_final_churn_en = df_features_en[[
    'user_id',
    'membership_type_id',
    'has_biometrics',
    'days_on_platform',
    'days_since_last_login',
    'days_since_last_run',
    'runs_last_90_days',
    'distance_last_90_days_km'
]].copy()


# --- Simulated churn label ---------------------------------------------------
# High risk: no login and no run for more than 60 days, and fewer than 5 runs.
risco_alto = (df_final_churn_en['days_since_last_login'] > 60) & (df_final_churn_en['days_since_last_run'] > 60) & (df_final_churn_en['runs_last_90_days'] < 5)
# Medium risk: no run for more than 30 days, fewer than 10 runs, membership type 1.
risco_medio = (df_final_churn_en['days_since_last_run'] > 30) & (df_final_churn_en['runs_last_90_days'] < 10) & (df_final_churn_en['membership_type_id'] == 1)

# Churn probability: 0.7 when high risk, plus 0.3 when medium risk. It is kept as
# a local array so no helper column has to be added to the frame and dropped again.
prob_churn = np.where(risco_alto, 0.7, 0)
prob_churn = np.where(risco_medio, prob_churn + 0.3, prob_churn)

# A dedicated seeded generator makes this cell idempotent: drawing from the
# global np.random stream produced a different label on every re-run of the cell.
churn_rng = np.random.default_rng(42)
df_final_churn_en['churn_target'] = (churn_rng.random(len(df_final_churn_en)) < prob_churn).astype(int)

print("\n--- Final CHURN DataFrame (English Columns) ---")
print(df_final_churn_en.head())
print(f"\nTotal Churn Rate (Simulated): {df_final_churn_en['churn_target'].mean() * 100:.2f}%")


--- Final CHURN DataFrame (English Columns) ---
   user_id  membership_type_id  has_biometrics  days_on_platform  \
0        1                   1            True               249   
1        2                   3           False               192   
2        3                   2           False               249   
3        4                   1           False               299   
4        5                   1           False               661   

   days_since_last_login  days_since_last_run  runs_last_90_days  \
0                     19                    5                 33   
1                     18                    1                104   
2                     12                    1                 52   
3                     28                    2                 40   
4                     28                    3                 29   

   distance_last_90_days_km  churn_target  
0                   258.184             0  
1                   846.849             0  
2

In [44]:
# Rich preview of the first five rows of the final churn table.
df_final_churn_en.head(5)

Unnamed: 0,user_id,membership_type_id,has_biometrics,days_on_platform,days_since_last_login,days_since_last_run,runs_last_90_days,distance_last_90_days_km,churn_target
0,1,1,True,249,19,5,33,258.184,0
1,2,3,False,192,18,1,104,846.849,0
2,3,2,False,249,12,1,52,439.769,0
3,4,1,False,299,28,2,40,343.724,0
4,5,1,False,661,28,3,29,196.497,0


In [None]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Simulation settings for the end-to-end run in this cell.
np.random.seed(42)  # seed the legacy global NumPy RNG used by every draw below
end_date = pd.to_datetime("2025-10-31")  # observation date
N = 1_000  # number of simulated users

print(f"Simulando dados para {N} usuários até {end_date.date()}...")

# Synthetic USERS table: one row per user.
# The draws are made in this exact order (membership, biometrics, account age)
# because each one advances the global NumPy RNG.
membership_draw = np.random.choice([1, 2, 3], N, p=[0.6, 0.3, 0.1])
biometrics_draw = np.random.choice([True, False], N, p=[0.3, 0.7])
account_age_days = np.random.randint(30, 730, N)  # account age: 30 to 729 days before end_date

users_data = pd.DataFrame({
    'user_id': np.arange(1, N + 1),
    'membership_type_id': membership_draw,
    'has_biometrics': biometrics_draw,
    'created_at': pd.to_datetime(end_date - pd.to_timedelta(account_age_days, unit='D')),
    'active': True,
})

# Synthetic LOGINS table: a single last-login timestamp per user.
# Roughly 15% of users get a stale login (90-179 days ago); the rest logged in
# 1-44 days ago. The draws are made in this order: mask, stale offsets, recent offsets.
stale_login_mask = np.random.rand(N) < 0.15
stale_offset_days = np.random.randint(90, 180, N)
recent_offset_days = np.random.randint(1, 45, N)
days_since_login = np.where(stale_login_mask, stale_offset_days, recent_offset_days)

logins_data = pd.DataFrame({
    'user_id': users_data['user_id'],
    'last_login_at': pd.to_datetime(end_date - pd.to_timedelta(days_since_login, unit='D')),
})

num_sessions = 50000

# Each session is assigned to a user with probability proportional to the
# user's membership_type_id, so higher membership ids receive more sessions.
membership_weights = users_data['membership_type_id'] / users_data['membership_type_id'].sum()

# Draw order is user, start offset, distance (all from the global NumPy RNG).
session_user_ids = np.random.choice(users_data['user_id'], num_sessions, p=membership_weights)
session_age_days = np.random.randint(1, 90, num_sessions)  # sessions start 1-89 days before end_date
session_distances = np.random.randint(1000, 15000, num_sessions)

# Synthetic RUNNING_SESSIONS table: one row per session.
sessions_data = pd.DataFrame({
    'running_session_id': np.arange(1, num_sessions + 1),
    'user_id': session_user_ids,
    'started_at': pd.to_datetime(end_date - pd.to_timedelta(session_age_days, unit='D')),
    'distance_meters': session_distances,
})


# --- Feature engineering ---------------------------------------------------
# Start from the user attributes and attach each user's last-login timestamp.
df_features_en = users_data[['user_id', 'membership_type_id', 'has_biometrics', 'created_at']].copy()

df_features_en = df_features_en.merge(logins_data[['user_id', 'last_login_at']], on='user_id', how='left')
df_features_en['days_since_last_login'] = (end_date - df_features_en['last_login_at']).dt.days

# Per-user running activity. The built-in 'count' / 'sum' / 'max' reducers are
# used instead of a per-group Python lambda; metres are converted to kilometres
# once, on the aggregated frame.
# No date filter is applied here: the "last_90_days" names rely on sessions_data
# only holding sessions that started within the 89 days before end_date.
sessions_agg_en = sessions_data.groupby('user_id').agg(
    runs_last_90_days=('running_session_id', 'count'),
    distance_last_90_days_km=('distance_meters', 'sum'),
    last_run_date=('started_at', 'max')
).reset_index()
sessions_agg_en['distance_last_90_days_km'] = sessions_agg_en['distance_last_90_days_km'] / 1000

# Users without any session get zero activity and a placeholder last-run date
# one year before end_date.
df_features_en = df_features_en.merge(sessions_agg_en, on='user_id', how='left').fillna({
    'runs_last_90_days': 0,
    'distance_last_90_days_km': 0,
    'last_run_date': end_date - pd.Timedelta(days=365)
})

df_features_en['days_since_last_run'] = (end_date - df_features_en['last_run_date']).dt.days

df_features_en['days_on_platform'] = (end_date - df_features_en['created_at']).dt.days

# Keep only the identifier and the model-ready feature columns.
df_final_churn_en = df_features_en[[
    'user_id',
    'membership_type_id',
    'has_biometrics',
    'days_on_platform',
    'days_since_last_login',
    'days_since_last_run',
    'runs_last_90_days',
    'distance_last_90_days_km'
]].copy()


print("\n--- Gerando Alvo de Churn (Determinístico) ---")

# Primary label: churned when the last login is more than 60 days old AND the
# last run is more than 90 days old.
# NOTE(review): simulated sessions start 1-89 days before end_date, so the second
# condition can only hold for users with no session at all (filled with 365 days).
# The recorded output of this cell shows the fallback below being taken.
login_inactive = df_final_churn_en['days_since_last_login'].gt(60)
running_inactive = df_final_churn_en['days_since_last_run'].gt(90)
deterministic_churn_rule = login_inactive & running_inactive

df_final_churn_en['churn_target'] = deterministic_churn_rule.astype(int)

# Fallback: if the primary rule yields a single class, label as churned the users
# whose days_since_last_run is at or above the 90th percentile.
# NOTE(review): either way the label is a pure function of columns that remain
# model inputs (only user_id and churn_target are dropped before training), so
# downstream metrics mostly measure how well the model recovers this rule.
label_has_single_class = df_final_churn_en['churn_target'].nunique() < 2
if label_has_single_class:
    print("Regra determinística não foi suficiente, aplicando regra de backup...")
    cutoff = df_final_churn_en['days_since_last_run'].quantile(0.90)
    df_final_churn_en['churn_target'] = df_final_churn_en['days_since_last_run'].ge(cutoff).astype(int)


print(f"Taxa de Churn Simulado: {df_final_churn_en['churn_target'].mean() * 100:.2f}%")
print(f"Total de classes no dataset: {df_final_churn_en['churn_target'].nunique()}")



# Target vector and feature matrix. The identifier and the label are removed
# from the features, and the boolean biometrics flag is cast to 0/1.
y = df_final_churn_en['churn_target']
X = (
    df_final_churn_en
    .drop(columns=['user_id', 'churn_target'])
    .assign(has_biometrics=lambda features: features['has_biometrics'].astype(int))
)

# 80/20 split, stratified on the label, with a fixed random_state.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nConjunto de Treino: {len(X_train)} amostras (Churn: {y_train.mean()*100:.2f}%)")
print(f"Conjunto de Teste: {len(X_test)} amostras (Churn: {y_test.mean()*100:.2f}%)")


# Fit a logistic regression (liblinear solver) on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
model_churn = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

print("\n--- ✅ Modelo de Churn Treinado com Sucesso ---")

# Score the held-out split and report accuracy plus the per-class metrics.
y_pred = model_churn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, zero_division=0)

print(f"\nAcurácia no Conjunto de Teste: {test_accuracy:.4f}")
print("\nRelatório de Classificação (Teste):")
print(test_report)

# Persist the trained model as a pickle under ./models.
filename = 'churn_model.pkl'

# exist_ok=True creates the directory only when it is missing, in a single call,
# replacing the separate exists-then-create check (which can fail if the
# directory appears between the two calls).
os.makedirs('models', exist_ok=True)

with open(os.path.join('models', filename), 'wb') as file:
    pickle.dump(model_churn, file)

print(f"\n✅ Modelo de Churn exportado para: models/{filename}")

Simulando dados para 1000 usuários até 2025-10-31...

--- Gerando Alvo de Churn (Determinístico) ---
Regra determinística não foi suficiente, aplicando regra de backup...
Taxa de Churn Simulado: 13.30%
Total de classes no dataset: 2

Conjunto de Treino: 800 amostras (Churn: 13.25%)
Conjunto de Teste: 200 amostras (Churn: 13.50%)

--- ✅ Modelo de Churn Treinado com Sucesso ---

Acurácia no Conjunto de Teste: 0.9900

Relatório de Classificação (Teste):
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       173
           1       0.96      0.96      0.96        27

    accuracy                           0.99       200
   macro avg       0.98      0.98      0.98       200
weighted avg       0.99      0.99      0.99       200


✅ Modelo de Churn exportado para: models/churn_model.pkl
