## 1. Connexion et Chargement des données

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import joblib
import os

In [None]:
# Connection settings. The URL can be overridden with the TAXI_DB_URL
# environment variable so credentials do not have to live in the notebook.
# The fallback is the previous hard-coded URL: Postgres on localhost:5432,
# user/database 'airflow' (per the original note, this matches the
# docker-compose setup and the notebook is launched from Windows).
db_url = os.environ.get(
    'TAXI_DB_URL',
    'postgresql://airflow:airflow@localhost:5432/airflow',
)
engine = create_engine(db_url)

print("Chargement d'un échantillon de données depuis Postgres...")

# Draw 300,000 random rows for training.
# NOTE(review): ORDER BY RANDOM() presumably makes Postgres order the whole
# table before LIMIT is applied; if loading is slow, consider TABLESAMPLE.
query = """
SELECT trip_distance, pickup_hour, day_of_week, month, duration_minutes
FROM silver_taxi_trips
ORDER BY RANDOM()
LIMIT 300000
"""

df = pd.read_sql(query, engine)
print(f"Données chargées : {df.shape[0]} lignes.")

## 2. Préparation et Modeling (Machine Learning)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# 1. Select the model inputs (X) and the value to predict (y)
feature_columns = ['trip_distance', 'pickup_hour', 'day_of_week', 'month']
X = df[feature_columns]
y = df['duration_minutes']

In [None]:
# 2. Hold out 20% of the rows for testing; the remaining 80% is used for
#    training. random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 3. Train the model
# The estimator is built first so the progress message reports the depth
# actually configured on it rather than a separately hard-coded number.
model = RandomForestRegressor(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
print(f"Entraînement du RandomForest (max_depth={model.max_depth})...")
model.fit(X_train, y_train)

In [None]:
# 4. Predict the target for the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
# 5. Evaluation: compare the test-set predictions with the true values
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Plain string here: there is nothing to interpolate, so no f-prefix
print("\n--- RÉSULTATS DU MODÈLE ---")
print(f"Score R² : {r2:.4f} (Plus c'est proche de 1, mieux c'est)")
print(f"MAE : {mae:.2f} minutes d'erreur moyenne")

## 4. Visualisation des performances

In [None]:
# One figure with two side-by-side panels. Explicit Axes objects are used so
# every call targets its own panel instead of pyplot's implicit current axes.
fig, (ax_scatter, ax_importance) = plt.subplots(1, 2, figsize=(12, 5))

# Panel 1: actual vs predicted values
ax_scatter.scatter(y_test, y_pred, alpha=0.1, color='blue')
# Dashed red y = x reference line spanning the range of the actual values
low, high = y_test.min(), y_test.max()
ax_scatter.plot([low, high], [low, high], 'r--', lw=2)
ax_scatter.set_title("Réel vs Prédictions")
ax_scatter.set_xlabel("Durée Réelle (min)")
ax_scatter.set_ylabel("Durée Prédite (min)")

# Panel 2: feature importances, sorted ascending so the largest bar is on top
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values()
importances.plot(kind='barh', color='green', ax=ax_importance)
ax_importance.set_title("Importance des variables")

fig.tight_layout()
plt.show()

## 5. Sauvegarde du modèle

In [None]:
# Destination of the serialized model (relative to the current working directory)
model_path = '../models/model_eta.pkl'

# Create the target directory if it is missing. exist_ok=True replaces the
# separate exists() check, so there is no gap between checking and creating.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(model, model_path)
print(f"💾 Modèle sauvegardé avec succès dans : {model_path}")