# SmartCare Analytics 

 Extraction & Modèles de Prévision
Extraction du fichier SLP-CHF2012.pdf, construction de la base de données et prévisions

## 1. Import des bibliothèques nécessaires

In [None]:
!pip install pdfplumber tabula-py pandas numpy scikit-learn statsmodels prophet tensorflow matplotlib seaborn openpyxl


In [None]:
# Installation des dépendances (décommenter si nécessaire)

import os
import sys
import re
import csv
import json
import sqlite3
import warnings
from datetime import datetime, timedelta
from pathlib import Path
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# PDF extraction
import pdfplumber
try:
    import tabula
except ImportError:
    tabula = None

# Machine Learning & Time Series
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

try:
    import statsmodels.api as sm
    from statsmodels.tsa.arima.model import ARIMA
except ImportError:
    sm = None
    ARIMA = None

try:
    from prophet import Prophet
except ImportError:
    Prophet = None

try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
except ImportError:
    keras = None

import pickle
import joblib

# Configuration
# NOTE(review): this silences every warning category (including deprecation
# notices from the scientific stack); narrow the filter if warnings matter.
warnings.filterwarnings('ignore')
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    # Matplotlib releases older than 3.6 only ship the un-prefixed style name.
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

# Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# The guarded imports bind these names to None when a package is missing;
# report them so the success banner below is not misleading.
_missing_optional = [
    package
    for package, handle in (
        ('tabula-py', tabula),
        ('statsmodels', ARIMA),
        ('prophet', Prophet),
        ('tensorflow', keras),
    )
    if handle is None
]
if _missing_optional:
    logger.warning("Optional packages not installed: %s", ", ".join(_missing_optional))

print("‚úì Toutes les biblioth√®ques ont √©t√© import√©es avec succ√®s !")


## 2. Configuration et chemins

In [None]:
# Filesystem layout: the input PDF lives under ./fichier, every generated
# artefact goes under ./output (models in ./output/models).
WORKSPACE_DIR = Path.cwd()
FICHIER_DIR = WORKSPACE_DIR / "fichier"
OUTPUT_DIR = WORKSPACE_DIR / "output"
MODELS_DIR = OUTPUT_DIR / "models"
PDF_FILE = FICHIER_DIR / "SLP-CHF2012.pdf"
DB_FILE = OUTPUT_DIR / "smartcare.db"
CSV_FILE = OUTPUT_DIR / "smartcare_data.csv"

# Create the output folders if needed (parent first, then the sub-folder).
OUTPUT_DIR.mkdir(exist_ok=True)
MODELS_DIR.mkdir(exist_ok=True)

# Remember whether the source PDF is available; later cells branch on this flag.
pdf_exists = PDF_FILE.exists()
if pdf_exists:
    logger.info(f"‚úì PDF trouv√© : {PDF_FILE}")
else:
    logger.warning(f"‚úó PDF non trouv√© : {PDF_FILE}")

print(f"R√©pertoire de travail : {WORKSPACE_DIR}")
print(f"Base de donn√©es : {DB_FILE}")
print(f"Fichier PDF : {PDF_FILE} (Existe: {pdf_exists})")


## 3. Charger et inspecter le fichier PDF

In [None]:
def inspect_pdf(pdf_path):
    """Print a short summary of a PDF and return its page count.

    The summary contains the number of pages, the document metadata (when
    present) and, for each of the first three pages, the first 200 characters
    of extracted text and the number of detected tables.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The number of pages, or None when the file could not be inspected
        (the error is logged).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)
            print(f"=== INSPECTION DU PDF ===")
            print(f"Nombre de pages : {page_count}")

            # Metadata
            if pdf.metadata:
                print(f"\nM√©tadonn√©es :")
                for key, value in pdf.metadata.items():
                    print(f"  {key}: {value}")

            # Preview of the first pages
            print(f"\n=== APER√áU DES PREMI√àRES PAGES ===")
            for i in range(min(3, page_count)):
                page = pdf.pages[i]
                print(f"\nPage {i+1}:")
                print(f"  Texte brut (premiers 200 caract√®res):")
                # Extract once: text extraction is the expensive part of the preview.
                raw_text = page.extract_text()
                text = raw_text[:200] if raw_text else "Aucun texte"
                print(f"  {text}...")

                tables = page.extract_tables()
                print(f"  Nombre de tableaux : {len(tables) if tables else 0}")

            # Return only after the preview loop so every previewed page is
            # shown and a PDF without pages still yields 0 instead of None.
            return page_count
    except Exception as e:
        logger.error(f"Erreur lors de l'inspection du PDF : {e}")
        return None

# Inspect the PDF only when it was found on disk; num_pages stays None otherwise.
if pdf_exists:
    num_pages = inspect_pdf(PDF_FILE)
else:
    print("‚ö†Ô∏è PDF non disponible pour l'inspection")
    num_pages = None


## 4. Extraction du texte et des tableaux

In [None]:
def extract_text_and_tables(pdf_path):
    """Collect the text and every table of each page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A pair ``(texts, tables)``. ``texts`` is a list of
        ``{'page': int, 'text': str}`` dicts for the pages that yielded text.
        ``tables`` is a list of DataFrames, one per detected table, each with
        two extra columns: ``page`` and ``table_id`` (``P<page>_T<rank>``).
        Both lists are empty when any error occurs (the error is logged).
    """
    page_texts = []
    table_frames = []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            logger.info(f"Extraction de {len(pdf.pages)} pages...")

            for page_num, page in enumerate(pdf.pages, start=1):
                # Text of the page, skipped when nothing was extracted
                content = page.extract_text()
                if content:
                    page_texts.append({'page': page_num, 'text': content})

                # Tables of the page, numbered from 1 within the page
                for rank, raw_table in enumerate(page.extract_tables() or [], start=1):
                    frame = pd.DataFrame(raw_table)
                    frame['page'] = page_num
                    frame['table_id'] = f"P{page_num}_T{rank}"
                    table_frames.append(frame)

                # Progress message every ten pages
                if page_num % 10 == 0:
                    logger.info(f"  {page_num} pages trait√©es...")

        logger.info(f"‚úì Extraction termin√©e : {len(page_texts)} pages, {len(table_frames)} tableaux")
        return page_texts, table_frames

    except Exception as e:
        logger.error(f"Erreur lors de l'extraction : {e}")
        return [], []

# Run the extraction when the PDF exists; otherwise fall back to empty results
# so the later cells can still test `extracted_text` / `extracted_tables`.
if pdf_exists:
    extracted_text, extracted_tables = extract_text_and_tables(PDF_FILE)
    print(f"\n‚úì Texte extrait : {len(extracted_text)} pages")
    print(f"‚úì Tableaux extraits : {len(extracted_tables)} tableaux")
    
    if extracted_tables:
        print(f"\nAper√ßu du premier tableau :")
        print(extracted_tables[0].head())
else:
    extracted_text = []
    extracted_tables = []
    print("‚ö†Ô∏è Extraction impossible sans le PDF")


## 5. Nettoyage et prétraitement des données

In [None]:
def clean_numeric_value(val):
    """Convert a raw cell value to ``float``, returning NaN when impossible.

    Numbers are passed through unchanged. Strings are stripped of everything
    except digits, ``.``, ``,`` and ``-`` and then parsed, accepting both the
    French (``1.234,56``) and the English (``1,234.56``) conventions:

    * when both separators occur, the right-most one is the decimal mark and
      the other one is a thousands separator;
    * a separator that occurs several times is a thousands separator;
    * a single comma is read as a decimal mark (``"12,5"`` -> ``12.5``).

    Args:
        val: Any scalar (string, number, None, NaN...).

    Returns:
        The parsed float, or ``np.nan`` for missing, empty, non-scalar or
        unparseable input.
    """
    if val is None:
        return np.nan
    # Lists/arrays cannot be a single number (and would break pd.isna below).
    if not pd.api.types.is_scalar(val):
        return np.nan
    if pd.isna(val):
        return np.nan

    # Real numbers need no text clean-up; going through str() would corrupt
    # scientific notation ("1e-05" would lose its exponent marker).
    if isinstance(val, (int, float, np.integer, np.floating)) and not isinstance(val, bool):
        return float(val)

    # Keep only digits, dot, comma and minus sign
    val_str = re.sub(r'[^\d\.,\-]', '', str(val).strip())
    if not val_str:
        return np.nan

    last_comma = val_str.rfind(',')
    last_dot = val_str.rfind('.')
    if last_comma != -1 and last_dot != -1:
        # Both present: the right-most separator is the decimal mark.
        if last_comma > last_dot:
            val_str = val_str.replace('.', '').replace(',', '.')
        else:
            val_str = val_str.replace(',', '')
    elif val_str.count(',') > 1:
        val_str = val_str.replace(',', '')
    elif val_str.count('.') > 1:
        val_str = val_str.replace('.', '')
    else:
        # Single (or no) separator: standardise the comma to a dot
        val_str = val_str.replace(',', '.')

    try:
        return float(val_str)
    except ValueError:
        return np.nan

def clean_dataframe(df):
    """Return a cleaned copy of a DataFrame extracted from the PDF.

    Steps, in order:

    1. drop rows, then columns, that are entirely missing;
    2. convert column labels to stripped strings;
    3. strip whitespace in text columns (missing cells become ``''``);
    4. drop duplicated rows.

    Stripping happens before de-duplication so that rows differing only by
    surrounding whitespace are recognised as duplicates.

    Args:
        df: The raw DataFrame. It is not modified.

    Returns:
        A new, cleaned DataFrame (the original index labels are kept).
    """
    # Explicit copy: the column assignments below must never write into a
    # view of the caller's frame.
    cleaned = df.dropna(how='all').dropna(axis=1, how='all').copy()

    # Normalise the headers
    cleaned.columns = [str(col).strip() for col in cleaned.columns]

    # Normalise the text cells (object and dedicated string dtypes alike)
    for col in cleaned.columns:
        column = cleaned[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            cleaned[col] = column.apply(lambda x: str(x).strip() if pd.notna(x) else '')

    return cleaned.drop_duplicates()

def combine_tables(tables_list):
    """Clean every extracted table and stack them into one DataFrame.

    Args:
        tables_list: List of DataFrames, one per extracted table.

    Returns:
        The row-wise concatenation of the cleaned tables with a fresh index,
        or an empty DataFrame when the list is empty.
    """
    if tables_list:
        # Clean each table individually, then stack them
        merged = pd.concat(
            list(map(clean_dataframe, tables_list)),
            ignore_index=True,
            sort=False,
        )
        logger.info(f"Tableaux combin√©s : {len(merged)} lignes, {len(merged.columns)} colonnes")
        return merged

    return pd.DataFrame()

# Build the combined, cleaned table; df_clean is an empty DataFrame when
# nothing was extracted so that later cells can test `df_clean.empty`.
if extracted_tables:
    df_clean = combine_tables(extracted_tables)
    print(f"‚úì DataFrames combin√©s : {df_clean.shape[0]} lignes √ó {df_clean.shape[1]} colonnes")
    print(f"\nAper√ßu :")
    print(df_clean.head(10))
else:
    df_clean = pd.DataFrame()
    print("‚ö†Ô∏è Aucun tableau √† nettoyer")


## 6. Validation et gestion des valeurs manquantes

In [None]:
def validate_and_fill_missing_values(df):
    """Report missing values, drop sparse columns and impute the rest.

    Columns with more than 80 % missing values are dropped. In the remaining
    columns, numeric gaps are filled with the column median and text gaps with
    the literal ``'Inconnu'``. A per-column report is printed along the way.

    Args:
        df: The DataFrame to validate. It is not modified.

    Returns:
        A new DataFrame without the dropped columns and with gaps filled.
    """
    print("=== VALIDATION ET GESTION DES VALEURS MANQUANTES ===\n")

    # Missing-value report, worst columns first
    missing_counts = df.isnull().sum()
    missing_report = pd.DataFrame({
        'Colonne': df.columns,
        'Manquantes': missing_counts,
        'Pourcentage': (missing_counts / len(df) * 100).round(2)
    }).sort_values('Pourcentage', ascending=False)

    print("Valeurs manquantes par colonne :")
    print(missing_report)

    # Drop the columns with more than 80 % missing values
    cols_to_drop = missing_report[missing_report['Pourcentage'] > 80]['Colonne'].tolist()
    if cols_to_drop:
        print(f"\nColonnes supprim√©es (>80% manquantes) : {cols_to_drop}")
    # drop() always returns a new frame, so the caller's data stays untouched.
    df = df.drop(columns=cols_to_drop)

    # Numeric columns: impute with the median. Assign the result back instead
    # of calling fillna(inplace=True) on df[col], which only updates a
    # temporary object when pandas Copy-on-Write is active.
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # Text columns (object or dedicated string dtype): impute with 'Inconnu'
    for col in df.columns:
        column = df[col]
        is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
        if is_text and column.isnull().any():
            df[col] = column.fillna('Inconnu')

    print(f"\n‚úì Validation compl√©t√©e. Shape: {df.shape}")
    return df

# Validate and impute only when the extraction produced data.
if not df_clean.empty:
    df_clean = validate_and_fill_missing_values(df_clean)
    print("\n‚úì Donn√©es valid√©es et compl√©t√©es")


## 7. Stocker dans une base de données SQLite

In [None]:
def save_to_sqlite(df, db_path, table_name='smartcare_data'):
    """Write a DataFrame to a SQLite table, replacing any existing table.

    After the write, the row count is logged and the table schema printed.

    Args:
        df: The DataFrame to store.
        db_path: Path of the SQLite database file.
        table_name: Name of the destination table.

    Returns:
        True on success, False when any step failed (the error is logged).
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        df.to_sql(table_name, conn, if_exists='replace', index=False)
        conn.commit()

        # Identifiers cannot be bound as SQL parameters, so quote the name
        # (doubling embedded quotes) before interpolating it.
        quoted_name = '"' + str(table_name).replace('"', '""') + '"'

        # Verify the insertion
        cursor = conn.cursor()
        cursor.execute(f"SELECT COUNT(*) FROM {quoted_name}")
        count = cursor.fetchone()[0]

        logger.info(f"‚úì {count} lignes sauvegard√©es dans {table_name}")

        # Show the schema
        cursor.execute(f"PRAGMA table_info({quoted_name})")
        schema = cursor.fetchall()
        print(f"\nSch√©ma de la table '{table_name}' :")
        for col_id, name, type_, notnull, default, pk in schema:
            print(f"  - {name}: {type_}")

        return True
    except Exception as e:
        logger.error(f"Erreur lors de la sauvegarde SQLite : {e}")
        return False
    finally:
        # Close on every path; previously a failure left the connection open.
        if conn is not None:
            conn.close()

def save_to_csv(df, csv_path):
    """Write a DataFrame to a comma-separated file encoded as UTF-8 with BOM.

    Args:
        df: The DataFrame to export (the index is not written).
        csv_path: Destination file path.

    Returns:
        True on success, False when the export failed (the error is logged).
    """
    try:
        df.to_csv(csv_path, sep=',', encoding='utf-8-sig', index=False)
    except Exception as e:
        logger.error(f"Erreur lors de la sauvegarde CSV : {e}")
        return False
    else:
        logger.info(f"‚úì Donn√©es sauvegard√©es en CSV : {csv_path}")
        return True

# Persist the cleaned data to SQLite and CSV when there is something to save.
# NOTE(review): the "files created" message is printed without checking the
# booleans returned by the two save functions.
if not df_clean.empty:
    save_to_sqlite(df_clean, DB_FILE, 'smartcare_data')
    save_to_csv(df_clean, CSV_FILE)
    print(f"\n‚úì Fichiers de sortie cr√©√©s :")
    print(f"  - Base de donn√©es : {DB_FILE}")
    print(f"  - CSV : {CSV_FILE}")
else:
    print("‚ö†Ô∏è Aucune donn√©e √† sauvegarder")


## 8. Analyse exploratoire et visualisations

In [None]:
def exploratory_analysis(df):
    """Print summary statistics and plot histograms of the numeric columns.

    Prints the shape, dtypes and ``describe()`` output of ``df``. When numeric
    columns exist, histograms of the first four are drawn on a 2x2 grid, saved
    as ``exploratory_analysis.png`` in OUTPUT_DIR and shown. For the first
    three text columns, the five most frequent values are printed.

    Args:
        df: The DataFrame to explore.
    """
    print("=== ANALYSE EXPLORATOIRE ===\n")

    # General overview
    print(f"Shape : {df.shape}")
    print(f"\nTypes de donn√©es :\n{df.dtypes}\n")

    print("Statistiques descriptives :")
    print(df.describe())

    # Numeric columns: one histogram per column, at most four
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        print(f"\n\nColonnes num√©riques : {numeric_cols}")

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle('Analyse Exploratoire des Donn√©es', fontsize=16)

        # axes.flat walks the grid row by row, matching (idx // 2, idx % 2)
        for ax, col in zip(axes.flat, numeric_cols[:4]):
            df[col].hist(bins=30, ax=ax, edgecolor='black')
            ax.set_title(f'Distribution : {col}')
            ax.set_ylabel('Fr√©quence')

        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / 'exploratory_analysis.png', dpi=100, bbox_inches='tight')
        plt.show()
        logger.info("‚úì Graphique sauvegard√© : exploratory_analysis.png")

    # Text columns: most frequent values of the first three
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    if categorical_cols:
        print(f"\n\nColonnes cat√©gories : {categorical_cols[:5]}")
        for col in categorical_cols[:3]:
            print(f"\n{col} - Top 5 valeurs :")
            print(df[col].value_counts().head())

# Run the exploratory analysis when cleaned data is available.
if not df_clean.empty:
    exploratory_analysis(df_clean)
else:
    print("‚ö†Ô∏è Pas de donn√©es pour l'analyse")


## 9. Préparation des données pour la modélisation

In [None]:
# Cr√©er des donn√©es synth√©tiques de d√©monstration si le PDF ne contient pas assez de donn√©es
def create_synthetic_healthcare_data(n_samples=1000):
    """Generate a reproducible, random healthcare dataset for demonstration.

    NumPy's global random state is seeded with 42 on every call, so the same
    ``n_samples`` always yields the same table.

    Args:
        n_samples: Number of rows; one row per consecutive day starting on
            2020-01-01.

    Returns:
        A DataFrame with the columns date, patient_id, age,
        frequence_cardiaque, tension_arterielle_sys, tension_arterielle_dia,
        glucose, cholesterol, diagnose and traitement.
    """
    rng = np.random
    rng.seed(42)

    frame = pd.DataFrame({'date': pd.date_range(start='2020-01-01', periods=n_samples, freq='D')})

    # The draws below must stay in this order: each one advances the shared
    # random state, so reordering would change every generated value.
    frame['patient_id'] = rng.randint(1000, 2000, n_samples)
    frame['age'] = rng.randint(18, 85, n_samples)
    frame['frequence_cardiaque'] = rng.normal(70, 10, n_samples)  # bpm
    frame['tension_arterielle_sys'] = rng.normal(120, 15, n_samples)  # mmHg
    frame['tension_arterielle_dia'] = rng.normal(80, 10, n_samples)
    frame['glucose'] = rng.normal(100, 20, n_samples)  # mg/dL
    frame['cholesterol'] = rng.normal(200, 40, n_samples)  # mg/dL
    frame['diagnose'] = rng.choice(['Sain', 'Hypertension', 'Diab√®te', 'CHF', 'Autre'], n_samples)
    frame['traitement'] = rng.choice(['Aucun', 'M√©dicament A', 'M√©dicament B', 'Intervention'], n_samples)

    return frame

# Choose the modelling dataset: the cleaned PDF data when there is any,
# otherwise a synthetic demonstration set of 500 rows.
if df_clean.empty:
    print("‚ö†Ô∏è Utilisation de donn√©es synth√©tiques pour la d√©monstration")
    df_model = create_synthetic_healthcare_data(500)
    print(f"Donn√©es synth√©tiques cr√©√©es : {df_model.shape}")
else:
    df_model = df_clean.copy()

# Keep a CSV copy of the data the models are trained on
if not df_model.empty:
    df_model.to_csv(OUTPUT_DIR / 'smartcare_model_data.csv', index=False)
    logger.info("‚úì Donn√©es de mod√©lisation sauvegard√©es")

print(f"\n‚úì Donn√©es pr√™tes pour la mod√©lisation : {df_model.shape}")
print(df_model.head())


## 10. Modèles de prévision - Régression

In [None]:
def _fit_and_score_regressor(estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit ``estimator`` and return ``(predictions, MAE, RMSE, R2)`` on the evaluation set."""
    estimator.fit(X_fit, y_fit)
    predicted = estimator.predict(X_eval)
    mae = mean_absolute_error(y_eval, predicted)
    rmse = np.sqrt(mean_squared_error(y_eval, predicted))
    r2 = r2_score(y_eval, predicted)
    return predicted, mae, rmse, r2


def train_regression_models(df_model):
    """Train and compare three regressors on the numeric columns of ``df_model``.

    The target is ``frequence_cardiaque`` when that numeric column exists,
    otherwise the first numeric column. All other numeric columns are the
    features; missing values are replaced by 0. The data is split 80/20 with a
    fixed seed. Linear regression is trained on standardised features, the two
    tree ensembles on the raw features. A predicted-vs-actual scatter plot is
    saved as ``regression_models.png`` in OUTPUT_DIR.

    Args:
        df_model: The modelling DataFrame.

    Returns:
        A 4-tuple ``(models, results_df, scaler, feature_cols)``:
        ``models`` maps model names to fitted estimators, ``results_df`` holds
        one row of MAE/RMSE/R2 per model, ``scaler`` is the fitted
        StandardScaler and ``feature_cols`` the list of feature names.
        When there is no numeric column or fewer than two features, the same
        4-tuple shape is returned with empty values
        ``({}, empty DataFrame, None, [])`` so callers can always unpack it.
    """
    print("=== MOD√àLES DE R√âGRESSION ===\n")

    # Same shape as the normal return value, so that
    # `models, results, scaler, cols = train_regression_models(...)` never fails.
    nothing_trained = ({}, pd.DataFrame(), None, [])

    # Pick the target column
    numeric_cols = df_model.select_dtypes(include=[np.number]).columns.tolist()

    if 'frequence_cardiaque' in numeric_cols:
        target_col = 'frequence_cardiaque'
    elif numeric_cols:
        # Fall back to the first numeric column
        target_col = numeric_cols[0]
    else:
        print("‚ö†Ô∏è Aucune colonne num√©rique trouv√©e")
        return nothing_trained

    # Features: every numeric column except the target
    feature_cols = [col for col in numeric_cols if col != target_col]

    if len(feature_cols) < 2:
        print("‚ö†Ô∏è Pas assez de features num√©riques")
        return nothing_trained

    X = df_model[feature_cols].fillna(0)
    y = df_model[target_col].fillna(0)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardise (fitted on the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # (banner, model name, estimator, whether it is trained on scaled features)
    candidates = [
        ("1. R√©gression Lin√©aire...", 'Linear Regression', LinearRegression(), True),
        ("2. Random Forest...", 'Random Forest',
         RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1), False),
        ("3. Gradient Boosting...", 'Gradient Boosting',
         GradientBoostingRegressor(n_estimators=100, random_state=42), False),
    ]

    models = {}
    results = []
    predictions = {}

    for banner, name, estimator, use_scaled in candidates:
        print(banner)
        fit_features = X_train_scaled if use_scaled else X_train
        eval_features = X_test_scaled if use_scaled else X_test
        predicted, mae, rmse, r2 = _fit_and_score_regressor(
            estimator, fit_features, y_train, eval_features, y_test
        )

        models[name] = estimator
        predictions[name] = predicted
        results.append({'Model': name, 'MAE': mae, 'RMSE': rmse, 'R¬≤': r2})
        print(f"   MAE: {mae:.4f}, RMSE: {rmse:.4f}, R¬≤: {r2:.4f}")

    # Summary table
    results_df = pd.DataFrame(results)
    print(f"\n{results_df.to_string(index=False)}")

    # Predicted-vs-actual scatter plot, one panel per model
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    fig.suptitle('Comparaison des Mod√®les de R√©gression', fontsize=14)

    for ax, (name, predicted) in zip(axes, predictions.items()):
        ax.scatter(y_test, predicted, alpha=0.5)
        # Dashed diagonal = perfect prediction
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        ax.set_xlabel('Valeurs R√©elles')
        ax.set_ylabel('Pr√©dictions')
        ax.set_title(name)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'regression_models.png', dpi=100, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

    return models, results_df, scaler, feature_cols

# Train the regression models. The trainer may stop early (no numeric column,
# too few features); accept only a 4-tuple result and otherwise fall back to
# empty values so the unpacking below and the later cells never crash.
_regression_output = train_regression_models(df_model)
if isinstance(_regression_output, tuple) and len(_regression_output) == 4:
    regression_models, regression_results, scaler, feature_cols = _regression_output
else:
    regression_models, regression_results, scaler, feature_cols = {}, pd.DataFrame(), None, []


## 11. Modèles de prévision - Séries Temporelles (ARIMA)

In [None]:
def train_arima_model(df_model):
    """Fit an ARIMA(1, 1, 1) model on the first numeric column of ``df_model``.

    The column is treated as an ordered series: the first 80 % of the
    non-missing values are used for training, the remaining 20 % are forecast
    and compared to the actual values. A plot is saved as
    ``arima_forecast.png`` in OUTPUT_DIR.

    Args:
        df_model: The modelling DataFrame; rows are assumed to be in
            chronological order.

    Returns:
        ``(fitted_results, {'MAE': ..., 'RMSE': ...})`` on success, or
        ``(None, None)`` when statsmodels is missing, there is no numeric
        column, fewer than 50 values are available, or fitting failed
        (the error is logged).
    """
    print("=== MOD√àLE ARIMA - S√âRIES TEMPORELLES ===\n")

    if ARIMA is None:
        print("‚ö†Ô∏è statsmodels non install√©. Installation recommand√©e : pip install statsmodels")
        return None, None

    # Prepare the time series
    numeric_cols = df_model.select_dtypes(include=[np.number]).columns.tolist()

    if not numeric_cols:
        print("‚ö†Ô∏è Aucune colonne num√©rique")
        return None, None

    # Use the first numeric column. Re-number the rows after dropping gaps so
    # the model receives a contiguous 0..n-1 index.
    ts_data = df_model[numeric_cols[0]].dropna().reset_index(drop=True)

    if len(ts_data) < 50:
        print("‚ö†Ô∏è Pas assez de donn√©es pour ARIMA (min 50)")
        return None, None

    print(f"S√©rie temporelle : {len(ts_data)} points de donn√©es")

    try:
        # Chronological train/test split; .iloc makes the slicing explicitly
        # positional rather than relying on how [] treats integer slices.
        train_size = int(len(ts_data) * 0.8)
        train, test = ts_data.iloc[:train_size], ts_data.iloc[train_size:]

        # Fit ARIMA(1,1,1)
        print("Entra√Ænement d'ARIMA(1,1,1)...")
        model_arima = ARIMA(train, order=(1, 1, 1))
        results_arima = model_arima.fit()

        # Forecast as many steps as there are test points
        predictions = results_arima.forecast(steps=len(test))

        mae = mean_absolute_error(test, predictions)
        rmse = np.sqrt(mean_squared_error(test, predictions))

        print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}")

        # Plot training data, actual test values and forecast
        fig, ax = plt.subplots(figsize=(12, 5))
        ax.plot(range(len(train)), train, label='Entra√Ænement', color='blue')
        ax.plot(range(len(train), len(train) + len(test)), test, label='Test R√©el', color='green')
        ax.plot(range(len(train), len(train) + len(predictions)), predictions, label='Pr√©dictions ARIMA', color='red', linestyle='--')
        ax.set_title('Mod√®le ARIMA - Pr√©dictions vs R√©alit√©')
        ax.set_xlabel('Temps')
        ax.set_ylabel('Valeur')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / 'arima_forecast.png', dpi=100, bbox_inches='tight')
        plt.show()
        # Release the figure so repeated runs do not accumulate open figures.
        plt.close(fig)

        return results_arima, {'MAE': mae, 'RMSE': rmse}

    except Exception as e:
        logger.error(f"Erreur ARIMA : {e}")
        return None, None

# Train ARIMA; arima_results is None when training was skipped or failed.
arima_model, arima_results = train_arima_model(df_model)
if arima_results:
    print(f"‚úì ARIMA mod√®le cr√©√© avec succ√®s")


## 12. Modèles de prévision - Prophet

In [None]:
def train_prophet_model(df_model):
    """Fit a Prophet model on the first numeric column and forecast 30 periods.

    The ``date`` column of ``df_model`` provides the timestamps when present;
    otherwise consecutive days starting on 2020-01-01 are generated. The
    forecast and its components are plotted and saved as
    ``prophet_forecast.png`` and ``prophet_components.png`` in OUTPUT_DIR.

    Args:
        df_model: The modelling DataFrame.

    Returns:
        ``(fitted_model, forecast_dataframe)`` on success, or ``(None, None)``
        when Prophet is missing, there is no numeric column, fewer than 50
        complete rows are available, or fitting failed (the error is logged).
    """
    print("=== MOD√àLE PROPHET ===\n")

    if Prophet is None:
        print("‚ö†Ô∏è Prophet non install√©. Installation recommand√©e : pip install prophet")
        return None, None

    numeric_cols = df_model.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        print("‚ö†Ô∏è Aucune colonne num√©rique")
        return None, None

    # Timestamps: real dates when available, synthetic daily dates otherwise
    if 'date' in df_model.columns:
        timestamps = df_model['date']
    else:
        timestamps = pd.date_range(start='2020-01-01', periods=len(df_model), freq='D')

    # Prophet expects exactly two columns named 'ds' (time) and 'y' (value)
    df_prophet = pd.DataFrame({
        'ds': timestamps,
        'y': df_model[numeric_cols[0]].values,
    }).dropna()

    if len(df_prophet) < 50:
        print("‚ö†Ô∏è Pas assez de donn√©es pour Prophet")
        return None, None

    try:
        print(f"Entra√Ænement de Prophet sur {len(df_prophet)} points...")

        model_prophet = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
        model_prophet.fit(df_prophet)

        # Extend the history by 30 periods and predict over the whole range
        forecast = model_prophet.predict(model_prophet.make_future_dataframe(periods=30))

        # Forecast plot
        fig = model_prophet.plot(forecast)
        plt.title('Pr√©visions Prophet')
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / 'prophet_forecast.png', dpi=100, bbox_inches='tight')
        plt.show()

        # Components plot
        fig = model_prophet.plot_components(forecast)
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / 'prophet_components.png', dpi=100, bbox_inches='tight')
        plt.show()

        return model_prophet, forecast

    except Exception as e:
        logger.error(f"Erreur Prophet : {e}")
        return None, None

# Train Prophet; prophet_model is None when training was skipped or failed.
prophet_model, prophet_forecast = train_prophet_model(df_model)
if prophet_model:
    print(f"‚úì Prophet mod√®le cr√©√© avec succ√®s")


## 13. Sauvegarde des modèles et pipeline

In [None]:
def save_models(models_dict, scaler, feature_cols):
    """Persist the fitted models, the scaler and a JSON description to MODELS_DIR.

    Each model is dumped with joblib to ``<lower_snake_case_name>.pkl``, the
    scaler to ``scaler.pkl``, and ``config.json`` records the feature names,
    their count, the model names and the creation timestamp.

    Args:
        models_dict: Mapping of model name to fitted estimator.
        scaler: The fitted feature scaler.
        feature_cols: List of feature column names.

    Returns:
        True on success, False when any step failed (the error is logged).
    """
    print("=== SAUVEGARDE DES MOD√àLES ===\n")

    try:
        # One pickle per model, named after the model
        for model_name in models_dict:
            file_stem = model_name.lower().replace(' ', '_')
            model_path = MODELS_DIR / f"{file_stem}.pkl"
            joblib.dump(models_dict[model_name], model_path)
            logger.info(f"‚úì {model_name} sauvegard√© : {model_path}")

        # The scaler
        scaler_path = MODELS_DIR / "scaler.pkl"
        joblib.dump(scaler, scaler_path)
        logger.info(f"‚úì Scaler sauvegard√© : {scaler_path}")

        # Description of what was saved, read back by the inference script
        config = dict(
            feature_cols=feature_cols,
            n_features=len(feature_cols),
            model_names=list(models_dict.keys()),
            created_at=datetime.now().isoformat(),
        )

        config_path = MODELS_DIR / "config.json"
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)
        logger.info(f"‚úì Configuration sauvegard√©e : {config_path}")

        print("\n‚úì Tous les mod√®les ont √©t√© sauvegard√©s")
        return True

    except Exception as e:
        logger.error(f"Erreur lors de la sauvegarde : {e}")
        return False

# Save the regression models when some were trained
if regression_models:
    save_models(regression_models, scaler, feature_cols)

# Source of a standalone inference script, written next to the saved models.
# Because the script lives inside MODELS_DIR, it must look for the pickles and
# config.json in its own directory, not in a nested "models" sub-directory.
inference_script = '''#!/usr/bin/env python
"""Script d'inf√©rence - Utilisation des mod√®les entra√Æn√©s"""

import joblib
import json
import pandas as pd
import numpy as np
from pathlib import Path

# Charger les mod√®les et config
# This script is saved in the same directory as the model files.
MODELS_DIR = Path(__file__).parent

with open(MODELS_DIR / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)
scaler = joblib.load(MODELS_DIR / "scaler.pkl")

def predict(X_new):
    """Effectue une pr√©diction avec les mod√®les"""
    # Charger les mod√®les
    models = {}
    for model_name in config['model_names']:
        model_path = MODELS_DIR / f"{model_name.lower().replace(' ', '_')}.pkl"
        models[model_name] = joblib.load(model_path)
    
    # S√©lectionner les features
    X_new = X_new[config['feature_cols']]
    
    # Normaliser
    X_scaled = scaler.transform(X_new)
    
    # Pr√©dictions
    predictions = {}
    for model_name, model in models.items():
        try:
            pred = model.predict(X_scaled if 'Linear' in model_name else X_new)
            predictions[model_name] = pred
        except Exception:
            predictions[model_name] = None
    
    return predictions

if __name__ == "__main__":
    # Exemple d'utilisation
    print("Script d'inf√©rence - SmartCare Analytics")
    print(f"Mod√®les disponibles : {config['model_names']}")
    print(f"Features attendues : {config['feature_cols']}")
'''

# The script text contains non-ASCII characters; write it as UTF-8 explicitly,
# because Python reads source files as UTF-8 whatever the platform default is.
with open(MODELS_DIR / "inference.py", 'w', encoding='utf-8') as f:
    f.write(inference_script)

logger.info("‚úì Script d'inf√©rence cr√©√©")


## 14. Résumé et rapport final

In [None]:
def generate_final_report() -> None:
    """Build the end-of-run text report, print it and save it to disk.

    The report is assembled from module-level variables set by the earlier
    steps: PDF_FILE, extracted_text, extracted_tables, OUTPUT_DIR, df_clean,
    regression_results, arima_results, prophet_model, DB_FILE, CSV_FILE and
    MODELS_DIR. It is written as UTF-8 to ``RAPPORT_FINAL.txt`` in OUTPUT_DIR.

    NOTE(review): the lists of generated files, completed steps and next steps
    are fixed text; they are emitted whether or not the corresponding step
    actually ran or succeeded.
    NOTE(review): ``regression_results`` is accessed through ``.empty``, so it
    has to be a DataFrame by the time this function is called.
    """
    report = f"""
{'='*80}
RAPPORT FINAL - SMARTCARE ANALYTICS
{'='*80}

üìä EXTRACTION DES DONN√âES
  - Fichier PDF : {PDF_FILE}
  - Texte extrait : {len(extracted_text)} pages
  - Tableaux extraits : {len(extracted_tables)} tableaux
  
üìà BASE DE DONN√âES
  - Format : SQLite et CSV
  - Emplacement : {OUTPUT_DIR}
  - Lignes : {df_clean.shape[0] if not df_clean.empty else 'N/A'}
  - Colonnes : {df_clean.shape[1] if not df_clean.empty else 'N/A'}

ü§ñ MOD√àLES ENTRA√éN√âS
  
  1. R√âGRESSION
"""
    
    # Regression metrics table, only when models were trained
    if not regression_results.empty:
        report += f"\n     {regression_results.to_string(index=False)}"
    
    report += f"""
  
  2. S√âRIES TEMPORELLES
"""
    
    # ARIMA metrics, or a placeholder when ARIMA was skipped or failed
    if arima_results:
        report += f"\n     ARIMA : MAE={arima_results.get('MAE', 'N/A')}, RMSE={arima_results.get('RMSE', 'N/A')}"
    else:
        report += "\n     ARIMA : Non disponible"
    
    report += f"""
  
  3. PROPHET
"""
    # Prophet status
    if prophet_model:
        report += "\n     Prophet : Entra√Æn√© avec succ√®s"
    else:
        report += "\n     Prophet : Non disponible"
    
    report += f"""

üìÅ FICHIERS G√âN√âR√âS
  - Base de donn√©es SQLite : {DB_FILE}
  - Donn√©es CSV : {CSV_FILE}
  - Donn√©es mod√®le : {OUTPUT_DIR / 'smartcare_model_data.csv'}
  - Mod√®les sauvegard√©s : {MODELS_DIR}
  - Visualisations :
    * exploratory_analysis.png
    * regression_models.png
    * arima_forecast.png
    * prophet_forecast.png
    * prophet_components.png

‚úÖ √âTAPES COMPL√âT√âES
  ‚úì Extraction du PDF
  ‚úì Nettoyage des donn√©es
  ‚úì Validation et imputation
  ‚úì Cr√©ation de la base de donn√©es
  ‚úì Analyse exploratoire
  ‚úì Mod√®les de r√©gression
  ‚úì Mod√®les de s√©ries temporelles
  ‚úì Sauvegarde des mod√®les

üöÄ PROCHAINES √âTAPES
  1. Optimiser les hyperparam√®tres des mod√®les
  2. Impl√©menter des mod√®les plus avanc√©s (LSTM, Transformer)
  3. Effectuer des pr√©dictions sur de nouvelles donn√©es
  4. D√©ployer les mod√®les en production
  5. Mettre en place un syst√®me de monitoring

{'='*80}
"""
    
    print(report)
    
    # Save the report
    report_path = OUTPUT_DIR / "RAPPORT_FINAL.txt"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    
    logger.info(f"‚úì Rapport sauvegard√© : {report_path}")

# Generate the report, then print the closing banner.
# NOTE(review): the banner is printed unconditionally; it does not reflect
# whether the earlier steps succeeded.
generate_final_report()

print("\n" + "="*80)
print("‚ú® PIPELINE COMPLET TERMIN√â AVEC SUCC√àS ‚ú®")
print("="*80)
