# SmartCare Analytics 

# In [None]:
"""
Extraction and analysis system for hospital data.
Hôpitaux Universitaires Pitié Salpêtrière - Charles Foix
"""

import pdfplumber
import pandas as pd
import re
from pathlib import Path
import json
from typing import Dict, List, Any


class HospitalDataExtractor:
    """Extract and structure key figures from the hospital report PDF.

    The PDF text is read with pdfplumber and mined with regular expressions.
    Results are stored per category in ``self.data`` (a dict of dicts) and
    can be turned into DataFrames, exported as CSV files or printed as a
    console summary.
    """

    # Categories held in ``self.data``, in export order.
    _CATEGORIES = (
        'informations_generales',
        'activites_hospitalieres',
        'ressources_humaines',
        'equipements',
        'finances',
        'qualite',
        'recherche',
        'logistique',
        'pathologies',
    )

    # Column order of the three-value rows (two sites followed by the total).
    _SITES = ('psl', 'cfx', 'total')

    # Unsigned integer whose thousands groups may be separated by ONE space,
    # no-break space or narrow no-break space. Groups after the first must
    # have exactly three digits and the separator never spans a line break,
    # so a neighbouring column or the next line is not glued onto the value.
    # Known limit: a following column made of exactly three digits is still
    # ambiguous and is absorbed as a thousands group when backtracking allows.
    _NUMBER = r'\d+(?:[ \u00a0\u202f]\d{3})*'
    # Same, with an optional leading minus (ASCII hyphen, U+2212 or U+2013).
    _SIGNED_NUMBER = r'[-\u2212\u2013]?[ \u00a0\u202f]?' + _NUMBER
    # Same as _NUMBER, additionally accepting "." as thousands separator.
    _DOTTED_NUMBER = r'\d+(?:[ .\u00a0\u202f]\d{3})*'
    # Decimal number written with "," or "." as decimal mark.
    _DECIMAL = r'\d+(?:[.,]\d+)?'
    # Straight or typographic apostrophe, as either may appear in PDF text.
    _APOSTROPHE = "['\u2019]"

    def __init__(self, pdf_path: str):
        """Remember the PDF location and create one empty dict per category."""
        self.pdf_path = pdf_path
        self.data = {category: {} for category in self._CATEGORIES}

    # ------------------------------------------------------------------
    # Parsing helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _to_int(raw: str) -> int:
        """Convert a captured number such as ``"12 345"`` or ``"- 89"`` to int."""
        # \s also covers no-break spaces and newlines, which a plain
        # replace(' ', '') would leave behind and make int() fail.
        compact = re.sub(r'\s+', '', raw)
        compact = compact.replace('\u2212', '-').replace('\u2013', '-')
        return int(compact)

    @staticmethod
    def _to_float(raw: str) -> float:
        """Convert a captured decimal using "," or "." as decimal mark."""
        return float(raw.replace(',', '.'))

    @staticmethod
    def _format_count(value: Any) -> str:
        """Format ints with a space as thousands separator; pass others through."""
        if isinstance(value, int) and not isinstance(value, bool):
            return f"{value:,}".replace(',', ' ')
        return str(value)

    @classmethod
    def _int_triplet(cls, text: str, label: str, prefix: str,
                     number: str = None) -> Dict[str, int]:
        """Parse the three integers following ``label`` (a regex fragment).

        Returns ``{prefix_psl, prefix_cfx, prefix_total}`` or an empty dict
        when the row is not found.
        """
        number = cls._NUMBER if number is None else number
        match = re.search(
            rf'{label}\s+({number})\s+({number})\s+({number})', text)
        if match is None:
            return {}
        return {
            f'{prefix}_{site}': cls._to_int(value)
            for site, value in zip(cls._SITES, match.groups())
        }

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------
    def extract_text_from_pdf(self) -> str:
        """Return the text of every page, each followed by a newline.

        Best effort: on any error the message is printed and whatever text
        was collected so far (possibly an empty string) is returned.
        """
        chunks = []
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        chunks.append(page_text + "\n")
        except Exception as e:  # deliberate best effort, see docstring
            print(f"Erreur lors de l'extraction du texte: {e}")
        return "".join(chunks)

    def extract_general_info(self, text: str) -> None:
        """Fill ``data['informations_generales']`` with the headline figures."""
        general_info = {}
        num = self._NUMBER
        apos = self._APOSTROPHE

        # Headline key figures: label followed by one integer.
        patterns = {
            'nombre_lits': rf'Nombre de lits\s+({num})',
            'nombre_poles': r'Nombre de pôles\s+(\d+)',
            'centres_maladies_rares': r'Nombre de centres maladies rares\s+(\d+)',
            'hospitalisations_total': rf'Nombre total d{apos}hospitalisations\s+({num})',
            'sejours_plus_24h': rf'Nombre de séjours \+ 24 h\s+({num})',
            'sejours_ambulatoires': rf'Nombre de séjours ambulatoires\s+({num})',
            'consultations': rf'Nombre de consultations\s+({num})',
            'urgences': rf'Passages aux urgences\s+({num})',
            'naissances': rf'Nombre de naissances\s+({num})',
            'greffes': r'Nombre de greffes\s+(\d+)',
            'effectif_medecins': rf'Effectif des médecins\s+({num})',
            'effectif_paramedical': rf'Effectif du personnel paramédical\s+({num})',
            'depenses': rf'Dépenses\s+({num})\s*M€',
            'recettes': rf'Recettes\s+({num})\s*M€',
        }
        for key, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                general_info[key] = self._to_int(match.group(1))

        # Average patient age per site.
        age_patterns = {
            'age_moyen_psl': rf'PSL\s+({self._DECIMAL})\s+ans',
            'age_moyen_cfx': rf'CFX\s+({self._DECIMAL})\s+ans',
        }
        for key, pattern in age_patterns.items():
            match = re.search(pattern, text)
            if match:
                general_info[key] = self._to_float(match.group(1))

        self.data['informations_generales'] = general_info

    def extract_hospital_activities(self, text: str) -> None:
        """Fill ``data['activites_hospitalieres']`` (MCO stays, consultations, ER)."""
        activities = {}
        num = self._NUMBER

        # MCO stays: two values after "PSL", stored as 2011 then 2012.
        match = re.search(
            r'Nombre de séjours MCO.*?PSL\s+(\d+(?:\s+\d+)?)\s+(\d+(?:\s+\d+)?)',
            text)
        if match:
            activities['sejours_mco_psl_2011'] = self._to_int(match.group(1))
            activities['sejours_mco_psl_2012'] = self._to_int(match.group(2))

        # Outpatient consultations: two values after "PSL".
        match = re.search(
            rf'Les consultations externes.*?PSL\s+({num})\s+({num})', text)
        if match:
            activities['consultations_2011'] = self._to_int(match.group(1))
            activities['consultations_2012'] = self._to_int(match.group(2))

        # Emergency department visits.
        match = re.search(
            r'Passages\s+Patients Admis.*?(\d+\s+\d+)\s+(\d+\s+\d+)', text)
        if match:
            # The whole grouped number is kept; taking only the first token
            # would drop everything after the thousands group.
            # NOTE(review): assumes the second number is the 2012 figure,
            # confirm against the report layout.
            activities['passages_urgences_2012'] = self._to_int(match.group(2))

        self.data['activites_hospitalieres'] = activities

    def extract_human_resources(self, text: str) -> None:
        """Fill ``data['ressources_humaines']`` with staff counts per site."""
        hr_data = {}

        # Medical staff: three plain integers (PSL, CFX, total) per row.
        medical_patterns = {
            'personnels_universitaires': r'Personnels Hospitalo-Universitaires\s+(\d+)\s+(\d+)\s+(\d+)',
            'praticiens_hospitaliers': r'Praticiens hospitaliers.*?contractuels\s+(\d+)\s+(\d+)\s+(\d+)',
            'internes': r'Internes, résidents.*?(\d+)\s+(\d+)\s+(\d+)',
        }
        for key, pattern in medical_patterns.items():
            match = re.search(pattern, text, re.DOTALL)
            if match:
                for site, value in zip(self._SITES, match.groups()):
                    hr_data[f'{key}_{site}'] = int(value)

        # Hospital staff: the first and last values may carry one thousands
        # group while the middle value is a plain integer.
        match = re.search(
            r'Personnel hospitalier\s+(\d+(?:\s+\d+)?)\s+(\d+)\s+(\d+(?:\s+\d+)?)',
            text)
        if match:
            for site, value in zip(self._SITES, match.groups()):
                hr_data[f'personnel_hospitalier_{site}'] = self._to_int(value)

        self.data['ressources_humaines'] = hr_data

    def extract_equipment(self, text: str) -> None:
        """Fill ``data['equipements']`` with the first integer after each label."""
        equipment = {}

        equipment_patterns = {
            'laboratoires_psl': r'Laboratoires.*?PSL\s+(\d+)',
            'blocs_operatoires': r'Blocs opératoires.*?(\d+)',
            'irm': r'IRM.*?(\d+)(?:\s+(\d+))?',
            'scanner': r'Scanner diagnostique.*?(\d+)',
            'lits_reanimation': r'Lits de réanimation.*?(\d+)',
            'lits_soins_intensifs': r'Lits de soins intensifs.*?(\d+)',
        }
        for key, pattern in equipment_patterns.items():
            match = re.search(pattern, text, re.DOTALL)
            if match:
                equipment[key] = int(match.group(1))

        self.data['equipements'] = equipment

    def extract_financial_data(self, text: str) -> None:
        """Fill ``data['finances']`` (staff costs, health-insurance income, balance)."""
        finances = {}

        # Expenses.
        finances.update(self._int_triplet(
            text, r'Charges relatives au personnel', 'charges_personnel'))

        # Income.
        finances.update(self._int_triplet(
            text,
            rf'Produits de l{self._APOSTROPHE}assurance maladie',
            'produits_assurance_maladie'))

        # Balance: each value may be negative.
        finances.update(self._int_triplet(
            text, r'Solde', 'solde', number=self._SIGNED_NUMBER))

        self.data['finances'] = finances

    def extract_pathologies(self, text: str) -> None:
        """Fill ``data['pathologies']`` with the count preceding each label."""
        pathologies = {}
        apos = self._APOSTROPHE
        # The lookbehind keeps a match from starting in the middle of a number.
        count = rf'(?<![\d.,])({self._DOTTED_NUMBER})'

        pathology_patterns = {
            'cancereuses': rf'{count}\s+Pathologies\s+cancéreuses',
            'neurologiques': rf'{count}\s+Pathologies neurologiques',
            'genito_urinaire': rf'{count}\s+Pathologies\s+de l{apos}appareil\s+génito-urinaire',
            'orthopediques': rf'{count}\s+Pathologies orthopédiques',
            'digestif': rf'{count}\s+Pathologies\s+de l{apos}appareil digestif',
            'cardio_vasculaires': rf'{count}\s+Pathologies\s+cardio-vasculaires',
            'endocriniennes': rf'{count}\s+Pathologies endocriniennes',
            'infectieuses': rf'{count}\s+Pathologies infectieuses',
            'obstetrique': rf'{count}\s+Obstétrique',
        }
        for key, pattern in pathology_patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # Dots are thousands separators here, not decimal marks.
                pathologies[key] = self._to_int(match.group(1).replace('.', ''))

        self.data['pathologies'] = pathologies

    def extract_quality_indicators(self, text: str) -> None:
        """Fill ``data['qualite']`` with the first "Excellent" score found."""
        quality = {}

        # Patient satisfaction.
        match = re.search(rf'Excellent\s+({self._DECIMAL})', text)
        if match:
            quality['satisfaction_soins_excellent'] = self._to_float(match.group(1))

        self.data['qualite'] = quality

    def extract_logistics(self, text: str) -> None:
        """Fill ``data['logistique']`` (catering, laundry, DAOM waste)."""
        logistics = {}
        dec = self._DECIMAL

        # Catering.
        logistics.update(self._int_triplet(text, r'Restauration', 'repas'))

        # Laundry.
        logistics.update(self._int_triplet(
            text, r'Lingerie \(en kg\)', 'lingerie_kg'))

        # DAOM waste: decimal, integer, decimal.
        match = re.search(rf'DAOM\*\s+({dec})\s+(\d+)\s+({dec})', text)
        if match:
            for site, value in zip(self._SITES, match.groups()):
                logistics[f'dechets_daom_{site}'] = self._to_float(value)

        self.data['logistique'] = logistics

    def extract_research(self, text: str) -> None:
        """Fill ``data['recherche']`` (publications and clinical trials rows)."""
        research = {}
        dec = self._DECIMAL

        # Publications.
        match = re.search(
            rf'Publications\*\s+({dec})\s+({dec})\s+(\d+)\s*%', text)
        if match:
            research['publications_montant'] = self._to_float(match.group(1))
            research['publications_aphp'] = self._to_float(match.group(2))
            research['publications_pourcentage'] = int(match.group(3))

        # Clinical trials.
        match = re.search(
            rf'Essais Cliniques\s+(\d+)\s+({dec})\s+(\d+)\s*%', text)
        if match:
            research['essais_montant'] = float(match.group(1))
            research['essais_aphp'] = self._to_float(match.group(2))
            research['essais_pourcentage'] = int(match.group(3))

        self.data['recherche'] = research

    def extract_all_data(self) -> Dict[str, Dict[str, Any]]:
        """Read the PDF, run every extractor in order and return ``self.data``."""
        print("Extraction du texte du PDF...")
        text = self.extract_text_from_pdf()

        steps = (
            ("Extraction des informations générales...", self.extract_general_info),
            ("Extraction des activités hospitalières...", self.extract_hospital_activities),
            ("Extraction des ressources humaines...", self.extract_human_resources),
            ("Extraction des équipements...", self.extract_equipment),
            ("Extraction des données financières...", self.extract_financial_data),
            ("Extraction des pathologies...", self.extract_pathologies),
            ("Extraction des indicateurs qualité...", self.extract_quality_indicators),
            ("Extraction de la logistique...", self.extract_logistics),
            ("Extraction des données de recherche...", self.extract_research),
        )
        for message, step in steps:
            print(message)
            step(text)

        return self.data

    # ------------------------------------------------------------------
    # Export and display
    # ------------------------------------------------------------------
    def create_dataframes(self) -> Dict[str, pd.DataFrame]:
        """Return a one-row DataFrame for every non-empty category."""
        return {
            category: pd.DataFrame([values])
            for category, values in self.data.items()
            if values
        }

    def export_to_csv(self, output_dir: str = "/mnt/user-data/outputs") -> List[str]:
        """Write one CSV per non-empty category plus the summary CSV.

        ``output_dir`` is created when missing. Returns the paths of the
        written files, the summary file last.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        exported_files = []
        for category, df in self.create_dataframes().items():
            if df.empty:
                continue
            csv_path = target_dir / f"hospital_{category}_2012.csv"
            df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            exported_files.append(str(csv_path))
            print(f"✓ Exporté: {category} ({len(df.columns)} colonnes)")

        # The summary writer reports its own path, so the file name is
        # defined in a single place.
        exported_files.append(self.create_summary_csv(output_dir))
        return exported_files

    def create_summary_csv(self, output_dir: str) -> str:
        """Write the key-indicator summary CSV and return its path.

        Indicators missing from the extracted data are written as ``'N/A'``.
        """
        gen_info = self.data.get('informations_generales', {})

        # (category, indicator label, key in gen_info, unit)
        indicators = (
            ('Infrastructure', 'Nombre de lits', 'nombre_lits', 'lits'),
            ('Infrastructure', 'Nombre de pôles', 'nombre_poles', 'pôles'),
            ('Activité', 'Hospitalisations totales', 'hospitalisations_total', 'séjours'),
            ('Activité', 'Consultations', 'consultations', 'consultations'),
            ('Urgences', 'Passages aux urgences', 'urgences', 'passages'),
            ('Ressources Humaines', 'Effectif médecins', 'effectif_medecins', 'ETP'),
            ('Ressources Humaines', 'Effectif paramédical', 'effectif_paramedical', 'ETP'),
            ('Finances', 'Dépenses totales', 'depenses', 'M€'),
            ('Finances', 'Recettes totales', 'recettes', 'M€'),
        )
        summary_data = [
            {
                'Catégorie': category,
                'Indicateur': label,
                'Valeur': gen_info.get(key, 'N/A'),
                'Unité': unit,
            }
            for category, label, key, unit in indicators
        ]

        # One extra row per extracted pathology.
        for path_type, count in self.data.get('pathologies', {}).items():
            summary_data.append({
                'Catégorie': 'Pathologies',
                'Indicateur': path_type.replace('_', ' ').title(),
                'Valeur': count,
                'Unité': 'séjours',
            })

        summary_path = Path(output_dir) / "hospital_summary_2012.csv"
        pd.DataFrame(summary_data).to_csv(
            summary_path, index=False, encoding='utf-8-sig')
        print("✓ Exporté: Résumé général")
        return str(summary_path)

    def display_summary(self) -> None:
        """Print a console summary of the extracted data."""
        rule = "=" * 80
        thin_rule = "-" * 80
        fmt = self._format_count
        gen_info = self.data.get('informations_generales', {})

        print("\n" + rule)
        print(" RÉSUMÉ DES DONNÉES EXTRAITES - HÔPITAUX PSL-CFX 2012")
        print(rule + "\n")

        print("📊 INFORMATIONS GÉNÉRALES")
        print(thin_rule)
        print(f"  • Nombre de lits: {gen_info.get('nombre_lits', 'N/A')}")
        print(f"  • Nombre de pôles: {gen_info.get('nombre_poles', 'N/A')}")
        print(f"  • Centres maladies rares: {gen_info.get('centres_maladies_rares', 'N/A')}")
        # fmt() tolerates non-integer values instead of raising on ':,'.
        print(f"  • Hospitalisations totales: {fmt(gen_info.get('hospitalisations_total', 'N/A'))}")
        print(f"  • Consultations: {fmt(gen_info.get('consultations', 'N/A'))}")
        print(f"  • Passages aux urgences: {fmt(gen_info.get('urgences', 'N/A'))}")
        print(f"  • Naissances: {gen_info.get('naissances', 'N/A')}")
        print(f"  • Greffes: {gen_info.get('greffes', 'N/A')}")

        print("\n👥 RESSOURCES HUMAINES")
        print(thin_rule)
        print(f"  • Médecins: {gen_info.get('effectif_medecins', 'N/A')}")
        print(f"  • Personnel paramédical: {fmt(gen_info.get('effectif_paramedical', 'N/A'))}")

        print("\n💰 FINANCES")
        print(thin_rule)
        print(f"  • Dépenses: {gen_info.get('depenses', 'N/A')} M€")
        print(f"  • Recettes: {gen_info.get('recettes', 'N/A')} M€")

        pathologies = self.data.get('pathologies', {})
        if pathologies:
            print("\n🏥 PRINCIPALES PATHOLOGIES (Top 5)")
            print(thin_rule)

            def numeric_count(item):
                # Non-numeric counts sort last.
                count = item[1]
                if isinstance(count, (int, str)) and str(count).isdigit():
                    return int(count)
                return 0

            top_five = sorted(pathologies.items(), key=numeric_count, reverse=True)[:5]
            for rank, (name, count) in enumerate(top_five, 1):
                print(f"  {rank}. {name.replace('_', ' ').title()}: {fmt(count)} séjours")

        extracted = sum(1 for values in self.data.values() if values)
        print("\n" + rule)
        print(f"Total de catégories extraites: {extracted}")
        print(rule + "\n")


def main(pdf_path: str = "/mnt/user-data/uploads/SLP-CHF2012.pdf",
         output_dir: str = "/mnt/user-data/outputs"):
    """Run the whole pipeline: extract, display, export to CSV and JSON.

    Args:
        pdf_path: Location of the report PDF to read.
        output_dir: Directory receiving the CSV files and the JSON dump.

    Returns:
        A ``(data, exported_files)`` tuple: the extracted data and the list
        of CSV paths returned by ``export_to_csv``.
    """
    banner = "=" * 80

    print(banner)
    print(" EXTRACTEUR DE DONNÉES HOSPITALIÈRES PSL-CFX 2012")
    print(banner + "\n")

    extractor = HospitalDataExtractor(pdf_path)

    # Extract everything, then show the console summary.
    data = extractor.extract_all_data()
    extractor.display_summary()

    # CSV export.
    print("📁 Exportation des données en fichiers CSV...")
    exported_files = extractor.export_to_csv(output_dir)

    print(f"\n✅ Exportation terminée! {len(exported_files)} fichiers créés.")
    print("\nFichiers générés:")
    for file in exported_files:
        print(f"  • {Path(file).name}")

    # JSON dump, written next to the CSV files rather than to a separate
    # hard-coded location.
    json_path = Path(output_dir) / "hospital_data_complete_2012.json"
    json_path.parent.mkdir(parents=True, exist_ok=True)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\n  • {json_path.name}")

    print("\n" + banner)
    # Text extraction is best effort, so every category can be empty here;
    # do not report success in that case.
    if any(data.values()):
        print(" Traitement terminé avec succès!")
    else:
        print(" Traitement terminé, mais aucune donnée n'a été extraite.")
    print(banner)

    return data, exported_files


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    data, files = main()