Block 1 ‚Äî Preparation (imports, seeds, folders, logger)
Here is a single, executable cell that:

imports the necessary libraries,

sets the seeds,

creates the data/, results/, and logs/ folders,

delete /content/sample_data if it exists,

configures a logger with console and file output,

initializes a log file, logs/logs.csv, and appends it to logs/summary.md,

uses timezone-aware timestamps (without the deprecated utcnow()),

avoids escaping warnings with raw strings.

In [1]:
# ============================================================
# ‚öôÔ∏è Installation des d√©pendances du projet
# Cette cellule garantit que toutes les librairies n√©cessaires sont install√©es.
# ============================================================

import subprocess
import sys

def install_requirements(file_path="requirements.txt"):
    """Installe les paquets list√©s dans requirements.txt."""
    print(f"Installation/Mise √† jour des d√©pendances via {file_path}...")
    try:
        # Ex√©cute la commande pip
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", file_path])
        print("\n‚úÖ Toutes les d√©pendances ont √©t√© install√©es ou mises √† jour avec succ√®s.")
        print("Veuillez RED√âMARRER le noyau (kernel) du notebook si c'est la premi√®re ex√©cution.")
    except subprocess.CalledProcessError as e:
        print(f"\n‚ùå ERREUR lors de l'installation des d√©pendances : {e}")

# Ex√©cuter l'installation
install_requirements()


Installation/Mise √† jour des d√©pendances via requirements.txt...

‚úÖ Toutes les d√©pendances ont √©t√© install√©es ou mises √† jour avec succ√®s.
Veuillez RED√âMARRER le noyau (kernel) du notebook si c'est la premi√®re ex√©cution.


In [2]:
# Bloc 1 ‚Äî Pr√©paration
# Imports, seeds, dossiers, logger, journaux init

import os
import csv
import shutil
import random
import logging
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# =========================
# Seeds et conventions
# =========================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# =========================
# Dossiers
# =========================
os.makedirs('data', exist_ok=True)
os.makedirs('results', exist_ok=True)
os.makedirs('logs', exist_ok=True)

# Supprimer sample_data si pr√©sent (environnements Colab)
sample_data_path = '/content/sample_data'
if os.path.isdir(sample_data_path):
    try:
        shutil.rmtree(sample_data_path)
    except Exception as e:
        # Silencieux mais on loguera plus bas
        pass

# =========================
# Timestamps et helpers
# =========================
def utc_timestamp():
    # Timezone-aware ISO 8601
    return datetime.now(timezone.utc).isoformat()

LOG_CSV_PATH = os.path.join('logs', 'logs.csv')
SUMMARY_MD_PATH = os.path.join('logs', 'summary.md')

# Init du fichier logs.csv si vide
if not os.path.isfile(LOG_CSV_PATH):
    with open(LOG_CSV_PATH, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['timestamp', 'level', 'message'])

# =========================
# Logger
# =========================
logger = logging.getLogger('TlogV01')
logger.setLevel(logging.INFO)
logger.handlers.clear()

# Console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_fmt = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
console_handler.setFormatter(console_fmt)
logger.addHandler(console_handler)

# Fichier handler (logs/app.log)
file_handler = logging.FileHandler(os.path.join('logs', 'app.log'))
file_handler.setLevel(logging.INFO)
file_fmt = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
file_handler.setFormatter(file_fmt)
logger.addHandler(file_handler)

# =========================
# Fonctions de journalisation
# =========================
def log_event(level: str, message: str):
    """
    √âcrit dans logs.csv et via logger standard.
    level: 'INFO' | 'WARNING' | 'ERROR'
    """
    ts = utc_timestamp()
    # Logger console/fichier
    if level.upper() == 'INFO':
        logger.info(message)
    elif level.upper() == 'WARNING':
        logger.warning(message)
    elif level.upper() == 'ERROR':
        logger.error(message)
    else:
        logger.info(message)

    # Ajout dans logs.csv
    with open(LOG_CSV_PATH, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([ts, level.upper(), message])

def append_summary_md(text: str):
    """
    Append dans summary.md. Utiliser des cha√Ænes brutes pour
    inclure LaTeX sans warnings d‚Äô√©chappement.
    """
    with open(SUMMARY_MD_PATH, 'a', encoding='utf-8') as f:
        f.write(text + '\n')

# =========================
# Banner de session
# =========================
session_header = r"""# Session Log T_log V0.1

- Session started: {ts}
- Conventions: bias=0 by default, seeds fixed (42), outputs in results/
""".format(ts=utc_timestamp())

# √âcrire header si le fichier est nouveau
if not os.path.isfile(SUMMARY_MD_PATH) or os.path.getsize(SUMMARY_MD_PATH) == 0:
    append_summary_md(session_header)

# =========================
# V√©rification environnement
# =========================
plt.figure(figsize=(4, 3))
x = np.linspace(0, 2*np.pi, 200)
plt.plot(x, np.sin(x), color='steelblue', lw=2)
plt.title('Env check plot')
plt.tight_layout()
env_plot_path = os.path.join('results', 'env_check_plot.png')
plt.savefig(env_plot_path, dpi=150)
plt.close()

# =========================
# Logs init
# =========================
log_event('INFO', 'Bloc 1 pr√™t: imports, seeds, dossiers et logger configur√©s.')
log_event('INFO', f'Plot de v√©rification sauvegard√©: {env_plot_path}')

append_summary_md(r"""---
## Bloc 1 ‚Äî Pr√©paration
- Imports, seeds, dossiers et logger configur√©s.
- Env check plot: results/env_check_plot.png
""")

# Affichage de confirmation minimal
print("Bloc 1 OK ‚Äî Dossiers et logger pr√™ts.")
print(f"Seeds fix√©s: {SEED}")
print(f"Logs: {LOG_CSV_PATH}")
print(f"Summary: {SUMMARY_MD_PATH}")


2025-11-11 02:14:33,438 [INFO] Bloc 1 pr√™t: imports, seeds, dossiers et logger configur√©s.
2025-11-11 02:14:33,440 [INFO] Plot de v√©rification sauvegard√©: results\env_check_plot.png


Bloc 1 OK ‚Äî Dossiers et logger pr√™ts.
Seeds fix√©s: 42
Logs: logs\logs.csv
Summary: logs\summary.md


Quick summary: We'll move on to Block 2 ‚Äî Data Acquisition. The goal is to unzip your Urban Air Quality & Climate Dataset (1958-2025).zip ZIP file into the data/ folder, verify its contents (CSV files, etc.), and log the operation.

Block 2 ‚Äî Data Acquisition (Air Quality)
Here is the corresponding Python cell:

In [3]:
import os
import pandas as pd
import json
import zipfile
from datetime import datetime

# --- 0. INSTALLATION DE KAGGLE ---
# Cette ligne assure que la librairie Kaggle est install√©e
!pip install kaggle --quiet

# --- D√©pendance Kaggle ---
try:
    # Tenter d'importer la librairie Kaggle
    import kaggle.api as kaggle_api
except ImportError:
    print("√âchec de l'importation de 'kaggle' m√™me apr√®s installation. Veuillez v√©rifier votre environnement.")
    raise
# ------------------------

# --- 1. CONFIGURATION ET FONCTIONS DE LOGGING ---

# Identifiants du Dataset Kaggle
KAGGLE_DATASET_ID = "krishd123/urban-air-quality-and-climate-dataset-1958-2025"
TARGET_FILE_NAME = "urban_climate.csv"

# Chemins de travail
DATA_DIR = 'data'
LOGS_DIR = 'logs'
RESULTS_DIR = 'results'

# Fichier de donn√©es apr√®s t√©l√©chargement/extraction
LOCAL_COPY = os.path.join(DATA_DIR, TARGET_FILE_NAME)
# Fichiers de log et de r√©sultats
RESULT_PREVIEW = os.path.join(RESULTS_DIR, 'urban_climate_preview.csv')
LOGS_CSV = os.path.join(LOGS_DIR, 'logs.csv')
SUMMARY_MD = os.path.join(LOGS_DIR, 'summary.md')

# Cr√©ation des dossiers
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(LOGS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)


def append_log(level, message):
    """Ajoute une entr√©e au fichier de log CSV et Markdown."""
    ts = datetime.utcnow().isoformat() + 'Z'
    entry = pd.DataFrame([{'timestamp': ts, 'level': level, 'message': message}])
    
    # √âcriture du log
    try:
        if os.path.exists(LOGS_CSV):
            df_logs = pd.read_csv(LOGS_CSV)
            df_logs = pd.concat([df_logs, entry], ignore_index=True)
        else:
            df_logs = entry
            
        df_logs.to_csv(LOGS_CSV, index=False)
        with open(SUMMARY_MD, 'a', encoding='utf-8') as f:
            f.write(f'\n- {ts} **{level}**: {message}\n')
    except Exception as e:
        print(f"[ALERTE] √âchec de l'√©criture du log: {e}")

# Alias pour utiliser 'log_event' si d√©sir√©, tout en utilisant la fonction `append_log`
log_event = append_log


def find_and_auth_kaggle():
    """Tente de trouver les cl√©s d'API et authentifie l'API Kaggle."""
    log_event('INFO', 'Tentative d\'authentification Kaggle...')
    
    # 1. V√©rifier les variables d'environnement (m√©thode Colab/Notebook)
    if os.getenv('KAGGLE_USERNAME') and os.getenv('KAGGLE_KEY'):
        log_event('INFO', 'Authentification via variables d\'environnement (KAGGLE_USERNAME/KEY).')
    
    # 2. Chercher le fichier kaggle.json
    else:
        locations = [
            os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json'), # Emplacement standard
            os.path.join(os.getcwd(), 'kaggle.json')                       # R√©pertoire actuel
        ]
        
        found = False
        for loc in locations:
            if os.path.exists(loc):
                try:
                    with open(loc, 'r') as f:
                        config = json.load(f)
                        username = config.get('username')
                        key = config.get('key')
                        if username and key:
                            os.environ['KAGGLE_USERNAME'] = username
                            os.environ['KAGGLE_KEY'] = key
                            log_event('INFO', f'Cl√©s lues et d√©finies via {loc}.')
                            found = True
                            break
                except (json.JSONDecodeError, Exception):
                    # Fichier trouv√© mais invalide, on continue la recherche
                    continue
        
        if not found:
            log_event('ERROR', "Fichier kaggle.json introuvable. Veuillez le placer dans ~/.kaggle/ ou le r√©pertoire courant.")
            return False

    # 3. Authentifier l'API
    try:
        kaggle_api.authenticate()
        log_event('SUCCESS', 'Authentification Kaggle r√©ussie.')
        return True
    except Exception as e:
        log_event('ERROR', f'√âchec de l\'authentification de l\'API: {e}')
        return False


# --- 2. AUTHENTIFICATION ET T√âL√âCHARGEMENT ---
try:
    if not find_and_auth_kaggle():
        # Lever une exception si l'authentification √©choue
        raise RuntimeError("Processus annul√©. √âchec de la configuration Kaggle. Assurez-vous d'avoir configur√© votre API Key.")
    
    print(f"\nD√©but du t√©l√©chargement de : {KAGGLE_DATASET_ID}")
    log_event('INFO', f"T√©l√©chargement et d√©compression du dataset : {KAGGLE_DATASET_ID}")
    
    # T√©l√©charger et d√©compresser directement le dataset dans le dossier 'data/'
    kaggle_api.dataset_download_files(
        KAGGLE_DATASET_ID, 
        path=DATA_DIR, 
        unzip=True,
        # 'force=True' pour re-t√©l√©charger si le fichier existe d√©j√† (reproductibilit√©)
        force=True, 
        quiet=True # Rendre l'API Kaggle moins verbeuse
    )
    
    if not os.path.exists(LOCAL_COPY):
        raise FileNotFoundError(f"Le fichier {TARGET_FILE_NAME} est introuvable apr√®s l'extraction du dataset Kaggle. Le dataset pourrait avoir une structure diff√©rente.")
        
    log_event('SUCCESS', f"T√©l√©chargement et pr√©paration du fichier : {LOCAL_COPY}")
    print(f"T√©l√©chargement termin√©. Fichier cible : {LOCAL_COPY}")


    # --- 3. LECTURE ROBUSTE ET ANALYSE DU FICHIER ---
    
    read_errors = []
    df = None
    log_event('INFO', f"Tentative de lecture du CSV : {LOCAL_COPY}")
    
    # Tentative 1: Standard (utf-8, comma)
    try:
        df = pd.read_csv(LOCAL_COPY)
    except Exception as e1:
        read_errors.append(f"Standard: {e1}")
        
        # Tentative 2: utf-8, semicolon
        try:
            df = pd.read_csv(LOCAL_COPY, encoding='utf-8', sep=';')
        except Exception as e2:
            read_errors.append(f"UTF-8/Semicolon: {e2}")
            
            # Tentative 3: latin1, standard sep
            try:
                df = pd.read_csv(LOCAL_COPY, encoding='latin1')
            except Exception as e3:
                read_errors.append(f"Latin1: {e3}")
                
                # √âchec total de lecture
                raise RuntimeError(f"Impossible de lire le CSV ({LOCAL_COPY}). √âchecs: {'; '.join(read_errors)}")
    
    # Si la lecture est r√©ussie:
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    cols = list(df.columns)
    missing_counts = df.isna().sum()

    # Sauvegarder un aper√ßu
    df.head(200).to_csv(RESULT_PREVIEW, index=False)

    # Log et affichage du succ√®s
    log_event('SUCCESS', f'Chargement r√©ussi: {LOCAL_COPY}; rows={n_rows}; cols={n_cols}')
    log_event('INFO', f'Colonnes d√©tect√©es: {cols}')
    log_event('INFO', f'Missing per column (seulement > 0): {dict(missing_counts[missing_counts > 0])}') 
    
    print("\n" + "="*50)
    print(f"ANALYSE DU FICHIER {TARGET_FILE_NAME}")
    print(f" - Nombre d'enregistrements (n) : {n_rows}")
    print(f" - Nombre de colonnes : {n_cols}")
    print(" - Colonnes :", cols)
    print(f" - Aper√ßu sauvegard√© : {RESULT_PREVIEW}")
    print("\nComptes de valeurs manquantes (seulement les colonnes non nulles) :")
    print(missing_counts[missing_counts > 0])
    print("="*50)
    
except Exception as e:
    log_event('CRITICAL', f'√âchec du bloc de code: {e}')
    print("\n" + "#"*50)
    print("√âCHEC CRITIQUE: Le code n'a pas pu s'ex√©cuter.")
    print(f"Erreur: {e}")
    print(f"V√©rifiez que vous avez configur√© votre cl√© d'API Kaggle (fichier kaggle.json).")
    print("#"*50)
    raise

  ts = datetime.utcnow().isoformat() + 'Z'
  ts = datetime.utcnow().isoformat() + 'Z'
  ts = datetime.utcnow().isoformat() + 'Z'
  ts = datetime.utcnow().isoformat() + 'Z'



D√©but du t√©l√©chargement de : krishd123/urban-air-quality-and-climate-dataset-1958-2025
Dataset URL: https://www.kaggle.com/datasets/krishd123/urban-air-quality-and-climate-dataset-1958-2025
T√©l√©chargement termin√©. Fichier cible : data\urban_climate.csv

ANALYSE DU FICHIER urban_climate.csv
 - Nombre d'enregistrements (n) : 11040
 - Nombre de colonnes : 12
 - Colonnes : ['city', 'country', 'latitude', 'longitude', 'year', 'month', 'temperature_celsius', 'humidity_percent', 'precipitation_mm', 'wind_speed_ms', 'urban_heat_island_intensity', 'data_source']
 - Aper√ßu sauvegard√© : results\urban_climate_preview.csv

Comptes de valeurs manquantes (seulement les colonnes non nulles) :
Series([], dtype: int64)


  ts = datetime.utcnow().isoformat() + 'Z'
  ts = datetime.utcnow().isoformat() + 'Z'
  ts = datetime.utcnow().isoformat() + 'Z'
  ts = datetime.utcnow().isoformat() + 'Z'
  ts = datetime.utcnow().isoformat() + 'Z'


Perfect üëå, your **Block 2 ‚Äî Data Acquisition** is validated: the ZIP file has been successfully unzipped and we now have 5 usable components:

- `metadata.json` ‚Üí complete documentation
- `co2_emissions.csv` ‚Üí direct CO‚ÇÇ measurements (Mauna Loa, 1958‚Äì2025)
- `air_quality_global.csv` ‚Üí PM2.5 and NO‚ÇÇ for 20 cities (1999‚Äì2025)
- `urban_climate.csv` ‚Üí urban climate variables (1980‚Äì2025)
- `ice_core_co2.csv` ‚Üí paleoclimate reconstructions (2000 years BP)

---

### Next strategic choice
To apply your **T_log model V0.1**, we need to define:
- **n** = system size (number of observations or nodes)
- **d** = effective dimension (spectral, spatial, or chosen proxy)

üëâ Two possible approaches:
1. **Air Quality (PM2.5, NO‚ÇÇ)**:
- n = number of measurement points (per city or global)
- d = effective temporal/spatial dimension (e.g., d=1 for time series, d‚âà2‚Äì3 if combining several cities as a graph)

2. **Direct CO‚ÇÇ (Mauna Loa)**:
- n = number of months measured (‚âà800+)
- d = 1 (one-dimensional time series)

3. **Ice Core CO‚ÇÇ**:
- n = ~2000 years of data
- d = 1 (long time series)

---

**Quick Summary:** We'll tackle **Block 3 ‚Äî Calculating T_log** on the `air_quality_global.csv` file. We'll load the PM2.5 data, choose a city (or the global set), define n as the number of valid observations, set d=1 (one-dimensional time series), then calculate and classify T_{log}.

---

### Block 3 ‚Äî Calculating T_log (Air Quality Global, PM2.5)

Before starting the calculation of
ùëá
log
‚Å°
, it's more rigorous to check the state of the air_quality_global.csv file: structure, columns, missing values, duplicates, etc. This will constitute our Block 3a ‚Äî Data Inspection and Validation.

Block 3a ‚Äî Inspection of the air_quality_global.csv File

In [4]:
# Bloc 3a ‚Äî Inspection et validation du fichier air_quality_global.csv

import pandas as pd

aq_path = "data/air_quality_global.csv"

# Charger un √©chantillon pour inspection
df_aq = pd.read_csv(aq_path)

# Aper√ßu g√©n√©ral
print("=== Aper√ßu du dataset ===")
print(df_aq.head(10))   # premi√®res lignes
print("\nColonnes disponibles :", df_aq.columns.tolist())
print("Nombre total de lignes :", len(df_aq))

# V√©rification des types et valeurs manquantes
print("\n=== Info ===")
print(df_aq.info())

print("\n=== Valeurs manquantes par colonne ===")
print(df_aq.isna().sum())

# V√©rification des doublons
nb_duplicates = df_aq.duplicated().sum()
print(f"\nNombre de doublons d√©tect√©s : {nb_duplicates}")

# Aper√ßu statistique des colonnes num√©riques
print("\n=== Statistiques descriptives ===")
print(df_aq.describe(include='all').transpose().head(20))

# Log
log_event("INFO", f"Inspection du fichier {aq_path} effectu√©e : {len(df_aq)} lignes, {nb_duplicates} doublons")
append_summary_md(f"- {utc_timestamp()} [INFO] Inspection du fichier {aq_path} : {len(df_aq)} lignes, {nb_duplicates} doublons")


  ts = datetime.utcnow().isoformat() + 'Z'


=== Aper√ßu du dataset ===
       city country  latitude  longitude  year  month  pm25_ugm3  no2_ugm3  \
0  New York     USA   40.7128    -74.006  1999      1      18.11     35.98   
1  New York     USA   40.7128    -74.006  1999      2      27.79     17.71   
2  New York     USA   40.7128    -74.006  1999      3      12.05     40.99   
3  New York     USA   40.7128    -74.006  1999      4      35.25     17.18   
4  New York     USA   40.7128    -74.006  1999      5      38.39     25.07   
5  New York     USA   40.7128    -74.006  1999      6      14.89     28.95   
6  New York     USA   40.7128    -74.006  1999      7      19.66     27.85   
7  New York     USA   40.7128    -74.006  1999      8      10.00     26.14   
8  New York     USA   40.7128    -74.006  1999      9      15.04     38.56   
9  New York     USA   40.7128    -74.006  1999     10      15.32     29.50   

  data_quality           measurement_method data_source  
0     Moderate  Reference/Equivalent Method     EPA_AQS 

Quick summary: We will calculate
ùëá
log
‚Å°
in global mode on the entire PM2.5 dataset (n = 6480, d = 1), then prepare a variant for each city (e.g., New York, n = 324).

Block 3b ‚Äî Calculation of T_log (global PM2.5)