# Imports

In [8]:
import pandas as pd

import re

# Temperature

In [9]:
def load_temperature_data(filepath, country):
    """Load a whitespace-delimited monthly temperature-anomaly file and average it per year.

    Only lines that start with a 4-digit year, an integer month and a decimal
    anomaly value are treated as data. Every other line (headers, comments,
    rows whose anomaly is not a decimal number, e.g. ``NaN``) is skipped.

    Parameters
    ----------
    filepath : str or path-like
        Path of the text file to read.
    country : str
        Label stored in the ``pays`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per year with columns ``année``, ``pays`` and
        ``anomalie_temp`` (mean of that year's monthly anomalies).
        An empty frame with those three columns is returned when the file
        contains no data line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a matched line's third token is not parseable as a float.
    """
    data_line = re.compile(r"^\s*\d{4}\s+\d+\s+[-+]?\d*\.\d+")

    records = []
    # Stream the file line by line instead of materialising it with readlines().
    with open(filepath, "r") as file:
        for line in file:
            if not data_line.match(line):
                continue
            # The regex guarantees at least three whitespace-separated tokens.
            year, month, anomaly = line.split()[:3]
            records.append(
                {
                    "année": int(year),
                    "mois": int(month),
                    "anomalie_temp": float(anomaly),
                    "pays": country,
                }
            )

    if not records:
        # pd.DataFrame([]) has no columns, so the groupby below would raise
        # KeyError; return an empty frame with the expected schema instead.
        return pd.DataFrame(columns=["année", "pays", "anomalie_temp"])

    df = pd.DataFrame(records)
    return df.groupby(["année", "pays"])["anomalie_temp"].mean().reset_index()

# Load the yearly mean temperature anomalies, one frame per country.
df_temp_fr = load_temperature_data(
    filepath="datasets/temp/france-TemperatureAVG-Trend.txt",
    country="France",
)
df_temp_in = load_temperature_data(
    filepath="datasets/temp/india-TemperatureAVG-Trend.txt",
    country="Inde",
)

# Stack both countries into one long frame (each frame keeps its own row index).
df_temp = pd.concat(objs=[df_temp_fr, df_temp_in])

# Precipitations

In [10]:
# Wide precipitation table: an 'année' column plus one column per series.
df_precip = pd.read_csv("datasets/pre/precipitations_1970_2020_synthetiques.csv")

df_precip_melted = (
    # Wide -> long: every non-'année' column becomes rows of
    # (année, colonne_origine, precipitations_mm).
    pd.melt(
        df_precip,
        id_vars='année',
        var_name='colonne_origine',
        value_name='precipitations_mm',
    )
    # Derive the country label from the source column name
    # ('france' -> 'France', 'inde' -> 'Inde'); names containing neither
    # token yield NaN.
    .assign(
        pays=lambda frame: frame['colonne_origine']
        .str.extract(r'(france|inde)', expand=False)
        .str.capitalize()
    )
    # The helper column is no longer needed once 'pays' is set.
    .drop(columns='colonne_origine')
)



# Disasters

In [11]:
# Read the per-country disaster tables and tag each with its country label.
df_fr = pd.read_csv("datasets/cata/france_disasters.csv")
df_fr['pays'] = 'France'
df_in = pd.read_csv("datasets/cata/india_disasters.csv")
df_in['pays'] = 'Inde'

# Stack both countries under a fresh 0..n-1 index.
df_cat = pd.concat(objs=[df_fr, df_in], ignore_index=True)

# The event year is the leading four digits of the "YYYY-XXXX-XXX" identifier.
df_cat['année'] = df_cat['DisNo.'].str.extract(r'^(\d{4})').astype(int)

# Restrict to the 1970-2020 window (both bounds included).
df_cat = df_cat.loc[lambda frame: frame['année'].between(1970, 2020)]

# Number of disasters per (année, pays); pairs without any event get no row.
df_catastrophes = (
    df_cat
    .groupby(['année', 'pays'])
    .size()
    .rename('nb_catastrophes')
    .reset_index()
)


# CO2 emissions

In [12]:
# CO2 emissions for France and India over 1970-2020, renamed to the column
# names and country labels used by the other tables.
# Every step returns a new frame, so no value is ever assigned into a
# filtered/column-subset slice (which is what raises SettingWithCopyWarning).
df_co2 = (
    pd.read_csv("datasets/co2/owid-co2-data.csv")
    .loc[
        lambda frame: frame['country'].isin(['France', 'India'])
        & frame['year'].between(1970, 2020),
        ['year', 'country', 'co2'],
    ]
    .rename(columns={'year': 'année', 'country': 'pays', 'co2': 'co2_emis'})
    # Only 'India' needs translating; 'France' is already the target label.
    .assign(pays=lambda frame: frame['pays'].replace({'India': 'Inde'}))
)


In [13]:
# Preview the first five per-year, per-country disaster counts.
df_catastrophes.head(n=5)

Unnamed: 0,année,pays,nb_catastrophes
0,1970,France,3
1,1970,Inde,4
2,1971,Inde,5
3,1972,France,2
4,1972,Inde,4


# Fusion finale

In [14]:
# Temperature is the base table; precipitation, CO2 and disaster counts are
# attached on (année, pays). Left joins keep every temperature row.
df_merged = (
    df_temp
    .merge(df_precip_melted, on=['année', 'pays'], how='left')
    .merge(df_co2, on=['année', 'pays'], how='left')
    .merge(df_catastrophes, on=['année', 'pays'], how='left')
)

# ---------- 6. Final clean-up ----------
df_merged = df_merged.sort_values(['pays', 'année']).reset_index(drop=True)

# Keep the 1970-2020 window for the two countries. `.copy()` makes df_final
# an independent frame so the column update below does not write into a slice.
df_final = df_merged[
    df_merged['année'].between(1970, 2020)
    & df_merged['pays'].isin(['France', 'Inde'])
].copy()

# df_catastrophes only holds (année, pays) pairs with at least one recorded
# disaster, so the left join leaves NaN where the count should be 0.
# NOTE(review): assumes the disaster files cover the whole 1970-2020 window
# for both countries — confirm before relying on the zeros.
df_final['nb_catastrophes'] = df_final['nb_catastrophes'].fillna(0).astype(int)

# ---------- 7. Save ----------
df_final.to_csv("donnees_climatiques_combinees.csv", index=False)
