In [12]:
import pandas as pd

##### Add population figures to the dataset #####

# Read the two input tables: the per-country population file and the
# COVID ETL output.
population_csv, covid_csv = 'country_population_2020.csv', 'data_etl_output.csv'
population_df = pd.read_csv(population_csv)
covid_df = pd.read_csv(covid_csv)

# Lookup of country names that are spelled differently between the two inputs:
# each pair is (name to replace in the COVID data, replacement name).
country_name_mapping = dict([
    ('Burma', 'Myanmar'),
    ('Congo (Brazzaville)', 'Congo'),
    ('Congo (Kinshasa)', 'DR Congo'),
    ('Czechia', 'Czech Republic (Czechia)'),
    ('Taiwan*', 'Taiwan'),
    ('US', 'United States'),
    ('West Bank and Gaza', 'State of Palestine'),
    # Identity entry (the name is left unchanged). Original note: Kosovo is
    # not in the population file.
    ('Kosovo', 'Kosovo'),
    ("Cote d'Ivoire", "Côte d'Ivoire"),
    ('Saint Kitts and Nevis', 'Saint Kitts & Nevis'),
    ('Saint Vincent and the Grenadines', 'St. Vincent & Grenadines'),
    ('Sao Tome and Principe', 'Sao Tome & Principe'),
])

# Hand-built one-row table for Kosovo (original note: its population is empty
# in the source data, so it is added manually).
# Only the name and the 2020 population are supplied. Every other column is
# filled with NaN rather than an empty string, so that pandas recognises the
# value as missing (`isna()`) and numeric columns are not forced to hold
# strings once this row is concatenated with other data.
kosovo_data = pd.DataFrame({
    'Country (or dependency)': ['Kosovo'],
    'Population (2020)': [1920079],
    **{
        unknown_column: [float('nan')]
        for unknown_column in (
            'Yearly Change',
            'Net Change',
            'Density (P/Km²)',
            'Land Area (Km²)',
            'Migrants (net)',
            'Fert. Rate',
            'Med. Age',
            'Urban Pop %',
            'World Share',
        )
    },
})

# Append the hand-built Kosovo row to the population table.
population_df = pd.concat([population_df, kosovo_data], ignore_index=True)

# Normalise country names in both frames by trimming surrounding whitespace.
population_df['Country'] = population_df['Country (or dependency)'].str.strip()
covid_df['Country'] = covid_df['Country'].str.strip()

# Rewrite the COVID-side names using the name mapping so they can be matched
# against the population table.
covid_df['Country'] = covid_df['Country'].replace(country_name_mapping)

# Build a lookup with exactly one population value per country name. If a name
# occurs more than once (for example a country that is both in the file and in
# the manually appended row), the first occurrence wins; without this, the
# left join below would silently duplicate every COVID row for that country.
population_lookup = (
    population_df[['Country', 'Population (2020)']]
    .drop_duplicates(subset='Country', keep='first')
)

# Report COVID country names that have no counterpart in the lookup: their
# population will be NaN after the join, which is easy to miss otherwise.
unmatched_countries = sorted(
    set(covid_df['Country'].dropna()) - set(population_lookup['Country'].dropna())
)
if unmatched_countries:
    print(
        f"Warning: no population entry for {len(unmatched_countries)} "
        f"country name(s): {unmatched_countries}"
    )

# Left join keeps every COVID row. `validate` makes pandas check that the
# lookup side has unique keys, i.e. that the join cannot add rows. The
# population column is then renamed to its final name.
final_df = pd.merge(
    covid_df,
    population_lookup,
    on='Country',
    how='left',
    validate='many_to_one',
).rename(columns={'Population (2020)': 'Population'})

In [26]:
##### Add per-100K columns (Confirmed_per_100K, ...) as extra metrics for the models #####

# Population expressed in units of 100,000; shared by the three ratios below.
population_in_100k = final_df["Population"] / 100000
for metric in ("Confirmed", "Deaths", "Recovered"):
    final_df[f"{metric}_per_100K"] = final_df[metric] / population_in_100k

##### Trends and moving averages #####
# NOTE(review): the windows follow the existing row order inside each country
# group; presumably rows are already in chronological order — confirm.
for metric in ['Confirmed', 'Deaths', 'Recovered']:
    per_country = final_df.groupby('Country')[metric]

    # Mean over the current and up to two preceding rows of the same country
    # (shorter windows are allowed at the start of each group).
    final_df[f'{metric}_rolling_avg3'] = per_country.transform(
        lambda values: values.rolling(3, min_periods=1).mean()
    )

    # Same 3-row mean, applied to the row-to-row difference; positions where
    # no value can be computed are set to 0.
    final_df[f'{metric}_trend'] = per_country.transform(
        lambda values: values.diff().rolling(3, min_periods=1).mean().fillna(0)
    )

In [27]:
# Write the resulting table to CSV, leaving out the DataFrame index.
output_csv = 'mspr2_dataset.csv'
final_df.to_csv(output_csv, index=False)