<center><img style="width:500px;" src="https://s3.amazonaws.com/thinkific-import/370184/U9zyVFboQScufA7Iih3p_Capture_d_e_cran_2020_09_23_a__00_11_29_png"></center>

# Parcours : Le Data Mining pour tous

## Niveau 3 : Automatiser votre travail

### Cas pratique : Les données COVID-19

#### Etape 0 : Initialisation du notebook

In [1]:
# Import des modules

import pandas as pd
from datetime import datetime, timedelta, date

#### Etape 1 : Récupération des données sources

Source COVID-19 (John Hopkins University)<br>
Données mise à jour tous les jours entre 2h et 8h (CET)<br> 
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

In [2]:
# Récupérer un fichier sur les données COVID
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
df = pd.read_csv(url)
df.head(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/19/20,9/20/20,9/21/20,9/22/20,9/23/20,9/24/20,9/25/20,9/26/20,9/27/20,9/28/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,38919,39044,39074,39096,39145,39170,39186,39192,39227,39233
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,12226,12385,12535,12666,12787,12921,13045,13153,13259,13391
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,49623,49826,50023,50214,50400,50579,50754,50914,51067,51213
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,1564,1564,1681,1681,1753,1753,1836,1836,1836,1966
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,3901,3991,4117,4236,4363,4475,4590,4672,4718,4797


In [3]:
# Url du répertoire github
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'

# Liste des fichiers à récupérer sur le répertoire
files = [
         'time_series_covid19_confirmed_global.csv',
         'time_series_covid19_deaths_global.csv',
         'time_series_covid19_recovered_global.csv'
        ]

# Boucle pour récupérer et concaténer les fichier sources
data_source = pd.DataFrame()
for file in files:
    df = pd.read_csv(f'{url}{file}')
    df['Type'] = file[20:-11].capitalize()
    data_source = pd.concat([data_source, df], axis=0)
    
data_source.head(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/20/20,9/21/20,9/22/20,9/23/20,9/24/20,9/25/20,9/26/20,9/27/20,9/28/20,Type
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,39044,39074,39096,39145,39170,39186,39192,39227,39233,Confirmed
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,12385,12535,12666,12787,12921,13045,13153,13259,13391,Confirmed
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,49826,50023,50214,50400,50579,50754,50914,51067,51213,Confirmed
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,1564,1681,1681,1753,1753,1836,1836,1836,1966,Confirmed
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,3991,4117,4236,4363,4475,4590,4672,4718,4797,Confirmed


#### Etape 2 - Mise au format base de donnée

In [4]:
# Initialisation du dataframe
data_pivot = data_source.copy()

# Colonnes à supprimer
to_del = ['Province/State', 'Lat', 'Long']

# Suppression des colonnes inutiles
data_pivot = data_pivot.drop(to_del, axis=1)

# Colonnes à conserver
to_keep = ['Type', 'Country/Region']

# Pivot des colonnes
data_pivot = data_pivot.melt(id_vars=to_keep, 
                             value_name='Value', 
                             var_name='Date')



# Colonnes à aggréger
to_group = ['Date', 'Country/Region', 'Type']

# Regroupement et aggrégation
data_pivot = data_pivot.groupby(to_group, as_index=False).agg({'Value': 'sum'})
data_pivot.head(5)

Unnamed: 0,Date,Country/Region,Type,Value
0,1/22/20,Afghanistan,Confirmed,0
1,1/22/20,Afghanistan,Deaths,0
2,1/22/20,Afghanistan,Recovered,0
3,1/22/20,Albania,Confirmed,0
4,1/22/20,Albania,Deaths,0


#### Etape 3 - Nettoyage de la base

In [5]:
# Initialisation du dataframe
data_clean = data_pivot.copy()

# Mise au format de la date
data_clean['Date'] = pd.to_datetime(data_clean['Date'], format='%m/%d/%y')

# Renomage de la colonne 'Close' en 'VALUE'
data_clean = data_clean.rename(columns={'Country/Region': 'Country'})

# Format des colonnes en majuscule
data_clean.columns = map(lambda x: str(x).upper(), data_clean.columns)
    
data_clean.head(5)

Unnamed: 0,DATE,COUNTRY,TYPE,VALUE
0,2020-01-22,Afghanistan,Confirmed,0
1,2020-01-22,Afghanistan,Deaths,0
2,2020-01-22,Afghanistan,Recovered,0
3,2020-01-22,Albania,Confirmed,0
4,2020-01-22,Albania,Deaths,0


#### Etape 4 - Enrichissement des données

4.1 - CALCUL DE LA VALEUR DU J-1

In [6]:
# Ajout des indexes dans le data frame principal
data_value = data_clean.copy()
data_value['DATE_D-1'] = data_value['DATE'] + timedelta(days=-1)

# Création du dataframe pour récupérer la valeur précédente
data_value_last = data_clean.copy()
to_rename = {'VALUE': 'VALUE_D-1', 'DATE': 'DATE_D-1'}
data_value_last = data_value_last.rename(columns=to_rename)

data_value = pd.merge(data_value,
                      data_value_last,
                      on=['DATE_D-1', 'COUNTRY', 'TYPE'],
                      how='left')

data_value.head(5)

Unnamed: 0,DATE,COUNTRY,TYPE,VALUE,DATE_D-1,VALUE_D-1
0,2020-01-22,Afghanistan,Confirmed,0,2020-01-21,
1,2020-01-22,Afghanistan,Deaths,0,2020-01-21,
2,2020-01-22,Afghanistan,Recovered,0,2020-01-21,
3,2020-01-22,Albania,Confirmed,0,2020-01-21,
4,2020-01-22,Albania,Deaths,0,2020-01-21,


4.2 - AJOUT DES CONTINENTS

In [7]:
# Initialisation du dataframe
data_enr = data_value.copy()

# Suppression de la colonne 'DATE_D-1'
data_enr = data_enr.drop(['DATE_D-1'], axis=1)

# Lecture du fichier REF_CONTINENT
ref_continent = pd.read_excel('REF_CS.xlsx',
                              sheet_name='REF_CONTINENT')

# Jointure avec le dataframe principal
data_enr = pd.merge(data_enr, 
                    ref_continent, 
                    on=['COUNTRY'],
                    how='left'
                   )

# Nettoyage et affectation d'une valeur pour les valeurs nulles
data_enr['CONTINENT'] = data_enr['CONTINENT'].fillna(0)
data_enr.loc[data_enr['CONTINENT'] == 0, 'CONTINENT'] = 'To be affected'

data_enr.head(5)

Unnamed: 0,DATE,COUNTRY,TYPE,VALUE,VALUE_D-1,CONTINENT
0,2020-01-22,Afghanistan,Confirmed,0,,Asia
1,2020-01-22,Afghanistan,Deaths,0,,Asia
2,2020-01-22,Afghanistan,Recovered,0,,Asia
3,2020-01-22,Albania,Confirmed,0,,Europe
4,2020-01-22,Albania,Deaths,0,,Europe


4.3 - CALCUL DES CAS ACTIFS

In [8]:
# Calcul des cas actifs
data_active = data_enr.copy()
data_active.loc[data_active['TYPE'] != 'Confirmed', 'VALUE'] = data_active['VALUE'] * (-1)
data_active.loc[data_active['TYPE'] != 'Confirmed', 'VALUE_D-1'] = data_active['VALUE_D-1'] * (-1)
data_active.loc[:, 'TYPE'] = 'Active Cases'

# Colonnes à grouper
to_group = ['DATE', 'COUNTRY', 'CONTINENT', 'TYPE']

# Colonnes à aggréger 
to_agg = {'VALUE': 'sum', 'VALUE_D-1': 'sum'}

# Regroupement et aggrégation
data_active = data_active.groupby(to_group, as_index=False).agg(to_agg)
data_active.head(5)

Unnamed: 0,DATE,COUNTRY,CONTINENT,TYPE,VALUE,VALUE_D-1
0,2020-01-22,Afghanistan,Asia,Active Cases,0,0.0
1,2020-01-22,Albania,Europe,Active Cases,0,0.0
2,2020-01-22,Algeria,Africa,Active Cases,0,0.0
3,2020-01-22,Andorra,Europe,Active Cases,0,0.0
4,2020-01-22,Angola,Africa,Active Cases,0,0.0


4.4 - CONCATENATION DES 2 BASES DE DONNEES + CALCUL DES VARIATIONS

In [9]:
# Initialisation du dataframe
data_bdd = data_enr.copy()

# Concat
data_bdd = pd.concat([data_enr, data_active], axis=0)

# Calcul des variations
data_bdd['VARV'] = data_bdd['VALUE'] - data_bdd['VALUE_D-1']
data_bdd['VARP'] = (data_bdd['VARV'] / abs(data_bdd['VALUE_D-1']))

data_bdd.head(5)

Unnamed: 0,DATE,COUNTRY,TYPE,VALUE,VALUE_D-1,CONTINENT,VARV,VARP
0,2020-01-22,Afghanistan,Confirmed,0,,Asia,,
1,2020-01-22,Afghanistan,Deaths,0,,Asia,,
2,2020-01-22,Afghanistan,Recovered,0,,Asia,,
3,2020-01-22,Albania,Confirmed,0,,Europe,,
4,2020-01-22,Albania,Deaths,0,,Europe,,


#### Etape 5 : Sauvegarde et exposition du csv

In [10]:
# Ajout de la date de mise à jour
data_bdd['DATE_MAJ'] = datetime.now().strftime('%Y/%m/%d %H:%M:%S')

# Sauvegarde des données au format csv
data_bdd.to_csv('BASE_COVID.csv',
                sep=";",
                decimal=",",
                index=False)

print('CSV sauvegardé dans le répertoire courant.')

CSV sauvegardé dans le répertoire courant.
