# Import d'un csv depuis Google Drive, nettoyage et re-export vers BigQuery.

In [32]:
import pandas as pd
import numpy as np
import os
import shapely
# libs nécessaires pour requêter BigQuery
from google.cloud import bigquery
from google.oauth2 import service_account



## Chargement du fichier csv depuis Google Drive

In [27]:
url='https://drive.google.com/file/d/1S0IrERlohKS74B95lEIageVfuf83jYsc/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url)

In [28]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,PA15L:NO2,ARG:NO2,OPERA:NO2,RN20:NO2,BP_EST:NO2,AUB:NO2,A1:NO2,BASCH:NO2,BOB:NO2,...,TREMB:NO2,HAUS:NO2,VERS:NO2,VILLEM:NO2,VITRY:NO2,RN2:NO2,RN6:NO2,PA04C:NO2,AUT:NO2,SOULT:NO2
0,,Paris stade Lenglen,ARGENTEUIL,Place de l'Opéra,RN20 Montlhéry,Boulevard Périphérique Est,AUBERVILLIERS,Autoroute A1 - Saint-Denis,Place Victor Basch,BOBIGNY,...,TREMBLAY-EN-FRANCE,Boulevard Haussmann,VERSAILLES,VILLEMOMBLE,VITRY-SUR-SEINE,Route nationale 2 - PANTIN,Route Nationale 6 - MELUN,Paris Centre,Boulevard Péripherique Auteuil,Boulevard Soult
1,,PA15L,ARG,OPERA,RN20,BP_EST,AUB,A1,BASCH,BOB,...,TREMB,HAUS,VERS,VILLEM,VITRY,RN2,RN6,PA04C,AUT,SOULT
2,,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,...,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote,dioxyde d azote
3,,NO2,NO2,NO2,NO2,NO2,NO2,NO2,NO2,NO2,...,NO2,NO2,NO2,NO2,NO2,NO2,NO2,NO2,NO2,NO2
4,,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,...,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3,microg/m3
5,2018-01-01 01:00:00Z,4.0,,,13.0,13.0,12.0,65.0,23.0,9.0,...,9.0,53.0,1.0,5.0,4.0,18.0,12.0,17.0,,14.0
6,2018-01-01 02:00:00Z,4.0,,,14.0,15.0,13.0,59.0,23.0,8.0,...,9.0,44.0,2.0,6.0,3.0,19.0,14.0,14.0,,14.0
7,2018-01-01 03:00:00Z,4.0,,,26.0,20.0,12.0,60.0,30.0,9.0,...,10.0,46.0,2.0,8.0,5.0,21.0,19.0,15.0,,16.0
8,2018-01-01 04:00:00Z,4.0,,,21.0,25.0,14.0,75.0,27.0,8.0,...,10.0,40.0,2.0,8.0,8.0,22.0,13.0,13.0,,20.0
9,2018-01-01 05:00:00Z,4.0,,,17.0,25.0,13.0,61.0,25.0,8.0,...,10.0,34.0,2.0,10.0,8.0,19.0,9.0,12.0,,18.0


## Cleaning des rows inutiles et renaming de la première valeur du header null en "Date"

In [29]:
df.loc[1, 'Unnamed: 0'] = 'Date'
df = df.drop(df.index[[0, 2, 3, 4]])
new_header = df.iloc[0] #grab the first row for the header
df = df[1:] #take the data less the header row
df.columns = new_header #set the header row as the df header
df.head(10)

1,Date,PA15L,ARG,OPERA,RN20,BP_EST,AUB,A1,BASCH,BOB,...,TREMB,HAUS,VERS,VILLEM,VITRY,RN2,RN6,PA04C,AUT,SOULT
5,2018-01-01 01:00:00Z,4.0,,,13.0,13.0,12.0,65.0,23.0,9.0,...,9.0,53.0,1.0,5.0,4.0,18.0,12.0,17.0,,14.0
6,2018-01-01 02:00:00Z,4.0,,,14.0,15.0,13.0,59.0,23.0,8.0,...,9.0,44.0,2.0,6.0,3.0,19.0,14.0,14.0,,14.0
7,2018-01-01 03:00:00Z,4.0,,,26.0,20.0,12.0,60.0,30.0,9.0,...,10.0,46.0,2.0,8.0,5.0,21.0,19.0,15.0,,16.0
8,2018-01-01 04:00:00Z,4.0,,,21.0,25.0,14.0,75.0,27.0,8.0,...,10.0,40.0,2.0,8.0,8.0,22.0,13.0,13.0,,20.0
9,2018-01-01 05:00:00Z,4.0,,,17.0,25.0,13.0,61.0,25.0,8.0,...,10.0,34.0,2.0,10.0,8.0,19.0,9.0,12.0,,18.0
10,2018-01-01 06:00:00Z,5.0,,,17.0,35.0,14.0,57.0,26.0,9.0,...,9.0,29.0,7.0,10.0,11.0,19.0,9.0,15.0,,31.0
11,2018-01-01 07:00:00Z,6.0,,,12.0,32.0,16.0,37.0,27.0,11.0,...,11.0,22.0,7.0,11.0,8.0,24.0,12.0,20.0,,26.0
12,2018-01-01 08:00:00Z,5.0,,,12.0,32.0,14.0,32.0,29.0,13.0,...,10.0,16.0,2.0,11.0,7.0,30.0,12.0,15.0,,15.0
13,2018-01-01 09:00:00Z,5.0,,,13.0,31.0,11.0,37.0,31.0,9.0,...,7.0,19.0,2.0,7.0,7.0,25.0,9.0,12.0,,13.0
14,2018-01-01 10:00:00Z,6.0,,,19.0,44.0,12.0,43.0,35.0,10.0,...,6.0,27.0,2.0,8.0,8.0,28.0,11.0,16.0,,14.0


## Interpolation des valeurs NaN grâce à des méthodes linéaires et cubic, et résumé de l'opération.

In [30]:
# Count the number of NaN values before interpolation
nan_count_before = df.isna().sum().sum()

# Perform the interpolation (your existing code)
for col in df.columns:
    if col != 'Date':
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, set invalid values to NaN

df = df.apply(lambda col: col.interpolate(method='linear', limit=3) if col.name != 'Date' else col)

df = df.apply(
    lambda col: col.interpolate(method='cubic') if col.name != 'Date' and col.notna().sum() >= 4 else col
)

for col in df.columns:
    if col != 'Date':
        nan_indices = df[col][df[col].isna()].index
        for idx in nan_indices:
            neighbors = df.drop(columns=['Date', col]).loc[idx]
            df.at[idx, col] = neighbors.mean()

# Count the number of NaN values after interpolation
nan_count_after = df.isna().sum().sum()

# Calculate the number of interpolated values
interpolated_values = nan_count_before - nan_count_after

# Calculate the total number of values in the DataFrame (excluding the 'Date' column)
total_values = df.drop(columns=['Date']).size

# Calculate the percentage of interpolated values
percentage_interpolated = (interpolated_values / total_values) * 100

print(f"Number of interpolated values: {interpolated_values}")
print(f"Total number of values: {total_values}")
print(f"Percentage of interpolated values: {percentage_interpolated:.2f}%")

# Check the result
df.head(10)

Number of interpolated values: 24377
Total number of values: 359160
Percentage of interpolated values: 6.79%


1,Date,PA15L,ARG,OPERA,RN20,BP_EST,AUB,A1,BASCH,BOB,...,TREMB,HAUS,VERS,VILLEM,VITRY,RN2,RN6,PA04C,AUT,SOULT
5,2018-01-01 01:00:00Z,4.0,12.108108,12.108108,13.0,13.0,12.0,65.0,23.0,9.0,...,9.0,53.0,1.0,5.0,4.0,18.0,12.0,17.0,12.108108,14.0
6,2018-01-01 02:00:00Z,4.0,12.297297,12.297297,14.0,15.0,13.0,59.0,23.0,8.0,...,9.0,44.0,2.0,6.0,3.0,19.0,14.0,14.0,12.297297,14.0
7,2018-01-01 03:00:00Z,4.0,13.594595,13.594595,26.0,20.0,12.0,60.0,30.0,9.0,...,10.0,46.0,2.0,8.0,5.0,21.0,19.0,15.0,13.594595,16.0
8,2018-01-01 04:00:00Z,4.0,13.891892,13.891892,21.0,25.0,14.0,75.0,27.0,8.0,...,10.0,40.0,2.0,8.0,8.0,22.0,13.0,13.0,13.891892,20.0
9,2018-01-01 05:00:00Z,4.0,12.27027,12.27027,17.0,25.0,13.0,61.0,25.0,8.0,...,10.0,34.0,2.0,10.0,8.0,19.0,9.0,12.0,12.27027,18.0
10,2018-01-01 06:00:00Z,5.0,13.432432,13.432432,17.0,35.0,14.0,57.0,26.0,9.0,...,9.0,29.0,7.0,10.0,11.0,19.0,9.0,15.0,13.432432,31.0
11,2018-01-01 07:00:00Z,6.0,13.72973,13.72973,12.0,32.0,16.0,37.0,27.0,11.0,...,11.0,22.0,7.0,11.0,8.0,24.0,12.0,20.0,13.72973,26.0
12,2018-01-01 08:00:00Z,5.0,12.27027,12.27027,12.0,32.0,14.0,32.0,29.0,13.0,...,10.0,16.0,2.0,11.0,7.0,30.0,12.0,15.0,12.27027,15.0
13,2018-01-01 09:00:00Z,5.0,11.459459,11.459459,13.0,31.0,11.0,37.0,31.0,9.0,...,7.0,19.0,2.0,7.0,7.0,25.0,9.0,12.0,11.459459,13.0
14,2018-01-01 10:00:00Z,6.0,13.459459,13.459459,19.0,44.0,12.0,43.0,35.0,10.0,...,6.0,27.0,2.0,8.0,8.0,28.0,11.0,16.0,13.459459,14.0


## Export BigQuery

In [34]:
# Mise en place des credentials BigQuery

# définition de l'emplacement de la clé du compte de service Google
credpath = os.path.join("..", "..", "..", "..", "..", "credentials", "artefact-da53-projet-final-b60d2589fda1.json")

# définition des credentials Google en variable d'environnement en pointant vers la clé du compte de service Google
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credpath

# préparation et définition des credentials du compte de service Google en pointant vers la clé JSON et en définissant les scopes d'action : drive (on a des tables à base de Google Sheets), cloud plateform, et bigquery.
credentials = service_account.Credentials.from_service_account_file(credpath,  scopes=["https://www.googleapis.com/auth/drive","https://www.googleapis.com/auth/cloud-platform","https://www.googleapis.com/auth/bigquery"])

In [35]:
# Export BigQuery - Création d'un nouveau dataset, d'une nouvelle table, et remplissage de la table avec un dataframe Pandas

# Init the BQ client
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Define the dataset id
dataset_id = 'artefact-da53-projet-final.04bis_AirQuality_AlternativeMethod'

# Create a Dataset object
dataset = bigquery.Dataset(dataset_id)

# Create the dataset
dataset = client.create_dataset(dataset, exists_ok=True)

print(f'Dataset créé : {client.project}.{dataset.dataset_id}')

# Define the table id
table_id = 'artefact-da53-projet-final.04bis_AirQuality_AlternativeMethod.2018_AirQuality_NO2_clean_interpolated' # remplacer nom du dataset (si besoin) et de la table (obligatoire !) 

# Create a Table object
table = bigquery.Table(table_id)

# Create a table
table = client.create_table(table, exists_ok=True)

print(f'Table créée : {client.project}.{dataset.dataset_id}.{table.table_id}')


# envoi du dataframe vers BQ
df.to_gbq(destination_table=table_id, if_exists='replace')

Dataset créé : artefact-da53-projet-final.04bis_AirQuality_AlternativeMethod
Table créée : artefact-da53-projet-final.04bis_AirQuality_AlternativeMethod.2018_AirQuality_NO2_clean_interpolated


  df.to_gbq(destination_table=table_id, if_exists='replace')
100%|██████████| 1/1 [00:00<00:00, 11949.58it/s]
