# Cleaning the synop dataset

It can be found here:
https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32

### Import data:

In [1]:
import pandas as pd
import os
import glob

# Show the working directory: the relative "HistoricDataset/..." paths used
# in the cells below are resolved against it.
print(os.getcwd())

C:\Users\avere\Documents\Cours\Projet PPAL\AirIQ2\0_ResearchWork\3_CreateAndCleanDatasets


In [2]:
# Collect every downloaded SYNOP archive. sorted() makes the load order
# deterministic, since glob returns files in arbitrary filesystem order.
filesList = sorted(glob.glob(os.path.join("HistoricDataset/synopDownload", '*.csv.gz')))
if not filesList:
    # Fail early with an actionable message instead of the opaque
    # "No objects to concatenate" error pd.concat would raise below.
    raise FileNotFoundError(
        "No '*.csv.gz' file found in HistoricDataset/synopDownload "
        "(current directory: {})".format(os.getcwd())
    )

dfList = []

for filename in filesList:
    station_df = pd.read_csv(filename, header=0, delimiter=';')

    # Select data from Lille-Lesquin (station id 7015). The explicit copy
    # makes the 'date' assignment below operate on an independent frame
    # rather than on a slice of the file's frame (no SettingWithCopyWarning).
    station_df = station_df.loc[station_df['numer_sta'] == 7015].copy()
    # Change datetime format: YYYYMMDDHHMMSS -> datetime64
    station_df['date'] = pd.to_datetime(station_df['date'], format='%Y%m%d%H%M%S')
    dfList.append(station_df)

# Concatenate all per-file frames into one frame with a fresh 0..n-1 index
df = pd.concat(dfList, axis=0, ignore_index=True)
df.head()

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4,Unnamed: 59
0,7015,1996-01-01 00:00:00,100020,100,1,0,0.0,275.95,275.35,96,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
1,7015,1996-01-01 03:00:00,100080,60,0,150,2.0,275.25,274.85,97,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
2,7015,1996-01-01 06:00:00,100080,0,5,150,2.0,275.15,274.75,97,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
3,7015,1996-01-01 09:00:00,100110,30,0,90,2.0,275.35,274.95,97,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
4,7015,1996-01-01 12:00:00,100110,0,0,70,1.0,276.45,276.04,97,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,


Here is the description of the dataset:
![dataset description](pictures/doc_parametres_synop_168-1.jpg)
And here is the station id of Lille:
![id station](pictures/idstation.png)

### Selecting features:

In [3]:
# Keep only the timestamp and the five measurement columns that are renamed
# and cleaned in the following cells; every other column is dropped.
interestingFeatures = ["date", "pres", "dd", "ff", "u", "t"]
df = df.loc[:, interestingFeatures]
df.head()

Unnamed: 0,date,pres,dd,ff,u,t
0,1996-01-01 00:00:00,99380,0,0.0,96,275.95
1,1996-01-01 03:00:00,99440,150,2.0,97,275.25
2,1996-01-01 06:00:00,99440,150,2.0,97,275.15
3,1996-01-01 09:00:00,99470,90,2.0,97,275.35
4,1996-01-01 12:00:00,99470,70,1.0,97,276.45


In [4]:
# Replace the short column codes with readable names (positional, same order
# as the selected columns).
df.columns = [
    "date",
    "pressure",
    "wind_direction",
    "wind_force",
    "humidity",
    "temperature",
]
# Order the rows chronologically and index them by timestamp; drop=False keeps
# the 'date' column alongside the index.
df = df.sort_values(by="date").set_index("date", drop=False)
df.head()

Unnamed: 0_level_0,date,pressure,wind_direction,wind_force,humidity,temperature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1996-01-01 00:00:00,1996-01-01 00:00:00,99380,0,0.0,96,275.95
1996-01-01 03:00:00,1996-01-01 03:00:00,99440,150,2.0,97,275.25
1996-01-01 06:00:00,1996-01-01 06:00:00,99440,150,2.0,97,275.15
1996-01-01 09:00:00,1996-01-01 09:00:00,99470,90,2.0,97,275.35
1996-01-01 12:00:00,1996-01-01 12:00:00,99470,70,1.0,97,276.45


### Cleaning missing values

In [5]:
import numpy as np

# The loaded data contains the placeholder string 'mq' where a value is absent
# (presumably "manquant", i.e. missing -- confirm against the dataset docs).
# Turn it into a real NaN so pandas can count and fill it.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace('mq', np.nan)

In [6]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

date                0
pressure           17
wind_direction    137
wind_force        134
humidity           25
temperature        11
dtype: int64

In [7]:
# Fill each gap with the last valid observation before it. The rows are
# already in chronological order, so this carries the previous reading forward.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases.
df = df.ffill()
# Check that no missing value remains.
df.isna().sum()

date              0
pressure          0
wind_direction    0
wind_force        0
humidity          0
temperature       0
dtype: int64

### Cleaning the types of data:

In [8]:
# Inspect the column types before converting them to numeric dtypes.
df.dtypes

date              datetime64[ns]
pressure                  object
wind_direction            object
wind_force                object
humidity                  object
temperature               object
dtype: object

In [9]:
# Convert the measurement columns to numeric dtypes in a single pass:
# integers for pressure, wind direction and humidity, floats for wind force
# and temperature. The 'date' column is left untouched.
df = df.astype({
    "pressure": int,
    "wind_direction": int,
    "wind_force": float,
    "humidity": int,
    "temperature": float,
})
df.dtypes

date              datetime64[ns]
pressure                   int32
wind_direction             int32
wind_force               float64
humidity                   int32
temperature              float64
dtype: object

In [10]:
# Preview the first rows of the frame after the dtype conversion.
df.head()

Unnamed: 0_level_0,date,pressure,wind_direction,wind_force,humidity,temperature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1996-01-01 00:00:00,1996-01-01 00:00:00,99380,0,0.0,96,275.95
1996-01-01 03:00:00,1996-01-01 03:00:00,99440,150,2.0,97,275.25
1996-01-01 06:00:00,1996-01-01 06:00:00,99440,150,2.0,97,275.15
1996-01-01 09:00:00,1996-01-01 09:00:00,99470,90,2.0,97,275.35
1996-01-01 12:00:00,1996-01-01 12:00:00,99470,70,1.0,97,276.45


In [11]:
# Summary statistics, used as a sanity check on the value ranges.
df.describe()

Unnamed: 0,pressure,wind_direction,wind_force,humidity,temperature
count,70381.0,70381.0,70381.0,70381.0,70381.0
mean,101026.891349,186.342905,4.267139,79.244611,284.229234
std,967.774294,97.296695,2.501746,15.969099,6.836402
min,96100.0,0.0,0.0,18.0,260.85
25%,100460.0,110.0,2.4,70.0,279.35
50%,101090.0,200.0,4.1,84.0,284.15
75%,101670.0,260.0,5.9,92.0,288.95
max,104280.0,360.0,22.1,100.0,320.05


In [12]:
# Average the measurements on a regular 3-hour grid built from the datetime
# index; slots with no observation come out as NaN.
# - The duplicated 'date' column is dropped first so only the numeric
#   measurements are averaged, whatever the pandas version's default is.
# - The rule is passed as a Timedelta because the '3H' string alias is
#   deprecated in recent pandas releases.
# NOTE(review): the result is only displayed, not assigned back to df, so the
# CSV saved below is the un-resampled frame -- confirm this is intended.
df.drop(columns="date").resample(pd.Timedelta(hours=3)).mean()

Unnamed: 0_level_0,pressure,wind_direction,wind_force,humidity,temperature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1996-01-01 00:00:00,99380.0,0.0,0.0,96.0,275.95
1996-01-01 03:00:00,99440.0,150.0,2.0,97.0,275.25
1996-01-01 06:00:00,99440.0,150.0,2.0,97.0,275.15
1996-01-01 09:00:00,99470.0,90.0,2.0,97.0,275.35
1996-01-01 12:00:00,99470.0,70.0,1.0,97.0,276.45
...,...,...,...,...,...
2020-02-29 09:00:00,98190.0,180.0,10.4,74.0,283.45
2020-02-29 12:00:00,98270.0,250.0,11.7,83.0,280.55
2020-02-29 15:00:00,98600.0,240.0,10.8,57.0,282.35
2020-02-29 18:00:00,98820.0,230.0,7.5,59.0,280.85


### Save dataset

In [13]:
# Create the destination folder if needed: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs("HistoricDataset/synopComplete", exist_ok=True)
# index=False skips the datetime index; the 'date' column still carries the
# timestamps in the written file.
df.to_csv("HistoricDataset/synopComplete/completeSynopDataset.csv", index=False, sep=';')