# Mix datasets

This notebook merges all our datasets so that everything we need ends up in a single file.

In [1]:
import pandas as pd
from datetime import datetime

### Import datasets

#### IQ :

In [2]:
# Load the semicolon-separated IQ export and parse its `date` column as
# timezone-aware UTC timestamps in a single chain.
dfiq = pd.read_csv(
    "HistoricDataset/iqComplete/completeIQDataset.csv",
    sep=';',
    header=0,
).assign(date=lambda frame: pd.to_datetime(frame['date'], utc=True))
dfiq.head()

Unnamed: 0,date,IQ
0,2019-01-01 00:00:00+00:00,3
1,2019-01-02 00:00:00+00:00,3
2,2019-01-03 00:00:00+00:00,4
3,2019-01-04 00:00:00+00:00,4
4,2019-01-05 00:00:00+00:00,4


In [3]:
# Inspect the column dtypes of the IQ frame after the UTC datetime conversion.
dfiq.dtypes

date    datetime64[ns, UTC]
IQ                    int64
dtype: object

In [4]:
# Number of rows in the IQ frame.
dfiq.shape[0]

448

#### Synop : 

In [5]:
# Load the semicolon-separated synop export and parse its `date` column as
# timezone-aware UTC timestamps in a single chain.
dfsynop = pd.read_csv(
    "HistoricDataset/synopComplete/completeSynopDataset.csv",
    sep=';',
    header=0,
).assign(date=lambda frame: pd.to_datetime(frame['date'], utc=True))
dfsynop.head()

Unnamed: 0,date,pressure,wind_direction,wind_force,humidity,temperature
0,1996-01-01 00:00:00+00:00,99380,0,0.0,96,275.95
1,1996-01-01 03:00:00+00:00,99440,150,2.0,97,275.25
2,1996-01-01 06:00:00+00:00,99440,150,2.0,97,275.15
3,1996-01-01 09:00:00+00:00,99470,90,2.0,97,275.35
4,1996-01-01 12:00:00+00:00,99470,70,1.0,97,276.45


In [6]:
# Inspect the column dtypes of the synop frame after the UTC datetime conversion.
dfsynop.dtypes

date              datetime64[ns, UTC]
pressure                        int64
wind_direction                  int64
wind_force                    float64
humidity                        int64
temperature                   float64
dtype: object

In [7]:
# Number of rows in the synop frame.
dfsynop.shape[0]

70381

### Mix both datasets
We want to attach an IQ value to each synop row, matching rows on their calendar day.

In [8]:
def getDay(row):
    """Return the calendar day of ``row["date"]`` as a ``(year, month, day)`` tuple.

    ``row`` is anything indexable by ``"date"`` whose value exposes ``year``,
    ``month`` and ``day`` attributes (e.g. a DataFrame row holding a timestamp).
    """
    stamp = row["date"]
    return (stamp.year, stamp.month, stamp.day)

# Build the (year, month, day) merge key directly from the `date` column with
# the vectorized `.dt` accessors. A row-wise `DataFrame.apply(axis=1)` builds a
# Series for every row just to read one field, which is needlessly slow on the
# synop frame. The resulting tuples are the same ones `getDay` returns.
dfsynop["day"] = list(zip(
    dfsynop["date"].dt.year,
    dfsynop["date"].dt.month,
    dfsynop["date"].dt.day,
))
dfiq["day"] = list(zip(
    dfiq["date"].dt.year,
    dfiq["date"].dt.month,
    dfiq["date"].dt.day,
))

In [9]:
# Inner-join the IQ rows (left) with the synop rows (right) on the shared `day`
# key; the `date` column present in both frames comes out as `date_x` / `date_y`.
dfMerged = dfiq.merge(dfsynop, on="day", how="inner")
dfMerged.head()

Unnamed: 0,date_x,IQ,day,date_y,pressure,wind_direction,wind_force,humidity,temperature
0,2019-01-01 00:00:00+00:00,3,"(2019, 1, 1)",2019-01-01 00:00:00+00:00,102870,250,2.9,89,280.35
1,2019-01-01 00:00:00+00:00,3,"(2019, 1, 1)",2019-01-01 03:00:00+00:00,102800,270,2.9,85,280.35
2,2019-01-01 00:00:00+00:00,3,"(2019, 1, 1)",2019-01-01 06:00:00+00:00,102720,260,2.9,84,280.25
3,2019-01-01 00:00:00+00:00,3,"(2019, 1, 1)",2019-01-01 09:00:00+00:00,102730,260,3.6,84,280.55
4,2019-01-01 00:00:00+00:00,3,"(2019, 1, 1)",2019-01-01 12:00:00+00:00,102640,270,5.7,85,281.75


In [10]:
# Discard the IQ-side timestamp and the temporary join key, and keep the
# synop-side timestamp under the plain name `date`.
dfMerged = (
    dfMerged
    .drop(columns=["date_x", "day"])
    .rename(columns={"date_y": "date"})
)
dfMerged.head()

Unnamed: 0,IQ,date,pressure,wind_direction,wind_force,humidity,temperature
0,3,2019-01-01 00:00:00+00:00,102870,250,2.9,89,280.35
1,3,2019-01-01 03:00:00+00:00,102800,270,2.9,85,280.35
2,3,2019-01-01 06:00:00+00:00,102720,260,2.9,84,280.25
3,3,2019-01-01 09:00:00+00:00,102730,260,3.6,84,280.55
4,3,2019-01-01 12:00:00+00:00,102640,270,5.7,85,281.75


In [11]:
# Number of rows in the merged frame.
dfMerged.shape[0]

3396

### Save dataset

In [12]:
# Write the merged frame as a semicolon-separated CSV, without the index column.
dfMerged.to_csv(
    "HistoricDataset/completeMixDataset/completeDataset.csv",
    sep=';',
    index=False,
)