In [2]:
import pandas as pd 

In [3]:
# Load the long-format NO2 measurements and keep only the four columns used below.
# NOTE(review): without index_col, parse_dates=True only targets the index, so
# "date.utc" presumably stays plain text here - confirm with .dtypes.
air_quality_no2 = pd.read_csv(
    r"C:\Users\Casper\Desktop\pandada\air_quality_no2_long.csv",
    parse_dates=True,
).loc[:, ["date.utc", "location", "parameter", "value"]]

air_quality_no2.head()

Unnamed: 0,date.utc,location,parameter,value
0,2019-06-21 00:00:00+00:00,FR04014,no2,20.0
1,2019-06-20 23:00:00+00:00,FR04014,no2,21.8
2,2019-06-20 22:00:00+00:00,FR04014,no2,26.5
3,2019-06-20 21:00:00+00:00,FR04014,no2,24.9
4,2019-06-20 20:00:00+00:00,FR04014,no2,21.4


In [4]:
# Load the long-format PM25 measurements and keep only the four columns used below.
# NOTE(review): without index_col, parse_dates=True only targets the index, so
# "date.utc" presumably stays plain text here - confirm with .dtypes.
air_quality_pm25 = pd.read_csv(
    r"C:\Users\Casper\Desktop\pandada\air_quality_pm25_long.csv",
    parse_dates=True,
).loc[:, ["date.utc", "location", "parameter", "value"]]

air_quality_pm25.head()

Unnamed: 0,date.utc,location,parameter,value
0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0
1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5
2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5
3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0
4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5


# How to combine data from multiple tables 

## Concatenating objects

In [7]:
# Goal: stack the PM25 and NO2 measurements - two tables with the same
# columns - row-wise into a single table.
air_quality = pd.concat(
    [air_quality_pm25, air_quality_no2],
    axis="index",
)

air_quality.head()

Unnamed: 0,date.utc,location,parameter,value
0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0
1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5
2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5
3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0
4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5


The concat() function performs concatenation operations of multiple tables along one of the axes. 

By default concatenation is along axis 0, so the resulting table combines the rows of the input tables. 
Let's check the shape of the original and the concatenated tables to verify the operation: 

In [8]:
# Report the (rows, columns) of the PM25 input table.
print("Shape of the air_quality_pm25 table: ", air_quality_pm25.shape)

Shape of the air_quality_pm25 table:  (1110, 4)


In [9]:
# Report the (rows, columns) of the NO2 input table.
print("Shape of the air_quality_no2 table: ", air_quality_no2.shape)

Shape of the air_quality_no2 table:  (2068, 4)


In [10]:
# Report the (rows, columns) of the concatenated table.
print("Shape of the resulting air_quality table: ", air_quality.shape)

Shape of the resulting air_quality table:  (3178, 4)


Sorting the table on the datetime information also illustrates the combination of both tables, with the parameter column defining the origin of each row.

In [11]:
# Order the combined rows by their "date.utc" value, earliest first, so rows
# from both source tables appear next to each other.
air_quality = air_quality.sort_values(by="date.utc", ascending=True)

air_quality.head()

Unnamed: 0,date.utc,location,parameter,value
2067,2019-05-07 01:00:00+00:00,London Westminster,no2,23.0
1003,2019-05-07 01:00:00+00:00,FR04014,no2,25.0
100,2019-05-07 01:00:00+00:00,BETR801,pm25,12.5
1098,2019-05-07 01:00:00+00:00,BETR801,no2,50.5
1109,2019-05-07 01:00:00+00:00,London Westminster,pm25,8.0


In this specific example, the parameter column provided by the data ensures that each of the original tables can be identified. This is not always the case. 
The concat function provides a convenient solution with the keys argument, adding an additional (hierarchical) row index. For example:

In [12]:
# Rebuild the combined table, labelling each source table with an outer
# index level ("PM25" / "NO2") so its rows stay identifiable.
air_quality = pd.concat(
    [air_quality_pm25, air_quality_no2],
    keys=["PM25", "NO2"],
)

air_quality.head()

Unnamed: 0,Unnamed: 1,date.utc,location,parameter,value
PM25,0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0
PM25,1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5
PM25,2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5
PM25,3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0
PM25,4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5


## Join tables using a common identifier

In [13]:
# Station lookup table: a location code plus its latitude/longitude columns.
stations_coord = pd.read_csv(
    r"C:\Users\Casper\Desktop\pandada\air_quality_stations.csv"
)

stations_coord.head()

Unnamed: 0,location,coordinates.latitude,coordinates.longitude
0,BELAL01,51.23619,4.38522
1,BELHB23,51.1703,4.341
2,BELLD01,51.10998,5.00486
3,BELLD02,51.12038,5.02155
4,BELR833,51.32766,4.36226


In [14]:
# Show the measurements table again as the left-hand side of the upcoming join.
air_quality.head(n=5)

Unnamed: 0,Unnamed: 1,date.utc,location,parameter,value
PM25,0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0
PM25,1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5
PM25,2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5
PM25,3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0
PM25,4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5


In [15]:
# Left join: keep every measurement row and attach the station coordinates
# that share its "location" value.
# NOTE: this overwrites air_quality, so re-running the cell merges the
# coordinate columns a second time.
air_quality = air_quality.merge(stations_coord, how="left", on="location")

air_quality.head()

Unnamed: 0,date.utc,location,parameter,value,coordinates.latitude,coordinates.longitude
0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0,51.20966,4.43182
1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5,51.20966,4.43182
2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5,51.20966,4.43182
3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0,51.20966,4.43182
4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5,51.20966,4.43182


Using the merge() function, for each of the rows in the air_quality table, the corresponding coordinates are added from the stations_coord table. Both tables have the column location in common, which is used as a key to combine the information. By choosing the left join, only the locations available in the air_quality (left) table end up in the resulting table.
The merge function supports multiple join options similar to database-style operations.  

In [16]:
# Parameter lookup table: id, full description and short name per parameter.
air_quality_parameters = pd.read_csv(
    r"C:\Users\Casper\Desktop\pandada\air_quality_parameters.csv"
)

air_quality_parameters.head()

Unnamed: 0,id,description,name
0,bc,Black Carbon,BC
1,co,Carbon Monoxide,CO
2,no2,Nitrogen Dioxide,NO2
3,o3,Ozone,O3
4,pm10,Particulate matter less than 10 micrometers in...,PM10


In [17]:
# The join key has a different name on each side: "parameter" in the
# measurements and "id" in the parameter lookup table.
air_quality = air_quality.merge(
    air_quality_parameters,
    how="left",
    left_on="parameter",
    right_on="id",
)

air_quality.head()

KeyError: 'paramter'