In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Data integration

### URLs
- https://github.com/jorisvandenbossche/2015-PyDataParis

### Working with a Pandas dataframe

<img src="https://github.com/FIIT-IAU/2015-PyDataParis/raw/b900fdb9f3c12e9206bb417022dd004abf023c0f/img/dataframe.png" width="50%" height="50%" />


# Case study: Air quality in Europe
**[European air quality information reported by EEA member countries](https://www.eea.europa.eu/data-and-maps/data#c0=5&c11=&c5=all&b_start=0).**

AirBase (The European Air quality dataBase): hourly measurements of all air quality monitoring stations from Europe.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

In [None]:
# First, deliberately naive attempt: read the file with pandas' default
# settings (comma separator, first row taken as the header) and look at
# what comes out.
filename = "data/input/BETR8010000800100hour.1-1-1990.31-12-2012"
df = pd.read_csv(filename)
df.head()

We can see that there are several problems with loading. So let's try looking at the data in some editor before we load it:

In [None]:
%%bash
head data/input/BETR8010000800100hour.1-1-1990.31-12-2012

All we know so far is that the file is in a **CSV-like format, the value separator is \t (tab)**, that it contains only numerical data, and that the columns are not named.

In [None]:
%%bash
ls -lh data/input/BETR8010000800100hour.1-1-1990.31-12-2012

In [None]:
%%bash 
wc -l data/input/BETR8010000800100hour.1-1-1990.31-12-2012

So there is not much data, and I don't have to worry about loading it all into memory.

In [None]:
# Second attempt: the fields are tab-separated and there is no header row,
# so pandas is told not to look for one.
data = pd.read_csv(filename, sep="\t", header=None)
data.head()

We have 49 columns: the date and 48 other numeric attributes. Every other one of them seems to be binary - probably some kind of flag.

The data consist of measurements of some quantity for the individual hours of the day.

One day per row. Each hour has its own column, and next to it there is a column with a flag that we are not interested in for now.

There are some weird values that probably shouldn't be there: -999 and -9999.

The date will probably be an index

In [None]:
# Final load: tab-separated, no header row, the -999 / -9999 sentinels are
# read as missing values, and the first column (the date) becomes the index.
data = pd.read_csv(
    filename,
    sep="\t",
    header=None,
    index_col=0,
    na_values=[-999, -9999],
)
data.head()

In [None]:
# Preview the columns we are about to discard: the flags that do not
# interest us. With the date moved into the index they are every second
# column, starting from the second one.
data.columns[1::2]

In [None]:
# Throw away the flag columns (every second column); only the hourly
# measurements remain.
data = data.drop(columns=data.columns[1::2])
data.head()

In [None]:
# Preview zero-padded labels "00", "01", ... - one for each remaining column.
[format(i, "02d") for i in range(data.shape[1])]

In [None]:
# The column labels left over after dropping the flags are not consecutive,
# so replace them with the zero-padded labels.
data.columns = [format(i, "02d") for i in range(data.shape[1])]
data.head()

**Let's try to move each measurement to a separate line**

In [None]:
# stack() moves the column labels (the hours) into an inner index level,
# so every (date, hour) measurement ends up on its own row.
data = data.stack()
data.head()

In [None]:
# The result of the rearrangement is a Series with a two-level index
# (date, hour), not a DataFrame.
type(data)  

In [None]:
# Name the value column after the measuring station, whose code is the
# first seven characters of the input file's base name.
# os.path.basename is used instead of unpacking os.path.split into `_`:
# assigning to `_` in a notebook shadows IPython's automatic "last output"
# variable for the rest of the session.
fname = os.path.basename(filename)
station = fname[:7]
print(filename)
print(station)

In [None]:
# reset_index turns the Series back into a DataFrame: the index levels
# become ordinary columns and name=station labels the column that holds
# the measured values.
data = data.reset_index(name=station) 
# data = data.reset_index()

print(type(data))
data.head()

In [None]:
# Give the two former index levels readable names; after reset_index()
# they appear as the columns 0 and 'level_1'.
data = data.rename(columns = {0:'date', 'level_1':'hour'})
data.head()

In [None]:
# Now create a new index from the date and the hour: the two columns are
# joined into one "<date> <hour>" string and parsed with pd.to_datetime.
data.index = pd.to_datetime(data['date'] + ' ' + data['hour'])
data.head()

In [None]:
# The timestamp now lives in the index, so the two helper columns can go.
data = data.drop(columns=['date', 'hour'])
data.head()

# The code above (for one station) has been moved into the Python module `airbase.py`
**We are going to work with more stations.**

In [None]:
# airbase is the local module (airbase.py) that holds the loading code
# developed above.
# NOTE(review): load_data() presumably returns a DataFrame with a datetime
# index and one column per station - confirm against airbase.py.
import airbase
no2 = airbase.load_data()

In [None]:
# First three rows of the loaded data.
no2.head(3)

In [None]:
# Last rows of the loaded data.
no2.tail()

In [None]:
# Index type, column dtypes, non-null counts and memory usage.
no2.info()

In [None]:
# Summary statistics for every column.
no2.describe()

In [None]:
# Box plot of the value distribution in every column (pandas plotting).
no2.plot(kind='box')

In [None]:
# The box plot can also show outliers; draw them as small black dots.
# The marker style is passed through `flierprops` rather than the
# matplotlib shorthand `sym='k.'`: recent seaborn releases draw the boxes
# with Axes.bxp, which does not accept `sym`, while `flierprops` is
# understood by both the older and the newer implementation.
sns.boxplot(
    data=no2,
    flierprops={"marker": ".", "markerfacecolor": "k", "markeredgecolor": "k"},
)

In [None]:
# Histogram (50 bins) of the values in the BETN029 column.
no2['BETN029'].plot(kind='hist', bins=50)

In [None]:
# Violin plot: one violin per column, showing the shape of its distribution.
sns.violinplot(data=no2)

In [None]:
# First plot: every column drawn as a line over the whole index.
no2.plot(figsize=(12,6))

In [None]:
# Plot only a smaller part of the data: the last 500 rows, selected by position.
no2.iloc[-500:].plot(figsize=(12, 6))

**Or I will use more interesting operations with timeseries**

In [None]:
# Since the index holds timestamps, I can do interesting things with it.
no2.index 

In [None]:
# For example, a row range can be given as date strings; both end points
# are included in the selection.
no2.loc["2010-01-01 09:00":"2010-01-01 12:00"]

In [None]:
# or select all data from one specific year in this way
no2.loc['2012']
# (the shorthand without .loc, no2['2012'], is rejected by newer pandas
# versions, so always go through .loc for row selection)

# or just the data from March:
# no2.loc['2012-03']

In [None]:
# The date components are accessible directly from the index,
# e.g. no2.index.hour or, as here, the year of every row.
no2.index.year

In [None]:
# And what is more interesting, I can change the sampling frequency:
# 'D' groups the rows into calendar days, mean() averages each day.
no2.resample('D').mean().head()

In [None]:
# Is there any seasonality? Plot the monthly means.
# NOTE(review): pandas 2.2 deprecates the 'M' alias in favour of 'ME'
# (month end); switch once older pandas no longer has to be supported.
no2.resample('M').mean().plot()

In [None]:
# Long-term trend? Plot the yearly means.
# NOTE(review): pandas 2.2 deprecates the 'A' alias in favour of 'YE'
# (year end); switch once older pandas no longer has to be supported.
no2.resample('A').mean().plot()

In [None]:
# Weekly seasonality? Take March and April 2012 only and plot the daily means.
no2.loc['2012-3':'2012-4'].resample('D').mean().plot()

In [None]:
# Several aggregation functions can be applied at once and compared:
# monthly mean and median of the FR04037 column from 2009 onwards.
# NOTE(review): pandas 2.2 deprecates the 'M' alias in favour of 'ME'.
no2.loc['2009':, 'FR04037'].resample('M').agg(['mean', 'median']).plot()
# Variant comparing mean and standard deviation instead:
# no2.loc['2009':, 'FR04037'].resample('M').agg(['mean', 'std']).plot()

## Attention: resample != groupby

In [None]:
# This is a time course with monthly granularity: the values are averaged
# within each individual month of each year.
# NOTE(review): pandas 2.2 deprecates the 'M' alias in favour of 'ME'.
no2.resample('M').mean().plot()

In [None]:
# This is the average of all values for the month with the given number,
# pooled across all years. So the result is the average course of the value
# over a year, with monthly granularity.
no2.groupby(no2.index.month).mean().plot()

# Summary, what to take away from this EDA

* Make sure that the data is coded correctly (most often you have to look into the data manually)
* Make sure the data falls within the expected range and all have the expected shape (for example time format)
* Never change the data manually. Always use code that you save and rerun every time you repeat the experiment. We want the analysis to be reproducible
* Graph everything you can to visually confirm that something is as it should be