# Notebook visualising the preprocessing of the 'green_index' dataset.

In [17]:
import pandas as pd
import src.utils.preprocess_helper as pph

#### Reading the dataset.

In [18]:
green_index = pd.read_csv('../../../data/green_index_dataset.csv')

#### Converting the dataframe to a `pivot` like object by using the unique years as a column so that the missing values for each Neighbourhood can be filled row by row.

In [19]:
pivot = pph.create_pivot_from_df(green_index, 'Neighbourhood', 'year', 'green_score')
pivot

year,2009,2010,2014,2015,2016,2017,2018
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bavel,31.860021,25.604167,,,30.643434,33.5,30.423769
Belcrum,16.537037,,20.300813,20.139881,22.051724,23.031008,19.787879
Biesdonk,27.041199,,27.062602,23.245935,34.290476,34.081301,23.806109
Blauwe Kei,,,28.866242,37.387097,28.994574,25.720539,24.038073
Boeimeer,20.833333,,31.136166,33.102564,31.439535,32.571111,26.107912
Brabantpark,25.857143,,29.036937,24.29702,27.824074,27.403234,
Buitengebied Bavel,46.369963,41.21308,,,42.199234,40.727477,34.149123
Buitengebied Prinsenbeek,42.243768,49.720238,40.658422,,41.658333,38.776042,36.773638
Buitengebied Teteringen,40.096491,,,52.952381,37.984375,,27.266667
Buitengebied Ulvenhout,44.343468,42.418919,54.666667,46.357576,31.635417,44.025194,40.143541


#### Using the `fill_nan_values` created by me which calculates the missing values by taking the `closest` value on the `left` and `right` in order to calculate the most accurate result for our time data.

In [20]:
pivot = pph.fill_nan_values(pivot.reset_index())
pivot

year,Neighbourhood,2009,2010,2014,2015,2016,2017,2018
0,Bavel,31.860021,25.604167,28.123801,28.123801,30.643434,33.5,30.423769
1,Belcrum,16.537037,18.418925,20.300813,20.139881,22.051724,23.031008,19.787879
2,Biesdonk,27.041199,27.0519,27.062602,23.245935,34.290476,34.081301,23.806109
3,Blauwe Kei,14.433121,14.433121,28.866242,37.387097,28.994574,25.720539,24.038073
4,Boeimeer,20.833333,25.984749,31.136166,33.102564,31.439535,32.571111,26.107912
5,Brabantpark,25.857143,27.44704,29.036937,24.29702,27.824074,27.403234,13.701617
6,Buitengebied Bavel,46.369963,41.21308,41.706157,41.706157,42.199234,40.727477,34.149123
7,Buitengebied Prinsenbeek,42.243768,49.720238,40.658422,41.158378,41.658333,38.776042,36.773638
8,Buitengebied Teteringen,40.096491,46.524436,46.524436,52.952381,37.984375,32.625521,27.266667
9,Buitengebied Ulvenhout,44.343468,42.418919,54.666667,46.357576,31.635417,44.025194,40.143541


#### Predicting the future `green_score` values using the ARIMA Time Series model.

In [21]:
future_years = [2019, 2020]
future_data = pph.predict_future_yearly_data(pivot.melt(
    id_vars=['Neighbourhood'],
    var_name='year',
    value_vars=pivot.columns,
    value_name='green_score'), future_years, 'Neighbourhood', 'year', 'green_score')

processed_df = pd.concat([pivot, future_data])
processed_df.groupby(['year', 'Neighbourhood'])['green_score'].mean().reset_index().pivot(index="Neighbourhood",
                                                                                          columns="year",
                                                                                          values="green_score")

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregres

year,2019.0,2020.0
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Bavel,30.007825,29.760575
Belcrum,20.056664,20.147086
Biesdonk,27.579967,27.746002
Blauwe Kei,23.734733,23.653759
Boeimeer,25.878458,26.095412
Brabantpark,10.584251,12.013891
Buitengebied Bavel,27.625209,24.201277
Buitengebied Prinsenbeek,36.815262,36.796181
Buitengebied Teteringen,26.575382,26.226482
Buitengebied Ulvenhout,42.772472,43.84897


#### Saving the preprocessed dataframe to a .csv file.

In [12]:
processed_df.loc[processed_df['year'] >= 2014].to_csv('../../../data/preprocessed/green_index_preprocessed.csv',
                                                      index=False)