UA is not included in Table 2 or Table 4 in Kozarcanin et al. Apply an adjustment based on the neighbouring country(ies).

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Resolve the input locations relative to the current working directory.
cwd = os.getcwd()
path_raw_ninja_data = os.path.join(cwd, "../../raw_data/ninja_weather/")
path_adjustment_model = os.path.join(cwd, 'ninja_temp_adjustment.csv')

# Read the adjustment table (first CSV column becomes the index) and keep only
# the slope/intercept parameters of the neighbouring countries PL and RO.
df_adjustment = pd.read_csv(path_adjustment_model, index_col=0).loc[
    ['PL', 'RO'], ['slope', 'intercept']
]


In [3]:
# Column-wise mean of the slope and intercept over the selected countries.
df_adjustment.mean(axis=0)

slope        0.916767
intercept    1.284968
dtype: float64

In [4]:
# Build one dataframe holding the raw temperature of the 3 countries of interest
# (PL, RO and UA), one column per country, indexed by timestamp.
_country_frames = []
for country in ['PL', 'RO', 'UA']:
    # Skip the two leading header rows of the file and read only columns 0 and 2;
    # the code below relies on these being named 'time' and 'temperature'.
    _df_raw = pd.read_csv(
        os.path.join(path_raw_ninja_data,
                     'ninja_weather_country_%s_merra-2_population_weighted.csv' % country),
        skiprows=2, usecols=[0, 2])
    _df_raw['time'] = pd.to_datetime(_df_raw['time'])
    _df_raw = _df_raw.set_index('time')

    # Keep the per-country frame available as df_raw_<country> (e.g. df_raw_UA),
    # as before, but without executing generated source code through exec().
    globals()['df_raw_%s' % country] = _df_raw

    _country_frames.append(_df_raw.rename({'temperature': country}, axis=1))

# Align all countries on the time index in a single concat instead of growing
# the frame pairwise from an empty DataFrame.
df_raw = pd.concat(_country_frames, axis=1)

df_raw.describe()

Unnamed: 0,PL,RO,UA
count,350640.0,350640.0,350640.0
mean,8.056733,10.128155,8.509547
std,9.37958,10.312758,10.775677
min,-28.29,-21.135,-28.655
25%,0.736,1.63,-0.071
50%,7.996,10.207,8.495
75%,15.359,18.137,17.281
max,35.787,38.875,35.837


In [5]:
# Compare each neighbour's raw temperature series against UA:
# root-mean-square error and the Pearson correlation matrix.
from sklearn.metrics import mean_squared_error

_ua_temperature = df_raw['UA']
for country in ('PL', 'RO'):
    _neighbour_temperature = df_raw[country]
    _rmse = np.sqrt(mean_squared_error(_neighbour_temperature, _ua_temperature))
    _correlation = np.corrcoef((_neighbour_temperature, _ua_temperature))

    print(country)
    print('rmse= ', _rmse)
    print('correlation= ', _correlation)


PL
rmse=  3.7546145806819555
correlation=  [[1.         0.94091776]
 [0.94091776 1.        ]]
RO
rmse=  3.060301908558894
correlation=  [[1.         0.97061343]
 [0.97061343 1.        ]]


In [6]:
# Take the average of the PL and RO parameters as the adjustment for UA.
alpha=df_adjustment.loc[['PL', 'RO'], 'slope'].mean()
beta=df_adjustment.loc[['PL', 'RO'], 'intercept'].mean()
print(alpha, beta)

# Linear adjustment of the raw UA temperature: adjusted = alpha * raw + beta.
df_UA_adjusted=df_raw['UA']*alpha + beta

# rename the series temperature to match with others
df_UA_adjusted=df_UA_adjusted.rename('temperature')

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
_output_dir='adjusted_ninja_temperature'
os.makedirs(_output_dir, exist_ok=True)
df_UA_adjusted.to_csv(os.path.join(_output_dir, 'adjusted_ninja_temp_UA.csv'))

df_UA_adjusted.describe()

0.9167673442645028 1.284968028908732


count    350640.000000
mean          9.086242
std           9.878789
min         -24.985000
25%           1.219878
50%           9.072907
75%          17.127625
max          34.139159
Name: temperature, dtype: float64

In [7]:
# find average HDD per month from 2008 to 2017 (as in table 4)
#
# Produces two year x month tables (index = year, columns = 1..12):
#   _df_HDD             - heating degree days per month
#   _df_noheating_hours - number of hours in the month with temperature above
#                         the threshold (i.e. no heating demand)

# use eurostat threshold
threshold=15.

# Hourly heating degree-hours; negative values mean the adjusted temperature
# is above the threshold.
_s_HDH=threshold-df_UA_adjusted

# Group keys taken from the datetime index, named so the month level can be
# unstacked unambiguously.
_by_year=df_UA_adjusted.index.year.rename('year')
_by_month=df_UA_adjusted.index.month.rename('month')
_all_months=np.arange(1,13,1)

# Sum the positive degree-hours per (year, month) and divide by 24 to convert
# heating degree hours to heating degree days for the month.
_df_HDD=(
    _s_HDH.clip(lower=0)
    .groupby([_by_year, _by_month]).sum()
    .div(24)
    .unstack('month')
    .reindex(columns=_all_months)
    .rename_axis(index=None, columns=None)
)

# Count the hours per (year, month) with negative degree-hours. Stored as
# float, like the HDD table; a month missing from the data shows up as NaN
# instead of breaking the whole computation.
_df_noheating_hours=(
    (_s_HDH<0)
    .groupby([_by_year, _by_month]).sum()
    .unstack('month')
    .reindex(columns=_all_months)
    .astype(float)
    .rename_axis(index=None, columns=None)
)


In [8]:
# Mean HDD per calendar month over the years 2008-2017 (inclusive).
_years_of_interest = np.arange(2008, 2018)
_df_HDD.loc[_years_of_interest].mean(axis=0)

1     586.148366
2     472.350134
3     360.132414
4     166.542604
5      43.199541
6       6.001484
7       0.691179
8       2.712574
9      41.676921
10    196.000053
11    319.085959
12    481.696585
dtype: float64

In [9]:
# Mean number of no-heating hours per calendar month over 2008-2017 (inclusive).
_years_of_interest = np.arange(2008, 2018)
_df_noheating_hours.loc[_years_of_interest].mean(axis=0)

1       0.0
2       0.0
3       3.3
4     111.2
5     413.9
6     626.2
7     721.5
8     692.5
9     396.0
10     73.4
11      3.7
12      0.0
dtype: float64

In [10]:
# take summer months to be May-September