In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.preprocessing import LabelEncoder
from functions import equal, find_nearest, impute_immediate_mean, max_duplicated_indices

In [2]:
# Load the raw energy table
energy = pd.read_csv('data/energy_dataset.csv')

# Drop the last six characters of every timestamp string (presumably a
# "+HH:MM" UTC offset rather than nanoseconds -- TODO confirm against the raw
# file), parse the remainder as datetimes and use the result as the index
trimmed_time = energy['time'].map(lambda stamp: stamp[:-6])
energy['time'] = pd.to_datetime(trimmed_time)
energy = energy.set_index('time')

# Discard columns with at most one distinct non-null value: they carry no information
distinct_counts = energy.nunique()
uninformative = distinct_counts[distinct_counts <= 1].index
energy = energy.drop(columns=uninformative)

In [3]:
# First and last hour (inclusive) that the cleaned data must cover
start = dt.datetime(2015, 1, 1)
stop = dt.datetime(2018, 12, 31, 23)

# Empty frame whose index holds every hour from start to stop.
# The step is given as a timedelta rather than the 'H' alias, which pandas
# deprecated in 2.2; the resulting index is the same.
data = pd.DataFrame(index=pd.date_range(start, stop, freq=dt.timedelta(hours=1)))

# Right-join onto the empty hourly frame so that every hour of the range gets
# a row; hours absent from the raw data come through as all-NaN rows
energy = energy.join(data, how='right')

# Fill the gaps one column at a time
for column in energy.columns:

    # Labels of the rows where this column is missing, captured once before
    # anything in the column is filled
    missing_labels = energy.index[energy[column].isna().to_numpy()]

    # Fill in index order. The column is re-read on every call, so each
    # imputation sees the values written by the previous ones.
    # impute_immediate_mean is a project helper; per the original note it
    # returns the mean of the closest known values.
    for label in missing_labels:
        energy.loc[label, column] = impute_immediate_mean(energy[column], label)
        
# Timestamps that label more than one row
duplicated_labels = energy.index[energy.index.duplicated()].unique()

# Overwrite every row of a duplicated timestamp with the per-column mean of
# those rows (rounded to one decimal), so that the copies become identical
for col in energy.columns:
    for label in duplicated_labels:
        energy.loc[label, col] = round(energy.loc[label, col].mean(), 1)

# Keep exactly one row per timestamp.
# De-duplicate on the index, not on the row values: DataFrame.drop_duplicates()
# would also delete rows from *different* hours that happen to hold identical
# readings, leaving holes in the hourly index built above.
# .copy() detaches the result so later column assignments do not warn.
energy = energy.loc[~energy.index.duplicated(keep='first')].copy()

# Total generation: row-wise sum over every column from the first one up to
# and including 'generation wind onshore' (label slices are inclusive)
generation_block = energy.loc[:, :'generation wind onshore']
energy['generation total'] = generation_block.sum(axis=1)

# Total generation minus actual load; negative values mean generation fell short
energy['diff'] = energy['generation total'].sub(energy['total load actual'])

In [4]:
# Generation columns that are re-imputed on flagged rows
columns = ['generation biomass',
           'generation fossil brown coal/lignite',
           'generation fossil hard coal',
           'generation fossil oil',
           'generation hydro run-of-river and poundage',
           'generation hydro water reservoir',
           'generation nuclear',
           'generation other',
           'generation other renewable',
           'generation solar',
           'generation waste',
           'generation wind onshore',]

# A row is flagged when total generation is below actual load by more than
# this amount (i.e. 'diff' is smaller than the threshold)
DIFF_THRESHOLD = -15000

# 'diff' is not touched inside the loop, so the flagged rows are selected once
# instead of once per column
flag_indices = energy.loc[energy['diff'] < DIFF_THRESHOLD].index

# Replace each listed generation value on a flagged row with the value
# returned by the project helper impute_immediate_mean
for col in columns:
    for i in flag_indices:
        energy.loc[i, col] = impute_immediate_mean(energy[col], i)

# The generation columns changed, so refresh the columns derived from them;
# otherwise the exported total and diff would still describe the old values
energy['generation total'] = energy.loc[:, :'generation wind onshore'].sum(axis=1)
energy['diff'] = energy['generation total'] - energy['total load actual']

In [5]:
# Write the cleaned energy frame to disk, keeping the timestamp index as the
# first CSV column
energy.to_csv('data/energy_clean.csv', index=True)

## Weather Data
___

In [6]:
# Load the raw weather table
weather = pd.read_csv(filepath_or_buffer='data/weather_features.csv')

In [7]:
# Remove rows that are exact copies of an earlier row
weather = weather.drop_duplicates()

# Drop the last six characters of each dt_iso string (presumably a "+HH:MM"
# UTC offset rather than nanoseconds -- TODO confirm against the raw file) and
# parse the remainder as datetimes
trimmed_stamps = weather['dt_iso'].map(lambda stamp: stamp[:-6])
weather['dt_iso'] = pd.to_datetime(trimmed_stamps)

# Per the original note, weather_description is the most granular weather
# label, so the other label columns are removed; temp_min and temp_max are
# removed along with them
redundant_columns = ['weather_id', 'weather_main', 'weather_icon', 'temp_min', 'temp_max']
weather = weather.drop(columns=redundant_columns)

# Collect the per-city columns in a fresh frame on the hourly index rather
# than joining onto `data` itself: running this cell a second time would
# otherwise fail, because `data` would already hold the suffixed columns.
# .unique() restores one label per hour if `data` came out of an earlier run.
city_weather = pd.DataFrame(index=data.index.unique())

for city in weather.city_name.unique():

    # Rows of this city only; copied so the shared weather frame is untouched
    city_df = weather.loc[weather.city_name == city].copy()

    # Join all descriptions recorded for one timestamp into a single
    # '/'-separated label, repeated on each row of that timestamp
    city_df['description'] = (
        city_df.groupby('dt_iso')['weather_description']
        .transform(lambda labels: '/'.join(labels))
    )

    # Drop the raw description and the now-constant city name, then remove
    # rows that have become exact copies of one another
    city_df = city_df.drop(columns=['city_name', 'weather_description'])
    city_df = city_df.drop_duplicates()

    # Tag every column (dt_iso included) with the city name
    city_df = city_df.add_suffix(f'_{city}')

    # Attach this city's columns, matching on its timestamp column
    city_weather = city_weather.join(city_df.set_index(f'dt_iso_{city}'))

# Publish the result under the name the following steps read
data = city_weather

# Attach the per-city weather columns to the energy data on the timestamp index
df = energy.join(data)

# Project helper; per the original note it replaces the values on duplicated
# index labels with the per-column maximum, modifying df in place
max_duplicated_indices(df, inplace=True)

# Back-fill the remaining gaps: each missing value takes the NEXT valid value
# in its column (the original note says this only affects 4 rows).
# Done on the whole frame because the earlier per-column
# `df[col].fillna(method='bfill', inplace=True)` acts on a temporary selection:
# it is deprecated and leaves df unchanged once pandas Copy-on-Write is active.
df = df.bfill()
    
# Every non-float column is treated as categorical (presumably just the
# per-city description columns -- verify if new columns are added).
# The builtin `float` replaces `np.float`, an alias for it that NumPy 1.24
# removed; the selection is the same.
categorical = df.select_dtypes(exclude=[float]).columns

# Pool the distinct labels of all those columns so that one encoder maps the
# same description to the same integer in every column
descriptions = set()
for col in categorical:
    descriptions.update(df[col].unique())
descriptions = list(descriptions)

# Fit a single LabelEncoder on the pooled labels and encode each column with it
encoder = LabelEncoder()
encoder.fit(descriptions)
for col in categorical:
    df[col] = encoder.transform(df[col])

In [8]:
# Write the cleaned weather table and the combined, encoded frame to disk
exports = (
    (weather, 'data/weather_clean.csv'),
    (df, 'data/df_clean.csv'),
)
for frame, path in exports:
    frame.to_csv(path)