In [None]:
import pandas as pd
import numpy as np
import pickle
import os

In [None]:
# Root directory of the TAIM dataset. It can be overridden with the
# TAIM_DATA_ROOT environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
DATA_ROOT = os.environ.get('TAIM_DATA_ROOT',
                           '/home/chrisams/Documents/datasets/data_TAIM')
# Directory holding the raw per-plant CSV files.
RAW_DATA_PATH = os.path.join(DATA_ROOT, 'raw')
# Directory receiving the processed series (trailing separator kept on purpose).
SAVE_DATA_PATH = os.path.join(DATA_ROOT, 'processed', '')
# Pickle file holding the per-plant normalization statistics.
STATS_PATH = os.path.join(DATA_ROOT, 'stats.pickle')

# Column names of the mean wind speed measured at 20 m and at 10 m.
V20 = 'Velocidad de viento en 20.0 metros [mean,m/s]'
V10 = 'Velocidad de viento en 10.0 metros [mean,m/s]'
# Columns kept from every raw file: the timestamp first, then the two speeds.
COLUMNS = [
    'Fecha Hora (YYYY-MM-DD HH:MM)',
    V20,
    V10,
]

In [None]:
# Read all the data files.
# The result lists are created once, before walking: creating them inside the
# loop would keep only the files of the last visited directory and would leave
# both names undefined when the walk yields nothing.
eolic_plants_names = []
eolic_plants_df = []
for path, dir_names, file_names in os.walk(RAW_DATA_PATH):
    dir_names.sort()  # in-place sort so os.walk descends in a deterministic order
    for file_name in sorted(file_names):
        # The printed index is the position the plant gets in both lists.
        print('{} have index: {}'.format(file_name, len(eolic_plants_names)))
        full_path = os.path.join(path, file_name)
        eolic_plants_df.append(pd.read_csv(full_path))
        eolic_plants_names.append(file_name)

In [None]:
# Clean data and keep only useful columns and rows.
#
# For every plant: index the rows by timestamp, extrapolate the 20 m speed to
# 85 m with a power law, and track the time window shared by all plants.

min_date = None  # latest "first timestamp" seen so far (start of the common window)
max_date = None  # earliest "last timestamp" seen so far (end of the common window)

for i in range(len(eolic_plants_df)):
    raw_plant = eolic_plants_df[i]
    timestamps = pd.DatetimeIndex(raw_plant['Fecha Hora (YYYY-MM-DD HH:MM)'])
    # Keep the two speed columns only; set_index returns a new frame, so
    # nothing below writes into a slice of the raw data.
    plant = raw_plant[COLUMNS[1:]].set_index(timestamps)

    # Power-law exponent estimated from the mean speeds at 10 m and 20 m.
    v10_mean = plant[V10].mean()
    v20_mean = plant[V20].mean()
    alpha = np.log(v10_mean / v20_mean) / np.log(10 / 20)

    # Speed extrapolated from 20 m to 85 m with that exponent.
    speed_85m = (plant[V20] * np.power(85 / 20, alpha)).rename('85m_speed')

    current_min_date = plant.index.min()
    current_max_date = plant.index.max()

    # Explicit None checks: the bounds are timestamps, not flags.
    if min_date is None or current_min_date > min_date:
        min_date = current_min_date
    if max_date is None or current_max_date < max_date:
        max_date = current_max_date

    eolic_plants_df[i] = speed_85m

if min_date is not None and min_date > max_date:
    # Without this check every slice below would be empty and the statistics NaN.
    raise ValueError(
        'The plants share no common time window: {} > {}'.format(min_date, max_date))

# Normalize data and keep statistics.
stats_dict = {}
for i in range(len(eolic_plants_df)):
    # Restrict every plant to the common window before computing statistics.
    plant = eolic_plants_df[i].loc[min_date:max_date]
    mean = plant.mean()
    std = plant.std()
    eolic_plants_df[i] = (plant - mean) / std
    stats_dict[eolic_plants_names[i]] = {'mean': mean, 'std': std}

In [None]:
# Save data and statistics.
os.makedirs(SAVE_DATA_PATH, exist_ok=True)  # to_csv does not create missing directories
for plant_name, plant in zip(eolic_plants_names, eolic_plants_df):
    # header=False is explicit because the files are read back with explicit
    # column names and no header row; newer pandas versions would otherwise
    # write a header line that the reader would parse as data.
    plant.to_csv(os.path.join(SAVE_DATA_PATH, plant_name), header=False)

with open(STATS_PATH, 'wb') as fp:
    pickle.dump(stats_dict, fp)

In [None]:
def read_resample_data(data_path, plant_name, resample_rule, n_rows,
                       date_start=None, date_end=None):
    """
    Read one processed plant file and return it as a regularly sampled array.

    data_path: directory containing the file.
    plant_name: file name inside data_path. The file is read without a header
        row: first column is the timestamp, second the '85m_speed' value.
    resample_rule: pandas resample rule; each bin is averaged and empty bins
        are filled by time-weighted interpolation.
    n_rows: if truthy, keep only the first n_rows samples.
    date_start: optional first date of the window to keep.
    date_end: optional last date of the window to keep.

    Either bound may be given alone; the missing one leaves that side open.

    Returns a 1-D numpy array with the '85m_speed' values.
    """
    print('Reading {}...'.format(plant_name))
    data = pd.read_csv(os.path.join(data_path, plant_name),
                       index_col=0, names=['85m_speed'], parse_dates=True)
    data = data.resample(resample_rule).mean().interpolate(method='time')

    # Falsy bounds (None or '') mean "open on this side".
    start = date_start or None
    end = date_end or None
    if start is not None or end is not None:
        data = data.loc[start:end]

    values = data['85m_speed'].values
    if n_rows:
        values = values[:n_rows]
    return values


def load_data(data_path, n_plants, p, resample_rule='10T', n_rows=None,
              date_start=None, date_end=None, plant_names=None):
    """
    Load the series of several plants and build the lagged matrix of order p.

    data_path: directory where the data is saved.
    n_plants: number of plants to load (K). Only used to choose files when
        plant_names is None; otherwise K is len(plant_names).
    p: number of lags. With p == 0 no lagged matrix is built.
    resample_rule: resample rule for data aggregation.
        NOTE(review): '10T' is the legacy pandas spelling of 10 minutes;
        recent pandas prefers '10min' - confirm against the installed version.
    n_rows: if truthy, keep only the first n_rows samples of every plant.
    date_start: initial date of data (YYYY-MM).
    date_end: end date of data (YYYY-MM).
    plant_names: list with the eolic plants to load.

    Returns (data, X). With T the number of loaded samples per plant, data is
    the K x (T - p) array of samples from position p onwards, and X is the
    (K * p) x (T - p) array whose row block t holds the samples delayed by
    t + 1 positions. When p == 0, X is the full K x T array.

    Raises ValueError when plant_names is None and data_path holds fewer than
    n_plants files.
    """
    if plant_names is None:
        # Files are read from data_path itself, so only its direct entries are
        # candidates; sorting makes the selection reproducible.
        available = sorted(
            entry for entry in os.listdir(data_path)
            if os.path.isfile(os.path.join(data_path, entry))
        )
        if len(available) < n_plants:
            raise ValueError('Requested {} plants but only {} files found in {}'.format(
                n_plants, len(available), data_path))
        plant_names = available[:n_plants]

    data = np.stack(
        [read_resample_data(data_path, plant_name, resample_rule, n_rows,
                            date_start=date_start, date_end=date_end)
         for plant_name in plant_names],
        axis=0,
    )

    # Size everything from what was actually loaded, not from n_plants, so an
    # explicit plant_names list of any length works.
    k, n_steps = data.shape

    if p > 0:
        X = np.zeros((k * p, n_steps - p))
        for t in range(p):
            # Column j of this block is data[:, p + j - 1 - t].
            X[t * k:(t + 1) * k, :] = data[:, p - 1 - t:n_steps - 1 - t]
    else:
        X = data

    return data[:, p:], X

# Tests for data preprocessing

In [None]:
# Parameters of a small sanity check of load_data on two known plants.
DATA_PATH = SAVE_DATA_PATH  # the processed files are read from where they were saved
K = 2
p = 1
n_rows = None
date_start = '2011-05'
date_end = '2011-06'
plant_names = [
    'd05b_2010-06-19_2018-03-05.csv',
    'd01_2009-07-12_2018-01-17.csv',
]
Y0, X = load_data(DATA_PATH, K, p, resample_rule='10T', n_rows=n_rows, date_start=date_start,
                  date_end=date_end, plant_names=plant_names)
# `test` is the reference the following cells compare Y0 and X against.
# load_data no longer returns the untrimmed series, but calling it with p=0
# yields exactly that: the stacked data with no column removed.
test = load_data(DATA_PATH, K, 0, resample_rule='10T', n_rows=n_rows, date_start=date_start,
                 date_end=date_end, plant_names=plant_names)[0]

In [None]:
# Display the shape of Y0, the first array returned by load_data.
Y0.shape

In [None]:
# Print the actual shapes next to the expected ones, then, for the first few
# time steps, each column of Y0 and each lag block of X beside the column of
# `test` it is expected to equal.
n_targets = Y0.shape[1]
print('YO shape: {} and should be: {} x {}'.format(Y0.shape, K, n_targets))
print('X shape: {} and should be: {} x {}'.format(X.shape, K * p, n_targets))

c = 5   # number of time steps to print
c_ = 0  # number of time steps printed so far
for c_, i in enumerate(range(p, test.shape[1]), start=1):
    column = i - p
    print('YO: {} = DATA: {}'.format(Y0[:, column], test[:, i]))
    for t in range(p):
        lag_rows = slice(t * K, (t + 1) * K)
        print('X: {} = DATA: {}'.format(X[lag_rows, column], test[:, i - (t + 1)]))
    if c_ >= c:
        break

In [None]:
# Print the first six columns of the reference data, the targets and the
# lagged matrix, each under its own label.
head = (slice(None), slice(None, 6))
print('test')
print(test[head])
print('Y0')
print(Y0[head])
print('X')
print(X[head])

In [None]:
# Read back the statistics dictionary stored in the pickle at STATS_PATH.
with open(STATS_PATH, mode='rb') as stats_file:
    stats = pickle.load(stats_file)

In [None]:
# Display the object loaded from the statistics pickle.
stats

# Visualization of data per month

In [None]:
# Read all the data files, keeping for each plant only the 20 m wind speed
# indexed by timestamp.
# The result lists are created once, before walking: creating them inside the
# loop would keep only the files of the last visited directory and would leave
# both names undefined when the walk yields nothing.
eolic_plants_names = []
eolic_plants_df = []
for path, dir_names, file_names in os.walk(RAW_DATA_PATH):
    dir_names.sort()  # in-place sort so os.walk descends in a deterministic order
    for file_name in sorted(file_names):
        # The printed index is the position the plant gets in both lists.
        print('{} have index: {}'.format(file_name, len(eolic_plants_names)))
        full_path = os.path.join(path, file_name)
        raw_plant = pd.read_csv(full_path)
        timestamps = pd.DatetimeIndex(raw_plant['Fecha Hora (YYYY-MM-DD HH:MM)'])
        # Same selection as the cleaning step: useful columns, timestamp index.
        plant = raw_plant[COLUMNS].set_index(timestamps)
        eolic_plants_df.append(plant[V20])
        eolic_plants_names.append(file_name)

In [None]:
# Print the shape of the first loaded plant.
print(eolic_plants_df[0].shape)

In [None]:
# Display the rows of the first plant between 2011-05 and 2011-06 (label slice).
eolic_plants_df[0]['2011-05':'2011-06']

In [None]:
# For every plant, print how many samples, and how many samples equal to
# zero, each month of the chosen year contains.
year = '2012'
separator = '-' * 50
for i, df in enumerate(eolic_plants_df):
    print(eolic_plants_names[i])
    yearly = df[year]
    zeros = yearly[yearly == 0]
    print('Data count:')
    print(yearly.groupby(yearly.index.month).count())
    print('Zeros count:')
    print(zeros.groupby(zeros.index.month).count())
    print(separator)

Time-interpolated data is identical to the original data when no samples are missing.

In [None]:
# May 2011 of the last loaded plant. Each entry of eolic_plants_df is already
# a single Series, so no column is selected, and the plant is chosen
# explicitly instead of through a loop index left over from another cell.
df1_original = eolic_plants_df[-1].loc['2011-05']

In [None]:
# Time-weighted interpolation of the original series (the previous name `df1`
# was never defined anywhere).
df1_interpolated = df1_original.interpolate(method='time')

In [None]:
# True only if every original value equals its interpolated counterpart.
# Any NaN in the original makes the result False, since NaN never equals anything.
np.all(df1_original.values == df1_interpolated.values)