In [2]:
import pandas as pd
import numpy as np
import pickle
import os

In [None]:
# Common root of every data path used in this notebook.
_DATA_ROOT = '/home/chrisams/Documents/datasets/data_TAIM'

# Folder walked for the raw CSV files.
RAW_DATA_PATH = _DATA_ROOT + '/raw'
# Folder that receives one normalized `plant_<i>.csv` per raw file.
SAVE_DATA_PATH = _DATA_ROOT + '/processed/'
# Pickle file that receives the per-plant mean/std used for normalization.
STATS_PATH = _DATA_ROOT + '/stats.pickle'

# Names of the mean wind-speed columns measured at 20 m and 10 m.
V20 = 'Velocidad de viento en 20.0 metros [mean,m/s]'
V10 = 'Velocidad de viento en 10.0 metros [mean,m/s]'

# Columns kept from each raw file; the timestamp column must stay first.
COLUMNS = ['Fecha Hora (YYYY-MM-DD HH:MM)', V20, V10]

In [None]:
# Read all the data files.
# The list is created once, outside the walk, so files found in different
# directories accumulate instead of the last directory overwriting the rest.
eolic_plants_df = []
for path, dir_names, file_names in os.walk(RAW_DATA_PATH):
    # os.walk yields names in arbitrary order; sorting (in place for the
    # directories, so the traversal order is fixed too) makes the
    # file -> plant index mapping reproducible between runs.
    dir_names.sort()
    for file_name in sorted(file_names):
        print('{} have index: {}'.format(file_name, len(eolic_plants_df)))
        full_path = os.path.join(path, file_name)
        eolic_plants_df.append(pd.read_csv(full_path))

# Fail here, with a clear message, rather than later with a confusing error.
if not eolic_plants_df:
    raise FileNotFoundError(
        'No data files found under {}'.format(RAW_DATA_PATH))

In [None]:
# Clean data and keep only useful columns and rows.
#
# NOTE: this cell replaces the entries of `eolic_plants_df` (raw DataFrames)
# with normalized Series, so the loading cell must be re-run before running
# this cell a second time.

# Heights (m) of the two measured wind-speed columns (V10, V20) and the height
# the speed is extrapolated to.
LOW_HEIGHT_M = 10.0
HIGH_HEIGHT_M = 20.0
TARGET_HEIGHT_M = 85.0

# Time window shared by every plant: `min_date` ends up as the latest first
# timestamp and `max_date` as the earliest last timestamp.
min_date = None
max_date = None

for i in range(len(eolic_plants_df)):
    raw_plant = eolic_plants_df[i]
    # Selecting COLUMNS fails fast if an expected column is missing; the
    # timestamp column becomes the index and is then dropped from the data.
    plant = raw_plant[COLUMNS].set_index(
        pd.DatetimeIndex(raw_plant['Fecha Hora (YYYY-MM-DD HH:MM)'])
    ).iloc[:, 1:]

    # Power-law exponent from the two measured heights:
    #   v10 / v20 = (10 / 20) ** alpha
    v10_mean = plant[V10].mean()
    v20_mean = plant[V20].mean()
    alpha = np.log(v10_mean / v20_mean) / np.log(LOW_HEIGHT_M / HIGH_HEIGHT_M)

    # Extrapolate the 20 m speed to the target height. The Series is built
    # directly instead of being assigned as a new column on the sliced frame,
    # which can trigger pandas' SettingWithCopyWarning.
    speed_85m = (
        plant[V20] * np.power(TARGET_HEIGHT_M / HIGH_HEIGHT_M, alpha)
    ).rename('85m_speed')

    current_min_date = speed_85m.index.min()
    current_max_date = speed_85m.index.max()

    # Explicit None checks: the bounds are timestamps, not flags.
    if min_date is None or current_min_date > min_date:
        min_date = current_min_date
    if max_date is None or current_max_date < max_date:
        max_date = current_max_date

    eolic_plants_df[i] = speed_85m

# Without a common window every slice below would be empty and the saved
# series would be all-NaN, so stop with an explicit error instead.
if min_date is not None and max_date is not None and min_date > max_date:
    raise ValueError(
        'Plants share no common time window: {} > {}'.format(min_date, max_date))

# Normalize data and keep statistics.
stats_dict = {}
for i in range(len(eolic_plants_df)):
    # Restrict each plant to the common window before computing statistics.
    plant = eolic_plants_df[i][min_date:max_date]
    mean = plant.mean()
    std = plant.std()
    eolic_plants_df[i] = (plant - mean) / std
    # Stored so the z-score normalization can be inverted later.
    stats_dict['plant_{}'.format(i)] = {'mean': mean, 'std': std}

In [None]:
# Save data and statistics.
# Create the output folder if needed; to_csv does not create it.
os.makedirs(SAVE_DATA_PATH, exist_ok=True)

for i, plant in enumerate(eolic_plants_df):
    # header=False is explicit because the default for Series.to_csv differs
    # between pandas versions, and load_data reads these files as headerless
    # (it supplies its own column names).
    plant.to_csv(os.path.join(SAVE_DATA_PATH, 'plant_{}.csv'.format(i)),
                 header=False)

with open(STATS_PATH, 'wb') as fp:
    pickle.dump(stats_dict, fp)

In [3]:
def load_data(data_path, n_plants, p, resample_rule='10T', n_rows=None):
    """
    Load the processed plant series and build the lagged regressor matrix.

    Reads `plant_0.csv` ... `plant_<n_plants - 1>.csv` from `data_path`
    (headerless files: timestamp, speed), resamples each series to a regular
    grid and stacks them row-wise.

    data_path: directory where the data is saved.
    n_plants: number of plants to load (K).
    p: number of lags. With p > 0, row block t (rows t*K .. (t+1)*K - 1) of X
        holds the series delayed by t + 1 steps; otherwise X is the stacked
        data itself.
    resample_rule: pandas resample rule for data aggregation (mean per bin).
        NOTE: recent pandas versions deprecate the 'T' alias in favour of
        'min'; pass e.g. '10min' there.
    n_rows: if truthy, keep only the first n_rows resampled samples per plant.

    Returns (data, X): `data` is the K x (T - p) array of targets (the first
    p columns are dropped) and X is the (K * p) x (T - p) lag matrix.

    Raises FileNotFoundError if a plant file is missing and ValueError if the
    series are shorter than p.
    """
    series = []
    # Read exactly the files that are needed instead of walking the directory
    # tree: a missing plant now fails with a clear FileNotFoundError rather
    # than leaving a None entry for np.stack to choke on.
    for i in range(n_plants):
        frame = pd.read_csv(os.path.join(data_path, 'plant_{}.csv'.format(i)),
                            header=None, names=['timestamp', '85m_speed'],
                            index_col=0, parse_dates=True)

        # Regular time grid; gaps are filled by time-weighted interpolation.
        frame = frame.resample(resample_rule).mean().interpolate(method='time')

        values = frame['85m_speed'].values
        if n_rows:
            values = values[:n_rows]
        series.append(values)

    data = np.stack(series, axis=0)

    if p > 0:
        n_cols = data.shape[1] - p
        if n_cols < 0:
            raise ValueError(
                'Need at least p={} samples per plant, got {}'.format(
                    p, data.shape[1]))
        X = np.zeros((n_plants * p, n_cols))
        for t in range(p):
            # Column j of this block is data[:, p + j - 1 - t], i.e. the value
            # t + 1 steps before target column j.
            start = p - 1 - t
            X[t * n_plants:(t + 1) * n_plants, :] = data[:, start:start + n_cols]
    else:
        X = data

    data = data[:, p:]

    return data, X

# Tests

In [5]:
DATA_PATH = '/home/chrisams/Documents/datasets/data_TAIM/processed/'
K = 2
p = 3
n_rows = 1000
# load_data returns exactly two values (targets, lag matrix); unpacking three
# raised ValueError on a fresh kernel.
Y0, X = load_data(DATA_PATH, K, p, resample_rule='10T', n_rows=n_rows)
# Reference series for the checks below: with p=0 no lagging or trimming is
# applied, so the first returned array is the full stacked data.
test, _ = load_data(DATA_PATH, K, 0, resample_rule='10T', n_rows=n_rows)

In [18]:
print('YO shape: {} and should be: {} x {}'.format(Y0.shape, K, Y0.shape[1]))
print('X shape: {} and should be: {} x {}'.format(X.shape, K * p, Y0.shape[1]))

# Show the first `c` target columns next to the reference data they should
# equal, for a quick visual check.
c = 5
c_ = 0
for i in range(p, test.shape[1]):
    print('YO: {} = DATA: {}'.format(Y0[:, i - p], test[:, i]))
    for t in range(p):
        print('X: {} = DATA: {}'.format(X[t * K: (t + 1) * K, i - p], test[:, i - (t + 1)]))
    c_ += 1
    if c_ >= c:
        break

# Automated version of the same check over every column, so a regression in
# load_data fails the cell instead of relying on reading the printout.
n_cols = test.shape[1] - p
assert Y0.shape == (K, n_cols), Y0.shape
assert X.shape == (K * p, n_cols), X.shape
np.testing.assert_array_equal(Y0, test[:, p:])
for t in range(p):
    # Row block t of X must be the reference data delayed by t + 1 steps.
    np.testing.assert_array_equal(
        X[t * K:(t + 1) * K],
        test[:, p - 1 - t:test.shape[1] - 1 - t])

YO shape: (2, 997) and should be: 2 x 997
X shape: (6, 997) and should be: 6 x 997
YO: [1.10469029 1.64299099] = DATA: [1.10469029 1.64299099]
X: [0.94830146 1.61738329] = DATA: [0.94830146 1.61738329]
X: [0.89617185 1.54056019] = DATA: [0.89617185 1.54056019]
X: [0.87010704 1.386914  ] = DATA: [0.87010704 1.386914  ]
YO: [1.1307551  1.66859869] = DATA: [1.1307551  1.66859869]
X: [1.10469029 1.64299099] = DATA: [1.10469029 1.64299099]
X: [0.94830146 1.61738329] = DATA: [0.94830146 1.61738329]
X: [0.89617185 1.54056019] = DATA: [0.89617185 1.54056019]
YO: [1.18288471 1.66859869] = DATA: [1.18288471 1.66859869]
X: [1.1307551  1.66859869] = DATA: [1.1307551  1.66859869]
X: [1.10469029 1.64299099] = DATA: [1.10469029 1.64299099]
X: [0.94830146 1.61738329] = DATA: [0.94830146 1.61738329]
YO: [1.20894951 1.77102948] = DATA: [1.20894951 1.77102948]
X: [1.18288471 1.66859869] = DATA: [1.18288471 1.66859869]
X: [1.1307551  1.66859869] = DATA: [1.1307551  1.66859869]
X: [1.10469029 1.64299099] =

In [13]:
# Scratch check: range(1, 1) is empty, so this evaluates to [].
list(range(1,1))

[]