In [18]:
import os
from abc import ABC

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class StandardScaler():
    """Standardise data with statistics taken along axis 0.

    ``fit`` stores ``data.mean(0)`` and ``data.std(0)``; ``transform`` applies
    ``(data - mean) / std`` and ``inverse_transform`` applies
    ``data * std + mean``.  Both accept either the array type used in ``fit``
    or a ``torch.Tensor``; for tensors the stored statistics are converted to
    the tensor's dtype and device first.

    Until ``fit`` is called the scaler is the identity (mean 0, std 1).
    """

    def __init__(self):
        self.mean = 0.
        self.std = 1.

    def fit(self, data):
        """Compute and store the axis-0 mean and standard deviation of ``data``.

        A standard deviation of exactly zero (a constant column) is stored as
        1 so that ``transform`` maps that column to 0 instead of dividing by
        zero and producing NaN/inf.
        """
        self.mean = data.mean(0)
        std = data.std(0)
        if torch.is_tensor(std):
            std = torch.where(std == 0, torch.ones_like(std), std)
        elif isinstance(std, (np.ndarray, np.generic)):
            std = np.where(std == 0, np.ones_like(std), std)
        # Any other container type is stored untouched.
        self.std = std

    def _stats_for(self, data):
        """Return ``(mean, std)`` in a form that can be combined with ``data``."""
        if torch.is_tensor(data):
            # as_tensor accepts Python floats, NumPy arrays/scalars and tensors,
            # so this also works on a scaler that has not been fitted yet.
            mean = torch.as_tensor(self.mean, dtype=data.dtype, device=data.device)
            std = torch.as_tensor(self.std, dtype=data.dtype, device=data.device)
            return mean, std
        return self.mean, self.std

    def transform(self, data):
        """Return ``(data - mean) / std`` using the stored statistics."""
        mean, std = self._stats_for(data)
        return (data - mean) / std

    def inverse_transform(self, data):
        """Undo ``transform``: return ``data * std + mean``."""
        mean, std = self._stats_for(data)
        return (data * std) + mean

def dateToStr(date: int):
    """Render ``date`` as text, prefixing a single '0' when ``date`` is below 10."""
    prefix = '0' if date < 10 else ''
    return prefix + str(date)

def get_hour_data(start_year: int, start_month: int, end_year: int, end_month: int,
                  data_dir: str = r'F:\CXpython\金融组项目\GSA-forecastor\datasets'):
    """Load the monthly hour-data CSVs for an inclusive month range and stack them.

    One file is read per month from
    ``<data_dir>/hour_data_matrix_<YYYY>/hour_data_matrix<YYYY>-<MM>.csv``.
    The first four columns of every file are dropped (presumably timestamp
    fields - TODO confirm) and the remaining columns are concatenated row-wise
    in chronological order.

    Args:
        start_year, start_month: first month to load (inclusive).
        end_year, end_month: last month to load (inclusive).
        data_dir: directory holding the ``hour_data_matrix_<YYYY>`` folders.
            Defaults to the location the notebook was originally run from.

    Returns:
        A 2-D NumPy array with the rows of all months, or ``None`` when the
        start month lies after the end month.
    """
    monthly_blocks = []
    year, month = start_year, start_month
    # Tuple comparison is the lexicographic (year, month) ordering.
    while (year, month) <= (end_year, end_month):
        year_str = str(year)
        month_str = '{:02d}'.format(month)  # zero-padded, e.g. 1 -> '01'
        csv_path = os.path.join(
            data_dir,
            'hour_data_matrix_' + year_str,
            'hour_data_matrix' + year_str + '-' + month_str + '.csv')
        frame = pd.read_csv(csv_path, sep=',', encoding="utf-8")
        monthly_blocks.append(frame.values[:, 4:])
        if month >= 12:
            year, month = year + 1, 1  # December rolls over to next January
        else:
            month += 1
    if not monthly_blocks:
        return None
    # Concatenate once instead of re-copying the growing matrix every month.
    return np.concatenate(monthly_blocks, axis=0)


def get_weather_data(start_year: int, start_month: int, end_year: int, end_month: int,
                     data_dir: str = r'F:\CXpython\金融组项目\GSA-forecastor\datasets'):
    """Load weather rows whose (year, month) falls in an inclusive month range.

    Reads ``<data_dir>/all_padding_ready_weather.csv``, keeps the rows whose
    ``year``/``month`` columns lie between the start and end month (both
    inclusive), and drops the first four columns of the result (presumably
    timestamp fields - TODO confirm).

    Args:
        start_year, start_month: first month to keep (inclusive).
        end_year, end_month: last month to keep (inclusive).
        data_dir: directory containing ``all_padding_ready_weather.csv``.
            Defaults to the location the notebook was originally run from.

    Returns:
        A 2-D NumPy array of the selected rows (zero rows if nothing matches).
    """
    csv_path = os.path.join(data_dir, 'all_padding_ready_weather.csv')
    weather = pd.read_csv(csv_path, sep=',', encoding="utf-8")
    # Map (year, month) onto one increasing integer so the range test is a
    # simple pair of comparisons; computed once instead of twice.
    month_ordinal = weather['year'] * 12 + weather['month']
    first = start_year * 12 + start_month
    last = end_year * 12 + end_month
    selected = weather.loc[(month_ordinal >= first) & (month_ordinal <= last)]
    return selected.values[:, 4:]

In [22]:
# Two (start_year, start_month, end_year, end_month) ranges are loaded and
# stacked row-wise; hour and weather features are then joined column-wise.
# NOTE(review): 2014-01 .. 2014-06 is not loaded - presumably intentional; confirm.
LOAD_PERIODS = ((2011, 1, 2013, 12), (2014, 7, 2017, 6))
hour_data = np.vstack([get_hour_data(*period) for period in LOAD_PERIODS])
weather_data = np.vstack([get_weather_data(*period) for period in LOAD_PERIODS])
all_data = np.hstack([hour_data, weather_data])

(52608, 67)
(52608, 5)
(52608, 72)


In [29]:
# Wrap the combined matrix in a DataFrame and display its first 80 % of rows
# (the same fraction read_data uses for the training split).
df = pd.DataFrame(all_data)
df.iloc[:int(len(df) * 0.8)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0,79.0,0.0,61.0,48.0,67.0,30.0,166.0,26.0,357.0,172.0,...,205.0,344.0,41.0,157.0,318.0,46.0,50.0,30.12,9.0,8.0
1,86.0,1.0,54.0,84.0,102.0,48.0,142.0,42.0,311.0,185.0,...,191.0,251.0,53.0,183.0,427.0,46.0,47.0,30.12,9.0,7.0
2,89.0,3.0,33.0,60.0,85.0,50.0,71.0,32.0,329.0,179.0,...,171.0,239.0,50.0,152.0,405.0,45.0,52.0,30.13,9.0,7.0
3,83.0,2.0,30.0,53.0,66.0,43.0,60.0,33.0,405.0,154.0,...,129.0,205.0,44.0,99.0,306.0,44.0,53.0,30.14,10.0,7.0
4,93.0,1.0,15.0,30.0,47.0,38.0,31.0,27.0,303.0,108.0,...,80.0,186.0,24.0,40.0,217.0,44.0,53.0,30.15,9.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42081,63.0,4.0,234.0,70.0,38.0,18.0,306.0,35.0,563.0,224.0,...,255.0,319.0,81.0,303.0,343.0,53.0,48.0,30.25,10.0,3.0
42082,39.0,4.0,198.0,51.0,23.0,6.0,301.0,35.0,499.0,183.0,...,210.0,207.0,83.0,249.0,286.0,53.0,52.0,30.22,10.0,0.0
42083,30.0,13.0,183.0,39.0,23.0,12.0,281.0,42.0,404.0,179.0,...,210.0,172.0,87.0,255.0,240.0,50.0,63.0,30.21,10.0,5.0
42084,31.0,13.0,180.0,56.0,32.0,15.0,260.0,45.0,339.0,174.0,...,193.0,183.0,109.0,244.0,261.0,51.0,63.0,30.20,10.0,0.0


In [28]:
def read_data(all_data, seq_len, scale = True):
    """Split ``all_data`` chronologically into train and test slices.

    The first 80 % of rows form the training slice.  The test slice covers the
    last 20 % of rows plus the ``seq_len`` rows immediately before them
    (presumably so the first test sample has a full look-back window - confirm
    against the consumer of this split).

    Args:
        all_data: 2-D array-like, one row per time step.
        seq_len: number of extra rows prepended to the test slice.
        scale: when true, a ``StandardScaler`` is fitted on the training rows
            only and applied to the whole matrix before slicing.

    Returns:
        ``(train, test, scaler, [train_begin, test_begin])``; ``scaler`` is
        ``None`` when ``scale`` is false.  The last element holds the row
        offsets at which each slice starts, not lengths.

    Raises:
        ValueError: if ``seq_len`` is negative or larger than the number of
            rows available before the test region.  Without this check a
            negative ``test_begin`` would silently slice from the end of the
            array and return the wrong rows.
    """
    df = pd.DataFrame(all_data)
    n_rows = len(df)

    n_train = int(n_rows * 0.8)
    n_test = int(n_rows * 0.2)

    train_begin, train_end = 0, n_train
    test_begin = n_rows - n_test - seq_len
    test_end = n_rows

    if seq_len < 0 or test_begin < 0:
        raise ValueError(
            "seq_len={} does not fit: only {} rows precede the test region".format(
                seq_len, n_rows - n_test))

    scaler = None
    if scale:
        scaler = StandardScaler()
        # Fit on training rows only so test statistics do not leak into scaling.
        scaler.fit(df.values[:n_train])
        data = scaler.transform(df.values)
    else:
        data = df.values

    return data[train_begin:train_end], data[test_begin:test_end], scaler, [train_begin, test_begin]

In [30]:
# Split with 12 extra look-back rows in front of the test slice. Note that the
# fourth value, bound to `seq_lens`, is read_data's [train_begin, test_begin].
train_data, test_data, scaler, seq_lens = read_data(all_data, seq_len=12)