<a href="https://colab.research.google.com/github/4mami/ForKaggle/blob/main/BikeSharingDemand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Global configuration shared by the cells below.
SEED = 1234  # seed handed to the NumPy and PyTorch random generators below
BATCH_SIZE = 8  # mini-batch size used by the train / validation / test DataLoaders
SEQUENCE_LENGTH = 24  # default number of rows per input window in the sequence datasets
TARGET_COLUMN = "count"  # name of the column the model is trained to predict

# Seed NumPy and PyTorch (CPU and CUDA) to make runs more repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Both CSV files are read from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
# Prepare the training data.
# Drop the columns that are not needed ("casual" and "registered").
train_df = train_df.drop("casual", axis=1)
train_df = train_df.drop("registered", axis=1)

# Every remaining column except the target and "datetime" is a model input.
# This is computed before the lead column is added below, so the shifted
# target never ends up among the features.
features = list(train_df.columns.difference([TARGET_COLUMN, "datetime"]))
print(f"features: {features}")
print(f"features num: {len(features)}")

# The model predicts one step ahead (one hour, assuming consecutive rows are
# one hour apart -- TODO confirm), so shift "count" up by predict_lead rows:
# row i's target becomes the count of row i + predict_lead.
predict_lead = 1
target = f"{TARGET_COLUMN}_lead{predict_lead}"
train_df[target] = train_df[TARGET_COLUMN].shift(-predict_lead)
# The last predict_lead rows have no future value after the shift; drop them.
# NOTE(review): this slice relies on predict_lead >= 1; with 0 it would select no rows.
train_df = train_df.iloc[:-predict_lead]

In [None]:
# Use part of the training data as validation data: the first 80% of the rows
# (by position, no shuffling) stay in train_df, the rest become val_df.
train_len = round(len(train_df) * 0.8)
print(f"train_len: {train_len}")

# val_df must be sliced before train_df is overwritten with its own head.
val_df = train_df.iloc[train_len:].copy()
print(f"len(val_df): {len(val_df)}")
train_df = train_df.iloc[:train_len].copy()
print(f"len(train_df): {len(train_df)}")

# Standardize the temp, atemp, humidity and windspeed columns.
# The mean and standard deviation are taken from the training split only and
# reused for the validation and test frames.
# NOTE(review): the three frames are modified in place, so running this cell a
# second time would standardize already-standardized values.
standardize_target = ["temp","atemp","humidity","windspeed"]
for c in standardize_target:
    # Statistics are read before train_df[c] is overwritten below.
    mean = train_df[c].mean()
    stdev = train_df[c].std()

    train_df[c] = (train_df[c] - mean) / stdev
    test_df[c] = (test_df[c] - mean) / stdev
    val_df[c] = (val_df[c] - mean) / stdev

In [None]:
# Dataset class for the training / validation data
class SequenceDataset(Dataset):
    """Sliding-window dataset that yields ``(window, label)`` pairs.

    Item ``i`` is the block of ``sequence_length`` feature rows ending at row
    ``i`` together with ``y[i]``. When fewer than ``sequence_length`` rows
    exist up to row ``i``, the front of the window is filled with copies of
    the very first row.

    Args:
        dataframe: pandas DataFrame holding the feature and target columns.
        target: name of the label column.
        features: column names used as model inputs (assumed to be a list,
            so that ``X`` is two-dimensional).
        sequence_length: number of rows in every returned window.
    """

    def __init__(self, dataframe, target, features, sequence_length=SEQUENCE_LENGTH):
        self.target = target
        self.features = features
        self.sequence_length = sequence_length
        # Both tensors are materialised once, as float32.
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        """Return the number of rows, i.e. one sample per row."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the window ending at row ``i`` and the label stored at ``i``."""
        end = i + 1
        # Number of rows that would have to come from before row 0.
        missing = self.sequence_length - end

        if missing <= 0:
            # Enough history: a plain slice already has the full length.
            window = self.X[end - self.sequence_length:end, :]
        else:
            # Not enough history: left-pad with repeats of the first row.
            history = self.X[0:end, :]
            filler = self.X[0].repeat(missing, 1)
            window = torch.cat((filler, history), 0)

        # With the shifted target column used in this file, y[i] is the count
        # one row later.
        return window, self.y[i]

In [None]:
# Dataset class for the test data (features only, no labels)
class SequenceDatasetForTest(Dataset):
    """Sliding-window dataset that yields feature windows without labels.

    Item ``i`` is the block of ``sequence_length`` feature rows ending at row
    ``i``. Windows that would start before row 0 are left-padded with copies
    of the first row.

    Args:
        dataframe: pandas DataFrame holding the feature columns.
        features: column names used as model inputs (assumed to be a list,
            so that ``X`` is two-dimensional).
        sequence_length: number of rows in every returned window.
    """

    def __init__(self, dataframe, features, sequence_length=SEQUENCE_LENGTH):
        self.features = features
        self.sequence_length = sequence_length
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        """Return the number of rows, i.e. one sample per row."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the feature window ending at row ``i``."""
        stop = i + 1
        start = stop - self.sequence_length

        if start >= 0:
            # Enough history: the slice already has sequence_length rows.
            return self.X[start:stop, :]

        # Not enough history: prepend (-start) copies of the first row.
        lead_in = self.X[0].repeat(-start, 1)
        available = self.X[0:stop, :]
        return torch.cat((lead_in, available), 0)

In [None]:
# Wrap each frame in a windowed dataset and a DataLoader.
# Only the training loader shuffles; shuffling changes the order in which
# windows are drawn, not the rows inside a window.
train_dataset = SequenceDataset(train_df, target=target, features=features)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Test data has no labels; order is kept so predictions line up with test_df rows.
test_dataset = SequenceDatasetForTest(test_df, features=features)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# The first windows of the validation set are padded with val_df's own first
# row (see SequenceDataset.__getitem__), not with the tail of the training data.
val_dataset = SequenceDataset(val_df, target=target, features=features)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class RegressionLSTM(nn.Module):
    """Single-layer LSTM followed by a linear read-out.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of values produced per sample by the linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm1 = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch of sequences to a flat 1-D tensor of predictions."""
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (final_hidden, _) = self.lstm1(x)
        # Remove the leading layer axis (size 1 for this single-layer LSTM).
        last_state = final_hidden.squeeze(0)
        prediction = self.fc(last_state)
        return prediction.flatten()

In [None]:
# The following code is adapted from https://qiita.com/ku_a_i/items/ba33c9ce3449da23b503
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss and the model.
    Whenever the loss is at least as good as the best one seen so far, the
    model's ``state_dict`` is saved to ``path`` and the patience counter is
    reset. Otherwise the counter is incremented, and ``early_stop`` becomes
    ``True`` once it reaches ``patience``.

    A NaN loss is never treated as an improvement: it does not overwrite the
    checkpoint and it counts against the patience budget.
    """

    def __init__(self, patience=5, verbose=False, path='checkpoint_model.pth'):
        """Set up the stopper.

        Args:
            patience: number of consecutive non-improving epochs tolerated
                before ``early_stop`` is set.
            verbose: when true, print progress messages.
            path: file the best model's ``state_dict`` is saved to.
        """
        self.patience = patience    # configured stop threshold
        self.verbose = verbose      # whether to print progress
        self.counter = 0            # consecutive epochs without improvement
        self.best_score = None      # best score so far (negated loss)
        self.early_stop = False     # set once patience is exhausted
        # Lowercase np.inf: the np.Inf alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf  # loss at the most recent checkpoint
        self.path = path            # where the best model is stored

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stop state.

        Args:
            val_loss: validation loss of the epoch (lower is better).
            model: model whose ``state_dict`` is saved on improvement.
        """
        score = -val_loss
        # NaN is the only value that differs from itself. Without this guard
        # a NaN loss would fail the "worse" comparison, be saved as the best
        # model, and leave best_score as NaN so that stopping never triggers.
        loss_is_nan = val_loss != val_loss

        improved = (not loss_is_nan) and (
            self.best_score is None or score >= self.best_score
        )

        if improved:
            # First valid epoch, or a loss at least as good as the best one.
            self.best_score = score
            self.checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def checkpoint(self, val_loss, model):
        """Save the model's ``state_dict`` to ``path`` and remember ``val_loss``."""
        if self.verbose:
            # Report the previous checkpoint's loss next to the new one.
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
# Hyperparameters.
learning_rate = 0.0001  # step size passed to Adam
num_hidden_dim = 50  # size of the LSTM hidden state
output_dim = 1  # one regression value per sample

# The model takes one input feature per entry in `features`.
model = RegressionLSTM(len(features), num_hidden_dim, output_dim)
# Mean squared error between the prediction and the shifted target.
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"device: {device}")
model.to(device)

# Stops after 5 consecutive epochs without a validation improvement; the best
# weights are written to EarlyStopping's default path.
earlystopping = EarlyStopping(patience=5, verbose=True)

# 100000 is only an upper bound; the loop is expected to end via early stopping.
for epoch in range(100000):
    num_batches = len(train_loader)
    sumloss = 0.0
    model.train()

    # --- training pass: one optimizer step per mini-batch ---
    for data in train_loader:
        X = data[0].to(device)  # feature windows
        y = data[1].to(device)  # shifted target values
        a = model(X)
        loss = loss_function(a, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        sumloss += loss.item()

    # Mean of the per-batch losses for this epoch.
    arg_loss = sumloss / num_batches
    print()
    print(f"train:{epoch:06} {arg_loss}")

    num_batches_val = len(val_loader)
    sumloss_val = 0.0
    model.eval()

    # --- validation pass: no gradients, no parameter updates ---
    with torch.no_grad():
        for data in val_loader:
            X = data[0].to(device)
            y = data[1].to(device)
            a = model(X)
            sumloss_val += loss_function(a, y).item()

    # Mean of the per-batch validation losses.
    arg_loss_val = sumloss_val / num_batches_val
    print(f"val  :{epoch:06} {arg_loss_val}")

    # Checkpoint on improvement, otherwise count towards the patience limit.
    earlystopping(arg_loss_val, model)
    if earlystopping.early_stop:
        print("Early Stopping!")
        break

In [None]:
def predict(data_loader, model):
    """Run ``model`` over every batch of ``data_loader`` and return all outputs.

    Batches are moved to the module-level ``device`` before the forward pass.
    The model is put into eval mode and gradients are disabled.

    Args:
        data_loader: iterable yielding input batches (features only).
        model: module called on each batch.

    Returns:
        A tensor on ``device`` with the per-batch outputs concatenated along
        dimension 0, or an empty tensor when ``data_loader`` yields nothing.
    """
    # Seed the list with an empty tensor so an empty loader still returns an
    # empty tensor on `device`, exactly as before.
    chunks = [torch.tensor([]).to(device)]
    model.eval()
    with torch.no_grad():
        for X in data_loader:
            chunks.append(model(X.to(device)))

    # Concatenate once at the end instead of re-copying the growing result
    # tensor on every batch.
    return torch.cat(chunks, 0)

# Restore the best weights saved by EarlyStopping. map_location keeps the load
# working when the checkpoint was written on a different device than the
# current `device`.
model.load_state_dict(torch.load('checkpoint_model.pth', map_location=device))

a_col = "model predict"
output = predict(test_loader, model)
output = output.to('cpu')
# test_loader is not shuffled, so predictions line up with test_df rows.
# NOTE(review): the model was trained on the one-row-ahead target, while each
# prediction is written against its own row's datetime here -- confirm this is
# the intended alignment.
test_df[a_col] = output.numpy()

# Work on an explicit copy so the edits below act on an independent frame
# rather than on a selection of test_df (avoids pandas' chained-assignment
# warning).
out_df = test_df[["datetime", a_col]].copy()
out_df.columns = ["datetime", "count"]
# Negative predictions are replaced by zero.
out_df["count"] = out_df["count"].clip(lower=0.0)
out_df = out_df.set_index("datetime")
out_df.to_csv("submission.csv")