In [16]:
import random
## Import PyTorch package for modeling
import torch
from torch.autograd import Variable
import torch.optim as optim

## Data processing
import pandas as pd
import numpy as np

## Min-max normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

## Plot the graph
import matplotlib.pyplot as plt
%matplotlib inline

## Initializing module
from sklearn.linear_model import LinearRegression
np.set_printoptions(suppress=True)

## Copy module
import copy

## Used to calculate the training time
import time

## Set the GPU environment
import os
import sys

from matplotlib.ticker import MaxNLocator
import matplotlib.ticker as ticker

In [17]:
## Display options: print NumPy arrays and PyTorch tensors in plain decimal
## notation (no scientific notation) and never abbreviate long NumPy arrays.
np.set_printoptions(
    threshold=np.inf,
    suppress=True,
)
torch.set_printoptions(sci_mode=False)

In [18]:
## Seed every random number generator this notebook imports (Python, PyTorch,
## NumPy) so that a fresh "Restart & Run All" reproduces the same results.
## Seeding only `random` leaves NumPy / PyTorch draws different on every run.
SEED = 168

random.seed(SEED)        # Python built-in RNG
torch.manual_seed(SEED)  # PyTorch RNG (kept off the last line: it returns a Generator object)
np.random.seed(SEED)     # NumPy legacy global RNG

# Data preparation

In [19]:
## Read in the raw data ("demand.csv", resolved relative to the current working directory)
demand = pd.read_csv("demand.csv")

In [20]:
## Separate the x and y data:
## every column except the last one goes to x, the last column becomes y
## (kept as a one-column DataFrame rather than a Series).
last_col = demand.shape[1] - 1
demand_x = demand.iloc[:, :last_col]
demand_y = demand.iloc[:, last_col].to_frame()

In [21]:
## Exclude the rows whose six demand columns "t-4" ... "t+1" are all 0.
## The mask is computed once from demand_x and applied to both frames so
## x and y keep exactly the same rows.
lag_columns = ["t-4", "t-3", "t-2", "t-1", "t", "t+1"]
has_demand = ~(demand_x[lag_columns] == 0).all(axis=1)

## Filter, then renumber the rows from 0
demand_y = demand_y[has_demand].reset_index(drop=True)
demand_x = demand_x[has_demand].reset_index(drop=True)

In [22]:
## Add the "Number" column of demand_x to demand_y as its first column
demand_y.insert(loc=0, column="Number", value=demand_x["Number"])

In [23]:
## Move the "Month_date" column out of demand_x and keep it in `date`
## (pop removes the column from the frame and returns it)
date = demand_x.pop("Month_date")

## Display the extracted dates as the cell output
date

0       2013/5/1
1       2013/6/1
2       2013/7/1
3       2013/8/1
4       2013/9/1
          ...   
2409    2020/2/1
2410    2020/3/1
2411    2020/4/1
2412    2020/5/1
2413    2020/6/1
Name: Month_date, Length: 2414, dtype: object

In [24]:
## Work on independent deep copies of x / y so that encoding does not
## modify the un-encoded frames, and create the label encoder
demand_x_encode = demand_x.copy(deep=True)
demand_y_encode = demand_y.copy(deep=True)
labelencoder = LabelEncoder()

In [25]:
## Encode the material number as integer codes starting from 1.
##
## The encoder is fitted on the x frame and reused for the y frame so both share
## one mapping. LabelEncoder yields 0-based codes, hence the +1.
##
## The codes are computed from the un-encoded frames (demand_x / demand_y), not
## from the columns being overwritten. This keeps the cell idempotent:
## re-running it does not re-fit the encoder on already-encoded integers,
## which would break labelencoder.inverse_transform later on.
number_codes_x = labelencoder.fit_transform(demand_x["Number"])
number_codes_y = labelencoder.transform(demand_y["Number"])

demand_x_encode["Number"] = number_codes_x + 1
demand_y_encode["Number"] = number_codes_y + 1

In [26]:
## Print the material number behind each code.
## The entry at position k is the material whose (1-based) code is k + 1.
n_materials = len(demand_x_encode["Number"].unique())
material_num = list(range(n_materials))
print(labelencoder.inverse_transform(material_num))

['CC1101040' 'CC1101050' 'CC110105002300' 'CC1101060' 'CC110106001800'
 'CC110106004900' 'CC1101080' 'CC110108002300' 'CC110108002800'
 'CC110108004300' 'CC110108004500' 'CC110108004700' 'CC1101100'
 'CC1102035' 'CC1102070' 'CC1102080' 'CC110208001800' 'CC110208002500'
 'CC110208003600' 'CC1102090' 'CC1102100' 'CC110210002800' 'CC1102120'
 'CC1102150' 'CC1102160' 'CC1102200' 'CC1102300' 'CC1102400']


In [49]:
## Per-material outlier removal and 80/20 train/test split.
##
## For every encoded material number:
##   1. select that material's rows from the encoded x / y frames,
##   2. drop the rows whose target ("t+2") lies outside [Q1 - n*IQR, Q3 + n*IQR],
##   3. put the first 80% of the remaining rows (in row order) into the training
##      set and the rest into the test set.
## The split sizes per material are summarized in `data_content`.

numbers = list(demand_x_encode["Number"].unique())

## How many IQRs beyond Q1 / Q3 a target may lie before it counts as an outlier.
## NOTE: with n = 0 the fences are exactly [Q1, Q3], so every target outside the
## inter-quartile range is removed.
n = 0

## Fraction of each material's rows (after outlier removal) used for training
TRAIN_RATIO = 0.8

## Collect the per-material pieces in lists and concatenate once after the loop;
## growing arrays / DataFrames inside the loop copies them on every iteration,
## and DataFrame.append no longer exists in pandas >= 2.0.
x_train_parts, y_train_parts, x_test_parts, y_test_parts = [], [], [], []
records = []

for number in numbers:

    target_x = demand_x_encode[demand_x_encode["Number"] == number].reset_index(drop=True)
    target_y = demand_y_encode[demand_y_encode["Number"] == number].reset_index(drop=True)

    target_x_volume = np.array(target_x)
    target_y_volume = np.array(target_y["t+2"])
    target_y = np.array(target_y)

    ## Outlier fences: IQR = Q3 - Q1, upper = Q3 + n*IQR, lower = Q1 - n*IQR
    q1, q3 = np.percentile(target_y_volume, [25, 75])
    IQR = q3 - q1
    upper = q3 + n * IQR
    lower = q1 - n * IQR

    ## Method 1 (used): drop the outliers. One mask built from the targets is
    ## applied to x, y and the target vector so their rows stay aligned.
    keep = (target_y_volume <= upper) & (target_y_volume >= lower)
    target_x_volume = target_x_volume[keep]
    target_y = target_y[keep]
    target_y_volume = target_y_volume[keep]

    ## Method 2 (not used): replace the outliers with the upper / lower fence
    ## value instead of dropping them.

    n_rows = target_x_volume.shape[0]
    split = int(n_rows * TRAIN_RATIO)

    ## Codes are 1-based, the encoder is 0-based, hence number - 1
    records.append({
        "Number": labelencoder.inverse_transform([number - 1])[0],
        "Train": split,
        "Test": n_rows - split,
    })

    x_train_parts.append(target_x_volume[:split])
    y_train_parts.append(target_y[:split])

    x_test_parts.append(target_x_volume[split:])
    y_test_parts.append(target_y[split:])

data_content = pd.DataFrame(records, columns=["Number", "Train", "Test"])

## Flatten and join the pieces. Starting from an empty float64 array makes the
## result float64 for numeric input and keeps this working when there are no
## materials at all.
empty = np.array([])
x_train = np.concatenate([empty, *(part.ravel() for part in x_train_parts)])
y_train = np.concatenate([empty, *(part.ravel() for part in y_train_parts)])
x_test = np.concatenate([empty, *(part.ravel() for part in x_test_parts)])
y_test = np.concatenate([empty, *(part.ravel() for part in y_test_parts)])

## Restore one row per sample for x. y_train / y_test are intentionally left
## flat: each sample contributes two consecutive values (Number, "t+2").
x_train = x_train.reshape(-1, demand_x_encode.shape[1])
x_test = x_test.reshape(-1, demand_x_encode.shape[1])

In [51]:
## Save the per-material train / test sample counts to "Data_description.csv" (row index not written)
data_content.to_csv("Data_description.csv", index=False)