## Import modul dan library
pandas untuk memproses dataset <br>
numpy untuk memproses array <br>
sklearn untuk melakukan scaling menggunakan min-max scaler

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## Import CSV

In [2]:
df = pd.read_csv('CSV\BTC-USD.csv')
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2014-09-17,465.864014,468.174011,452.421997,457.334015,457.334015,21056800
1,2014-09-18,456.859985,456.859985,413.104004,424.440002,424.440002,34483200
2,2014-09-19,424.102997,427.834991,384.532013,394.79599,394.79599,37919700
3,2014-09-20,394.673004,423.29599,389.882996,408.903992,408.903992,36863600
4,2014-09-21,408.084991,412.425995,393.181,398.821014,398.821014,26580100


## Preprocessing

### Data Clensing
Membersihkan data dan menyamakan format data

#### Drop kolom yang tidak diperlukan

In [3]:
df = df.drop(['Adj Close', 'Volume'], axis=1)
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close
0,2014-09-17,465.864014,468.174011,452.421997,457.334015
1,2014-09-18,456.859985,456.859985,413.104004,424.440002
2,2014-09-19,424.102997,427.834991,384.532013,394.79599
3,2014-09-20,394.673004,423.29599,389.882996,408.903992
4,2014-09-21,408.084991,412.425995,393.181,398.821014


#### Menyamakan format <i>datetime</i> 

In [4]:
df['Date'] = pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250 entries, 0 to 3249
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3250 non-null   datetime64[ns]
 1   Open    3250 non-null   float64       
 2   High    3250 non-null   float64       
 3   Low     3250 non-null   float64       
 4   Close   3250 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 127.1 KB


### Normalisasi
Normalisasi data menggunakan min-max scaler agar skala datanya tidak besar (dari rentang 0 sampai 1)

#### Min-Max Scaling

In [5]:
def transform(data):
    data = np.array(data)
    data = np.reshape(data, (len(data),1))
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)
    return scaled

### Join semua data dan membuat ID

#### Menggabungkan semua data yang sudah di-scaling menjadi 1 dataset

In [6]:
open = pd.DataFrame(transform(df['Open']), columns=['Open'])
high = pd.DataFrame(transform(df['High']), columns=['High'])
low = pd.DataFrame(transform(df['Low']), columns=['Low'])
close = pd.DataFrame(transform(df['Close']), columns=['Close'])

df_scaled = df['Date']
df_scaled = pd.concat([df_scaled, open, high, low, close], axis=1)
df_scaled

Unnamed: 0,Date,Open,High,Low,Close
0,2014-09-17,0.004289,0.003739,0.004243,0.004144
1,2014-09-18,0.004155,0.003574,0.003649,0.003655
2,2014-09-19,0.003669,0.003151,0.003217,0.003216
3,2014-09-20,0.003232,0.003085,0.003298,0.003425
4,2014-09-21,0.003431,0.002927,0.003348,0.003275
...,...,...,...,...,...
3245,2023-08-06,0.428464,0.422134,0.434860,0.428317
3246,2023-08-07,0.428387,0.423351,0.431240,0.430376
3247,2023-08-08,0.430487,0.436949,0.437125,0.439055
3248,2023-08-09,0.439195,0.435734,0.441097,0.436028


#### Memberikan ID pada semua data dalam dataset

In [14]:
id = []
for i in range(1,len(df_scaled)+1):
    id.append(i)
df_scaled['id'] = id
df_scaled

Unnamed: 0,Date,Open,High,Low,Close,id
0,2014-09-17,0.004289,0.003739,0.004243,0.004144,1
1,2014-09-18,0.004155,0.003574,0.003649,0.003655,2
2,2014-09-19,0.003669,0.003151,0.003217,0.003216,3
3,2014-09-20,0.003232,0.003085,0.003298,0.003425,4
4,2014-09-21,0.003431,0.002927,0.003348,0.003275,5
...,...,...,...,...,...,...
3245,2023-08-06,0.428464,0.422134,0.434860,0.428317,3246
3246,2023-08-07,0.428387,0.423351,0.431240,0.430376,3247
3247,2023-08-08,0.430487,0.436949,0.437125,0.439055,3248
3248,2023-08-09,0.439195,0.435734,0.441097,0.436028,3249


### Data splitting
Data dibagi menjadi data uji dan data latih (Sementara 80%)

In [22]:
dataNum = int(len(df_scaled)*0.8)
train = df_scaled.iloc[:dataNum,:]
test = df_scaled.iloc[dataNum:,:]

#### Dataset training

In [23]:
train

Unnamed: 0,Date,Open,High,Low,Close,id
0,2014-09-17,0.004289,0.003739,0.004243,0.004144,1
1,2014-09-18,0.004155,0.003574,0.003649,0.003655,2
2,2014-09-19,0.003669,0.003151,0.003217,0.003216,3
3,2014-09-20,0.003232,0.003085,0.003298,0.003425,4
4,2014-09-21,0.003431,0.002927,0.003348,0.003275,5
...,...,...,...,...,...,...
2595,2021-10-25,0.901209,0.926211,0.914058,0.932823,2596
2596,2021-10-26,0.932956,0.918916,0.903476,0.893112,2597
2597,2021-10-27,0.893166,0.892758,0.876547,0.865193,2598
2598,2021-10-28,0.865242,0.902870,0.876528,0.896946,2599


#### Dataset Testing

In [21]:
test

Unnamed: 0,Date,Open,High,Low,Close,id
2600,2021-10-30,0.921179,0.905808,0.917480,0.915743,2601
2601,2021-10-31,0.915407,0.906917,0.904732,0.907286,2602
2602,2021-11-01,0.907540,0.907104,0.899006,0.902618,2603
2603,2021-11-02,0.902238,0.933698,0.913775,0.935591,2604
2604,2021-11-03,0.936244,0.923114,0.921496,0.931787,2605
...,...,...,...,...,...,...
3245,2023-08-06,0.428464,0.422134,0.434860,0.428317,3246
3246,2023-08-07,0.428387,0.423351,0.431240,0.430376,3247
3247,2023-08-08,0.430487,0.436949,0.437125,0.439055,3248
3248,2023-08-09,0.439195,0.435734,0.441097,0.436028,3249


## Database

### Setup MongoDB

In [24]:
from pymongo import MongoClient

In [25]:
try:
    client = MongoClient('127.0.0.1', 27017)
except:
    print("Connection failed")

In [26]:
database = client['SKRIPSI_LSTM']
collection_train = database['train']
collection_test = database['test']

### Masukkan dataset ke database

In [32]:
collection_train.insert_many(train.to_dict('records'))
collection_test.insert_many(test.to_dict('records'))

<pymongo.results.InsertManyResult at 0x26946c5f880>