<a href="https://colab.research.google.com/github/AbhishekGiri1617/CrytoCurrency-Price-Prediction/blob/main/cryptocurrency_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Using the last 60 minutes of cryptocurrency pricing data to predict whether the price will be higher 3 minutes later

In [None]:
import pandas as pd
import os

# load and prepare datasets

In [None]:
# Peek at one raw file. No `names=` is passed, so pandas uses the first data
# row as the header (see the numeric column labels in the output below).
df = pd.read_csv('/content/BCH-USD.csv')
df.head()

Unnamed: 0,1528968660,871.650024,871.729980,871.650024.1,871.719971,5.675361
0,1528968720,870.859985,871.719971,871.719971,870.859985,26.856577
1,1528968780,870.099976,871.090027,871.090027,870.099976,1.1243
2,1528968840,868.830017,870.950012,868.830017,870.789978,1.749862
3,1528968900,870.0,870.0,870.0,870.0,1.6805
4,1528968960,869.98999,870.0,870.0,869.98999,1.669014


Here we need to add column names and merge all the data frames into a single DataFrame.


In [None]:
ratios = ['BTC-USD', 'LTC-USD', 'ETH-USD', 'BCH-USD']
main_df = pd.DataFrame()

for ratio in ratios:
    dataset = f'/content/{ratio}.csv'  # one CSV per currency pair

    # The files carry no header row, so column names are supplied here.
    # close/volume are prefixed with the pair name so they stay distinguishable
    # after merging, `time` becomes the index, and only those two columns are kept.
    df = (
        pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
        .rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
        .set_index("time")
    )[[f"{ratio}_close", f"{ratio}_volume"]]

    # The first frame seeds main_df; every later frame is joined onto it by index.
    main_df = main_df.join(df) if len(main_df) else df

print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968660    6489.549805        0.587100      96.580002        9.647200   
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  
time                                                                      
1528968660            NaN             NaN     871.719971        5.675361  
1528968720      486.01001       26.019083     870.859985       26.856577  
1528968780      486.00000        8.449400     870.099976        1.124300  
1528968840      485.75000       26.994646     870.789978        1.749862  
1528968900      4

In [None]:
# List the merged column names, one per line, to confirm every pair contributed
# a `<pair>_close` and a `<pair>_volume` column.
for c in main_df.columns:
    print(c)

BTC-USD_close
BTC-USD_volume
LTC-USD_close
LTC-USD_volume
ETH-USD_close
ETH-USD_volume
BCH-USD_close
BCH-USD_volume


In [None]:
# define parameters

SEQ_LEN = 60 # number of consecutive rows fed to the model as one sequence (timestamps in the output above are 60 s apart)
FUTURE_PERIOD_PREDICT = 3 # how many rows ahead the 'future' price is taken from
RATION_TO_PREDICT = "ETH-USD" # currency pair whose price movement we predict


In [None]:
def classify(current, future):
    """Label one sample: 1 if `future` is strictly greater than `current`, else 0.

    Both arguments are converted with float() before the comparison. Equal
    prices, and any comparison that involves NaN, yield 0.
    """
    rises = float(future) > float(current)
    return int(rises)

In [None]:
# 'future' holds the target pair's close price FUTURE_PERIOD_PREDICT rows later
# (a negative shift pulls later rows up), so the last FUTURE_PERIOD_PREDICT
# rows have no future value and end up NaN.
main_df['future'] = main_df[f"{RATION_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)
print(main_df[[f"{RATION_TO_PREDICT}_close", "future"]].head())

            ETH-USD_close     future
time                                
1528968660            NaN  485.75000
1528968720      486.01001  486.00000
1528968780      486.00000  486.00000
1528968840      485.75000  485.98999
1528968900      486.00000  485.98999


In [None]:
# create target column
#
# classify() returns 0 for any comparison that involves NaN. Rows whose current
# close is missing (gaps left by the join) or whose future close is missing
# (the tail produced by the negative shift) would therefore receive a made-up
# "0" label. Those rows are labelled NaN instead, so the dropna() calls in
# preprocess_df discard them rather than training on fabricated labels.
current_close = main_df[f"{RATION_TO_PREDICT}_close"]
has_both_prices = current_close.notna() & main_df['future'].notna()

# A plain list is assigned positionally, so no index alignment is involved.
main_df['target'] = [
    classify(current, future) if known else float('nan')
    for current, future, known in zip(current_close, main_df['future'], has_both_prices)
]
print(main_df[[f"{RATION_TO_PREDICT}_close", "future", 'target']].head())

            ETH-USD_close     future  target
time                                        
1528968660            NaN  485.75000       0
1528968720      486.01001  486.00000       0
1528968780      486.00000  486.00000       0
1528968840      485.75000  485.98999       1
1528968900      486.00000  485.98999       0


### Separate data into training and validation datasets

We can't shuffle the data and then split it, because that does not work for sequential data; instead, the most recent 5% of the timeline is held out for validation.

In [None]:
# Find the timestamp at which the most recent 5% of rows begins; it is the
# boundary between training and validation data.
times = sorted(main_df.index.values)
last_5pct = times[-int(0.05 * len(times))]
print(last_5pct)

1534922100


In [None]:
# Validation = rows at or after the boundary timestamp; training = rows before it.
# NOTE: main_df is rebound to the training slice, so cells below see training rows only.
validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]


In [None]:
# this function scales, normalizes and balances the data
from sklearn import preprocessing
from collections import deque
import numpy as np
import random
import time
def preprocess_df(df):
    """Turn a merged price frame into class-balanced (sequence, label) data.

    Steps, in order:
      1. Drop the helper column 'future' (it only served to derive 'target').
      2. For every column except 'target': replace the values with their
         percentage change, drop rows containing NaN, then rescale the column
         with sklearn's preprocessing.scale (standardization per the sklearn
         docs, not a 0-1 range).
      3. Slide a window of SEQ_LEN consecutive rows over the frame. Each full
         window holds every column but the last and is paired with the
         last-column value of the window's final row.
      4. Shuffle, split by label (0 / 1), cut both classes to the size of the
         smaller one, merge and shuffle again.

    The windowing slices by position, so 'target' must be the last column
    once 'future' has been removed.

    Returns:
        tuple: (x, y) where x is np.array of the windows and y is a plain
        Python list of the matching labels.
    """
    frame = df.drop(columns='future')

    feature_columns = [name for name in frame.columns if name != 'target']
    for name in feature_columns:
        # pct_change leaves NaN in the first row; drop it before scaling
        frame[name] = frame[name].pct_change()
        frame.dropna(inplace=True)
        frame[name] = preprocessing.scale(frame[name].values)
    frame.dropna(inplace=True)

    # Rolling window: once the deque is full, appending pushes out the oldest row.
    samples = []
    window = deque(maxlen=SEQ_LEN)
    for row in frame.values:
        window.append(list(row[:-1]))
        if len(window) == SEQ_LEN:
            samples.append([np.array(window), row[-1]])

    random.shuffle(samples)

    # Separate the two classes so they can be cut to equal size.
    sells = [[seq, label] for seq, label in samples if label == 0]
    buys = [[seq, label] for seq, label in samples if label == 1]

    random.shuffle(buys)
    random.shuffle(sells)

    keep = min(len(buys), len(sells))
    samples = buys[:keep] + sells[:keep]

    # Shuffle again so the model does not see all buys followed by all sells.
    random.shuffle(samples)

    # split data into x, y
    x = [seq for seq, _ in samples]
    y = [label for _, label in samples]

    return np.array(x), y

In [None]:
# Build (sequences, labels) for each split. preprocess_df is applied to the two
# frames separately, so each split is scaled and balanced on its own rows.
train_x , train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [None]:
# Sanity check: number of samples per split and per class.
# list.count() is usable here because preprocess_df returns the labels as a plain list.
print(f"training data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"validation dont buys: {validation_y.count(0)} buys: {validation_y.count(1)}")

training data: 74196 validation: 3260
Dont buys: 37098, buys: 37098
validation dont buys: 1630 buys: 1630


### Training and Predictions

In [None]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization # Changed line - removed CuDNNLSTM
# from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

# # define model parameter
# EPOCHS = 10
# BATCH_SIZE = 32
# NAME = f"{RATION_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

# model = Sequential()
# # add layers to model
# # if you using cpu version use LSTM not CuDNNLSTM, if you have gpu, tensorflow will automatically
# # use the cudnn implementation if conditions are met.
# model.add(LSTM(64, input_shape=(train_x.shape[1:]), return_sequences=True)) # Changed line - replaced CuDNNLSTM with LSTM
# model.add(Dropout(0.2))
# model.add(BatchNormalization())

# model.add(LSTM(64, input_shape=(train_x.shape[1:]), return_sequences=True)) # Changed line - replaced CuDNNLSTM with LSTM
# model.add(Dropout(0.2))
# model.add(BatchNormalization())

# model.add(LSTM(64, input_shape=(train_x.shape[1:]))) # Changed line - replaced CuDNNLSTM with LSTM
# model.add(Dropout(0.2))
# model.add(BatchNormalization())

# model.add(Dense(6,activation="relu"))
# model.add(Dropout(0.2))

# model.add(Dense(2, activation="softmax"))

# opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6) # Changed line - replaced 'lr' with 'learning_rate'

# model.compile(loss='sparse_categorical_crossentropy',
#              optimizer=opt,
#              metrics=['accuracy'])

# # tensorbord = TensorBoard(log_dir=f'log/{NAME}')

# # filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}" # uniqe file name that will incluse the epoch and the validation acc for that ecpoch
# # checkpoint = ModelCheckpoint("../input/{}.model".format(filepath, monitor='val_acc', verbos=1, save_best_only=True, mod='max'))

# history = model.fit(
#         train_x, train_y,
#         batch_size=BATCH_SIZE,
#         epochs=EPOCHS,
#         validation_data=(validation_x, validation_y))

In [None]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import numpy as np
import random
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

# Load and prepare datasets
ratios = ['BTC-USD', 'LTC-USD', 'ETH-USD', 'BCH-USD']
main_df = pd.DataFrame()

for ratio in ratios:
    dataset = f'/content/{ratio}.csv'
    # Name the header-less columns, prefix close/volume with the pair name,
    # index by time and keep only those two columns.
    df = (
        pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
        .rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
        .set_index("time")
    )[[f"{ratio}_close", f"{ratio}_volume"]]
    # First frame seeds main_df; the rest are joined onto it by index.
    main_df = main_df.join(df) if len(main_df) else df

# Define parameters
SEQ_LEN = 60                   # rows per input sequence
FUTURE_PERIOD_PREDICT = 3      # rows ahead used for the 'future' price
RATION_TO_PREDICT = "ETH-USD"  # pair whose close price is predicted

# Classify function
def classify(current, future):
    """Return 1 when `future` is strictly above `current`, otherwise 0.

    Inputs are coerced with float(); ties and NaN comparisons give 0.
    """
    return 1 if float(future) > float(current) else 0

# Create target column
# 'future' is the target pair's close FUTURE_PERIOD_PREDICT rows later.
main_df['future'] = main_df[f"{RATION_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)

# classify() returns 0 for any comparison that involves NaN, so rows with a
# missing current close (join gaps) or a missing future close (tail of the
# shift) would get a fabricated "0" label. They are labelled NaN instead and
# are then removed by the dropna() calls in preprocess_df.
current_close = main_df[f"{RATION_TO_PREDICT}_close"]
has_both_prices = current_close.notna() & main_df['future'].notna()
# A plain list is assigned positionally, so no index alignment is involved.
main_df['target'] = [
    classify(current, future) if known else float('nan')
    for current, future, known in zip(current_close, main_df['future'], has_both_prices)
]

# Separate data into training and validating datasets
# The boundary is the timestamp at which the most recent 5% of rows begins;
# rows at/after it are validation, rows before it are training.
times = sorted(main_df.index.values)
last_5pct = times[-int(0.05 * len(times))]
validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]

# Preprocess data
def preprocess_df(df):
    """Convert a merged price frame into class-balanced (sequence, label) data.

    The helper column 'future' is dropped. Every column except 'target' is
    replaced by its percentage change and rescaled with sklearn's
    preprocessing.scale, with NaN rows dropped along the way. Windows of
    SEQ_LEN consecutive rows (all columns but the last) are paired with the
    last-column value of their final row, then shuffled and balanced so both
    labels (0 and 1) occur equally often.

    The windowing slices by position, so 'target' must be the last column
    after 'future' is removed.

    Returns:
        tuple: (x, y) with x an np.array of windows and y a plain list of labels.
    """
    frame = df.drop(columns='future')

    feature_columns = [name for name in frame.columns if name != 'target']
    for name in feature_columns:
        frame[name] = frame[name].pct_change()
        frame.dropna(inplace=True)  # pct_change leaves NaN in the first row
        frame[name] = preprocessing.scale(frame[name].values)
    frame.dropna(inplace=True)

    # The deque keeps only the latest SEQ_LEN rows; a sample is emitted whenever it is full.
    samples = []
    window = deque(maxlen=SEQ_LEN)
    for row in frame.values:
        window.append(list(row[:-1]))
        if len(window) == SEQ_LEN:
            samples.append([np.array(window), row[-1]])

    random.shuffle(samples)

    sells = [[seq, label] for seq, label in samples if label == 0]
    buys = [[seq, label] for seq, label in samples if label == 1]
    random.shuffle(buys)
    random.shuffle(sells)

    # Balance the classes by truncating both to the smaller count.
    keep = min(len(buys), len(sells))
    samples = buys[:keep] + sells[:keep]
    random.shuffle(samples)

    x = [seq for seq, _ in samples]
    y = [label for _, label in samples]

    return np.array(x), y

# Build balanced (sequences, labels) pairs for the training and validation frames.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

# Training and Predictions
EPOCHS = 50
BATCH_SIZE = 64
# Run identifier (parameters + timestamp); not used further in this cell.
NAME = f"{RATION_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

# Three stacked LSTM blocks followed by a small dense head with a 2-way softmax.
# Only the first layer needs the input shape; Sequential infers it for the rest,
# so the redundant `input_shape` arguments on the later LSTM layers are dropped.
model = Sequential()
model.add(LSTM(64, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Last recurrent layer returns only its final output for the dense head.
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(6, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))

# Current Keras optimizers no longer support the `decay` keyword (depending on
# the version it is rejected or silently ignored). An InverseTimeDecay schedule
# with decay_steps=1 reproduces the old rule: lr = lr0 / (1 + decay * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=1,
    decay_rate=1e-6,
)
opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

model.compile(loss='sparse_categorical_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])

# Ensure all arrays have the expected dimensions and data type
# (preprocess_df returns the labels as a Python list, which fit() cannot consume directly).
train_x = np.asarray(train_x).astype(np.float32)
validation_x = np.asarray(validation_x).astype(np.float32)
train_y = np.asarray(train_y).astype(np.float32)
validation_y = np.asarray(validation_y).astype(np.float32)

# Check if validation data is empty and handle it: validation_data is only
# passed to fit() when there is something to validate on.
fit_kwargs = {}
if validation_x.size == 0 or validation_y.size == 0:
    print("Validation data is empty. Skipping validation.")
else:
    fit_kwargs["validation_data"] = (validation_x, validation_y)

history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    **fit_kwargs
)

In [None]:
# prompt: use the above trained model to backtest
#
# Plots the training curves and scores the trained model on the validation set.
# The training cell may call fit() without validation data; in that case the
# history has no 'val_*' series and there is nothing to evaluate, so every
# validation-dependent step below is guarded instead of raising.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

# Assuming 'history' is the result of model.fit() as in your provided code
# Access the history object to get accuracy and loss values
accuracy = history.history['accuracy']
val_accuracy = history.history.get('val_accuracy')  # None when fit() ran without validation data
loss = history.history['loss']
val_loss = history.history.get('val_loss')  # None when fit() ran without validation data

# Plot the accuracy
plt.figure(figsize=(10, 5))
plt.plot(accuracy, label='Training Accuracy')
if val_accuracy is not None:
    plt.plot(val_accuracy, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot the loss
plt.figure(figsize=(10, 5))
plt.plot(loss, label='Training Loss')
if val_loss is not None:
    plt.plot(val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

if len(validation_x) == 0 or len(validation_y) == 0:
    print("Validation data is empty. Skipping evaluation.")
else:
    # Evaluate the model
    # NOTE: `accuracy` is rebound here from the per-epoch list to the single
    # validation score returned by evaluate().
    _, accuracy = model.evaluate(validation_x, validation_y)
    print('Accuracy: %.2f' % (accuracy*100))

    # Make predictions on the validation set
    # The model outputs one softmax probability per class; argmax picks the predicted label.
    predictions = model.predict(validation_x)
    predicted_classes = np.argmax(predictions, axis=1)

    # Further analysis: Confusion Matrix, Precision, Recall, F1-score
    print(confusion_matrix(validation_y, predicted_classes))
    print(classification_report(validation_y, predicted_classes))
