In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')


In [3]:
def findBest(X,y):
    random_seeds = range(1,1000)
    test_splits = [0.1,0.15,0.2,0.25,0.3]
    best_seed = None
    best_split = None
    best_score = float('-inf')
    model = LinearRegression()
    for seed in random_seeds:
        for split in test_splits:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=seed)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = r2_score(y_test, y_pred)
            if score > best_score:
                best_score = score
                best_seed = seed
                best_split = split
    
    print("Best Score: ", best_score)
    print("Best Seed: ", best_seed)
    print("Best Split: ", best_split)
    return  best_seed, best_split


In [4]:
df = pd.read_csv('../Dataset/GR2CLEAN.csv')
df.tail()

Unnamed: 0,index,Date,Open,High,Low,Close,Volume,Time,Prev_High,Prev_Low,Prev_Volume,Prev_Open,Prev_Close,CloseChange,HighChange,Prev_CO_Diff,Prev_HL_Diff
2344,2345,2023-07-13,311.25,312.7,300.5,302.35,2840536,2345,314.55,308.1,2050698.0,311.5,308.9,-6.55,-1.85,-2.6,6.45
2345,2346,2023-07-14,305.3,310.45,302.35,308.8,1777667,2346,312.7,300.5,2840536.0,311.25,302.35,6.45,-2.25,-8.9,12.2
2346,2347,2023-07-17,312.0,314.5,310.1,311.3,1315832,2347,310.45,302.35,1777667.0,305.3,308.8,2.5,4.05,3.5,8.1
2347,2348,2023-07-18,312.1,312.75,306.0,310.0,974968,2348,314.5,310.1,1315832.0,312.0,311.3,-1.3,-1.75,-0.7,4.4
2348,2349,2023-07-19,310.95,317.95,310.05,313.45,2578074,2349,312.75,306.0,974968.0,312.1,310.0,3.45,5.2,-2.1,6.75


In [5]:
best_seeds = {
    'Close':708,
    'Open':514,
    'High':991,
    'Low':270,
    'Volume':776
}
def trainModel(atrr,df):
    X = df[['Time','Close','Open','High','Low','Prev_Volume']]
    y = df[atrr]
    best_seed , best_split = findBest(X,y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=best_split, random_state=best_seed)
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model



In [6]:
model_close = trainModel('Close',df)



Best Score:  1.0
Best Seed:  1
Best Split:  0.1


In [23]:
def trainAgain(df):
    X = df[['Time','Prev Close','Prev Open','PrevOPDiff','Prev High','Prev Low']]
    y_close = df['Close']
    y_open = df['Open']
    y_high = df['High']
    y_low = df['Low']
    y_volume = df['Volume']
    X_train_close,Y_train_close,X_test_close,Y_test_close = train_test_split(X,y_close,test_size=0.1,random_state=708)
    X_train_open,Y_train_open,X_test_open,Y_test_open = train_test_split(X,y_open,test_size=0.1,random_state=514)
    X_train_high,Y_train_high,X_test_high,Y_test_high = train_test_split(X,y_high,test_size=0.1,random_state=991)
    X_train_low,Y_train_low,X_test_low,Y_test_low = train_test_split(X,y_low,test_size=0.1,random_state=270)

    model_close = LinearRegression()
    model_close.fit(X_train_close,Y_train_close)
    model_open = LinearRegression()
    model_open.fit(X_train_open,Y_train_open)
    model_high = LinearRegression()
    model_high.fit(X_train_high,Y_train_high)
    model_low = LinearRegression()
    model_low.fit(X_train_low,Y_train_low)
    model_volume = LinearRegression()

    return model_close,model_open,model_high,model_low,model_volume


In [24]:
warnings.filterwarnings('ignore')


In [27]:
def PredictNext(df,n):
    last_row = df.tail(1)
    for i in range(n):
        input = [last_row['Time'].values[0]+1,last_row['Close'].values[0],last_row['Open'].values[0],(last_row['Open'].values[0]-last_row['Close'].values[0]),last_row['High'].values[0],last_row['Low'].values[0]]
        input = np.array(input).reshape(1,-1)
        pred_close = model_close.predict(input)
        pred_open = model_open.predict(input)
        pred_high = model_high.predict(input)
        pred_low = model_low.predict(input)
        new_row = {'Time':last_row['Time'].values[0]+1,'Close':pred_close[0],'Open':pred_open[0],'High':pred_high[0],'Low':pred_low[0],'Prev Close':last_row['Close'].values[0],'Prev Open':last_row['Open'].values[0],'PrevOPDiff':(last_row['Open'].values[0]-last_row['Close'].values[0]),'Prev High':last_row['High'].values[0],'Prev Low':last_row['Low'].values[0],'Prev Volume':last_row['Volume'].values[0]}
        df = df.append(new_row,ignore_index=True)
        last_row = df.tail(1)
    return df


In [28]:
df = PredictNext(df,1)


In [29]:
df.tail(2)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Time,Prev Close,Prev Open,PrevOPDiff,Prev High,Prev Low,Prev Volume
2348,2023-07-19,310.95,317.95,310.05,313.45,2578074.0,2349.0,310.0,312.1,2.1,312.75,306.0,974968.0
2349,,314.098002,318.694712,308.719133,313.05704,,2350.0,313.45,310.95,-2.5,317.95,310.05,2578074.0
