In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import LSTM
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [2]:
df = pd.read_csv('D:\MY WORK\Turing Internship\DATA\Google_train_data.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1/3/2012,325.25,332.83,324.97,663.59,7380500
1,1/4/2012,331.27,333.87,329.08,666.45,5749400
2,1/5/2012,329.83,330.75,326.89,657.21,6590300
3,1/6/2012,328.34,328.77,323.68,648.24,5405900
4,1/9/2012,322.04,322.29,309.46,620.76,11688800


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1258 non-null   object 
 1   Open    1258 non-null   float64
 2   High    1258 non-null   float64
 3   Low     1258 non-null   float64
 4   Close   1258 non-null   object 
 5   Volume  1258 non-null   object 
dtypes: float64(3), object(3)
memory usage: 59.1+ KB


In [4]:
df.drop(['Date'],axis='columns',inplace=True)

In [5]:
df.dtypes

Open      float64
High      float64
Low       float64
Close      object
Volume     object
dtype: object

In [6]:
df['Volume'] = df['Volume'].str.replace(',','')
df['Volume'] = df['Volume'].astype('float32')
df['Close'] = df['Close'].str.replace(',','')
df['Close'] = df['Close'].astype('float32')
df['Open'] = df['Open'].astype('float32')
df['High'] = df['High'].astype('float32')
df['Low'] = df['Low'].astype('float32')

In [7]:
df.dtypes

Open      float32
High      float32
Low       float32
Close     float32
Volume    float32
dtype: object

In [8]:
df.shape

(1258, 5)

In [9]:
df.isnull().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [10]:
df.isna().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [11]:
correlation = df.corr()
print(correlation['Close'].sort_values(ascending=False))

Close     1.000000
Volume    0.192357
Low       0.129001
Open      0.125832
High      0.124351
Name: Close, dtype: float64


In [12]:
X = df[['Open', 'High', 'Low', 'Volume']]

y = df['Close']

In [13]:
#Running LSTM
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
X_reshape = X_scaled.reshape((X_scaled.shape[0],1,X_scaled.shape[1]))

In [15]:
from sklearn.model_selection import KFold
fold = KFold(n_splits=10)

for train_index, test_index in fold.split(X_reshape, y):
    X_train, X_test = X_reshape[train_index], X_reshape[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [16]:
X_train.shape,y_train.shape

((1133, 1, 4), (1133,))

In [17]:
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(100,activation='relu',input_shape =(1, 4)),
    tf.keras.layers.Dense(80,activation='relu'),
    tf.keras.layers.Dense(60,activation='relu'),
    tf.keras.layers.Dense(40,activation='relu'),
    tf.keras.layers.Dense(20,activation='relu'),
    tf.keras.layers.Dense(1)
])

In [18]:
model.compile(optimizer='adam',loss='mae')

In [19]:
model.fit(X_train, y_train, epochs=100,validation_data=(X_test,y_test))

Epoch 1/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - loss: 702.9562 - val_loss: 764.6487
Epoch 2/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 688.0646 - val_loss: 589.6447
Epoch 3/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 465.3279 - val_loss: 299.9010
Epoch 4/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 214.6321 - val_loss: 10.2401
Epoch 5/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 170.4820 - val_loss: 32.3144
Epoch 6/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 150.0033 - val_loss: 49.0629
Epoch 7/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 119.6947 - val_loss: 35.4554
Epoch 8/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 117.1321 - val_loss: 81.3809
Epoch 9/100
[1m36/3

<keras.src.callbacks.history.History at 0x2a276e39370>

In [20]:
model.evaluate(X_test,y_test)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6945 


3.9708213806152344

In [21]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [22]:
from sklearn.metrics import mean_absolute_error

from sklearn.metrics import mean_squared_error
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 770.3359189453125


In [23]:
#Test dataset
df1 = pd.read_csv('D:\MY WORK\Turing Internship\DATA\Google_test_data.csv')
df1

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-08-13,1236.979980,1249.272949,1233.640991,1235.010010,1235.010010,997300
1,2018-08-14,1235.189941,1245.869995,1225.109985,1242.099976,1242.099976,1348100
2,2018-08-15,1229.260010,1235.239990,1209.510010,1214.380005,1214.380005,1828800
3,2018-08-16,1224.729980,1226.000000,1202.550049,1206.489990,1206.489990,1343200
4,2018-08-17,1202.030029,1209.020020,1188.239990,1200.959961,1200.959961,1389600
...,...,...,...,...,...,...,...
247,2019-08-07,1156.000000,1178.444946,1149.624023,1173.989990,1173.989990,1444300
248,2019-08-08,1182.829956,1205.010010,1173.020020,1204.800049,1204.800049,1468000
249,2019-08-09,1197.989990,1203.880005,1183.603027,1188.010010,1188.010010,1065700
250,2019-08-12,1179.209961,1184.959961,1167.671997,1174.709961,1174.709961,1003000


In [24]:
df1.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [25]:
df1.drop(['Adj Close','Date'],axis='columns',inplace=True)

In [26]:
df1.head()

Unnamed: 0,Open,High,Low,Close,Volume
0,1236.97998,1249.272949,1233.640991,1235.01001,997300
1,1235.189941,1245.869995,1225.109985,1242.099976,1348100
2,1229.26001,1235.23999,1209.51001,1214.380005,1828800
3,1224.72998,1226.0,1202.550049,1206.48999,1343200
4,1202.030029,1209.02002,1188.23999,1200.959961,1389600


In [27]:
df1.dtypes

Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object

In [28]:
df1.isnull().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [29]:
df1.isna().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [30]:
X_new = df1[['Open', 'High', 'Low', 'Volume']]

y_new = df1['Close']

In [31]:
scaler = MinMaxScaler()
X_scaled_new = scaler.fit_transform(X_new)

In [32]:
X_reshape_new = X_scaled_new.reshape((X_scaled_new.shape[0],1,X_scaled_new.shape[1]))

In [33]:
foldN = KFold(n_splits=10)

for train_index_new, test_index_new in foldN.split(X_reshape_new, y_new):
    X_new_train, X_new_test = X_reshape_new[train_index_new], X_reshape_new[test_index_new]
    y_new_train, y_new_test = y_new.iloc[train_index_new], y_new.iloc[test_index_new]

In [34]:
X_new_train.shape,y_new_train.shape

((227, 1, 4), (227,))

In [35]:
modelN = tf.keras.Sequential([
    tf.keras.layers.LSTM(100,activation='relu',input_shape =(1, 4)),
    tf.keras.layers.Dense(80,activation='relu'),
    tf.keras.layers.Dense(60,activation='relu'),
    tf.keras.layers.Dense(40,activation='relu'),
    tf.keras.layers.Dense(20,activation='relu'),
    tf.keras.layers.Dense(1)
])

In [36]:
modelN.compile(optimizer='adam',loss='mae')

In [37]:
modelN.fit(X_new_train, y_new_train, epochs=100,validation_data=(X_new_test,y_new_test))

Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 103ms/step - loss: 1128.2716 - val_loss: 1172.0934
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 1128.2311 - val_loss: 1171.8044
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 1128.3530 - val_loss: 1171.1831
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 1128.1350 - val_loss: 1169.8274
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 1124.9933 - val_loss: 1166.8654
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 1120.0397 - val_loss: 1160.4653
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 1109.7877 - val_loss: 1147.1433
Epoch 8/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 1105.3295 - val_loss: 1120.5604
Epoch 9

<keras.src.callbacks.history.History at 0x2a278b8c290>

In [38]:
modelN.evaluate(X_new_test,y_new_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 6.3525


6.352470874786377

In [39]:
y_pred_new = modelN.predict(X_new_test)
y_pred_new = np.argmax(y_pred_new, axis=1)
y_pred_new

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [40]:
mae = mean_absolute_error(y_new_test, y_pred_new)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 1172.2475928


In [41]:
#Simple NN 
foldNN = KFold(n_splits=10)

for train_index_NN, test_index_NN in foldNN.split(X_scaled, y):
    X_NN_train, X_NN_test = X_scaled[train_index_NN], X_scaled[test_index_NN]
    y_NN_train, y_NN_test = y.iloc[train_index_NN], y.iloc[test_index_NN]

In [42]:
X_NN_train.shape

(1133, 4)

In [43]:
modelNN = tf.keras.Sequential([
    
    tf.keras.layers.Dense(100,input_shape=(4,),activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(80,activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(60,activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(40,activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(20,activation='relu'),
    tf.keras.layers.Dense(1)
   
    
])

In [44]:
modelNN.compile(optimizer='adam',loss='mae')

In [45]:
modelNN.fit(X_NN_train, y_NN_train, epochs=50,validation_data=(X_NN_test,y_NN_test))

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 701.7191 - val_loss: 757.8161
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 686.1659 - val_loss: 251.6913
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 269.8618 - val_loss: 8.5079
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 192.2921 - val_loss: 26.7142
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 145.3030 - val_loss: 39.8570
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 134.3509 - val_loss: 35.0250
Epoch 7/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 131.1383 - val_loss: 55.8359
Epoch 8/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 127.6654 - val_loss: 46.1296
Epoch 9/50
[1m36/36[0m [32m

<keras.src.callbacks.history.History at 0x2a27f079220>

In [46]:
modelNN.evaluate(X_NN_test, y_NN_test)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 17.0656 


17.134105682373047

In [47]:
y_pred_NN = modelNN.predict(X_NN_test)
y_pred_NN = np.argmax(y_pred_NN,axis=1)
y_pred_NN

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [48]:
mae = mean_absolute_error(y_NN_test, y_pred_NN)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 770.3359189453125


In [49]:
#Running ML models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import  make_scorer,mean_absolute_error,mean_squared_error,r2_score

In [50]:
X_ML_train,X_ML_test,y_ML_train,y_ML_test = train_test_split(X,y,test_size=0.3,random_state=45)

In [51]:
def getscore(model,X_ML_train,X_ML_test,y_ML_train,y_ML_test):
    train = model.fit(X_ML_train,y_ML_train)
    score = model.score(X_ML_test,y_ML_test)
    yp = model.predict(X_ML_test)
    mae = mean_absolute_error(y_ML_test,yp)
    mse = mean_squared_error(y_ML_test,yp)
    r2 = r2_score(y_ML_test,yp)
    return score,mae,mse,r2

In [52]:
score_l = []
score_svm = []
score_rf = []
score_tree = []

score_l.append(getscore(LinearRegression(), X_ML_train,X_ML_test,y_ML_train,y_ML_test))
score_svm.append(getscore(SVR(), X_ML_train,X_ML_test,y_ML_train,y_ML_test))
score_rf.append(getscore(RandomForestRegressor(), X_ML_train,X_ML_test,y_ML_train,y_ML_test))
score_tree.append(getscore(DecisionTreeRegressor(),X_ML_train,X_ML_test,y_ML_train,y_ML_test))

In [53]:
models = ["Logistic Regression", "SVM", "Random Forest", "Decision Tree"]
scores = [score_l, score_svm, score_rf, score_tree]

for model_name, model_scores in zip(models, scores):
    for score, mae, mse, r2 in model_scores:
        print(f'{model_name}:')
        print(f'  Score: {score}')
        print(f'  Mean_absolute_error: {mae}')
        print(f'  Mean_squared_error: {mse}')
        print(f'  R2_score: {r2}')
        print()

Logistic Regression:
  Score: 0.19922113418579102
  Mean_absolute_error: 114.4417495727539
  Mean_squared_error: 23773.6875
  R2_score: 0.19922113418579102

SVM:
  Score: 0.011322594519208584
  Mean_absolute_error: 130.23355608934884
  Mean_squared_error: 29352.05854425743
  R2_score: 0.011322594519208584

Random Forest:
  Score: 0.7244715744239247
  Mean_absolute_error: 31.77756475943106
  Mean_squared_error: 8179.944674859024
  R2_score: 0.7244715744239247

Decision Tree:
  Score: 0.565239968768511
  Mean_absolute_error: 28.327403880931712
  Mean_squared_error: 12907.245395381684
  R2_score: 0.565239968768511



In [54]:
def cross_val(model,X,y):
    scoring_mae = make_scorer(mean_absolute_error, greater_is_better=False)
    scoring_mse = make_scorer(mean_squared_error, greater_is_better=False)
    scoring_r2 = make_scorer(r2_score)

    
    cv_mse = -cross_val_score(model,X,y,cv=5,scoring=scoring_mae).mean()
    cv_mae = -cross_val_score(model,X,y,cv = 5,scoring=scoring_mse).mean()
    cv_r2 = cross_val_score(model,X,y,cv = 5,scoring=scoring_r2).mean()
    return cv_mse,cv_mae,cv_r2
    

In [55]:
cv_score_l = cross_val(LinearRegression(),X,y)
cv_score_rf = cross_val(RandomForestRegressor(),X,y)
cv_score_tree = cross_val(DecisionTreeRegressor(),X,y)
cv_score_svm = cross_val(SVR(),X,y)


In [56]:
models_cv= ["Logistic Regression",  "Random Forest", "Decision Tree","SVM"]
scores_cv = [cv_score_l,  cv_score_rf, cv_score_tree, cv_score_svm,]

for model_name_cv,model_score_cv in zip(models_cv,scores_cv):
    mae, mse, r2 = model_score_cv
    print(f'{model_name_cv}:')
    print(f'Mean_absolute_error: {mae}')
    print(f'  Mean_squared_error: {mse}')
    print(f'  R2_score: {r2}')
    print()

Logistic Regression:
Mean_absolute_error: 158.51181640625
  Mean_squared_error: 39793.0130859375
  R2_score: -6.58026555776596

Random Forest:
Mean_absolute_error: 114.33096473917044
  Mean_squared_error: 44170.86481363818
  R2_score: -2.570654093646751

Decision Tree:
Mean_absolute_error: 117.79451784112078
  Mean_squared_error: 50352.250127618325
  R2_score: -2.9680213153013706

SVM:
Mean_absolute_error: 176.72898420043256
  Mean_squared_error: 40975.821087478704
  R2_score: -6.735022683825937

