### DIAA MALAEB - INTRODUCTION TO AI FINAL PROJECT

### PROJECT: PREDICTING STOCK PRICES 

In [1]:
import pandas as pd

In [2]:
# Load datasets
df_1yr = pd.read_csv('all_stocks_1yr.csv')
df_5yr = pd.read_csv('all_stocks_5yr.csv')

# Display basic information
print(df_1yr.shape)
print(df_5yr.shape)


(126217, 7)
(606801, 7)


In [3]:
# Check for null values
# Display the non-null DataFrame
non_null_values_1yr = df_1yr.isnull().sum()
non_null_values_5yr = df_5yr.isnull().sum()
print(non_null_values_1yr)
print(non_null_values_5yr)

Date        0
Open      380
High      206
Low       224
Close       0
Volume      0
Name        0
dtype: int64
Date        0
Open      384
High      208
Low       227
Close       0
Volume    406
Name        0
dtype: int64


In [4]:
# Impute with median values
# For 1-year dataset
df_1yr['Open'] = df_1yr['Open'].fillna(df_1yr['Open'].median())
df_1yr['High'] = df_1yr['High'].fillna(df_1yr['High'].median())
df_1yr['Low'] = df_1yr['Low'].fillna(df_1yr['Low'].median())
df_1yr['Volume'] = df_1yr['Volume'].fillna(df_1yr['Volume'].median())


In [5]:
# For 5-year dataset
df_5yr['Open'] = df_5yr['Open'].fillna(df_5yr['Open'].median())
df_5yr['High'] = df_5yr['High'].fillna(df_5yr['High'].median())
df_5yr['Low'] = df_5yr['Low'].fillna(df_5yr['Low'].median())
df_5yr['Volume'] = df_5yr['Volume'].fillna(df_5yr['Volume'].median())


In [6]:
# Check for null values after filling
print(df_1yr.isnull().sum())
print(df_5yr.isnull().sum())


Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
Name      0
dtype: int64
Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
Name      0
dtype: int64


In [7]:
duplicates_1yr = df_1yr.duplicated()
duplicates_5yr = df_5yr.duplicated()
print(duplicates_1yr)
print(duplicates_5yr)

0         False
1         False
2         False
3         False
4         False
          ...  
126212    False
126213    False
126214    False
126215    False
126216    False
Length: 126217, dtype: bool
0         False
1         False
2         False
3         False
4         False
          ...  
606796    False
606797    False
606798    False
606799    False
606800    False
Length: 606801, dtype: bool


In [8]:
# Check for negative prices
invalid_prices_1yr = df_1yr[(df_1yr['Open'] < 0) | (df_1yr['High'] < 0) | (df_1yr['Low'] < 0) | (df_1yr['Close'] < 0)]
invalid_prices_5yr = df_5yr[(df_5yr['Open'] < 0) | (df_5yr['High'] < 0) | (df_5yr['Low'] < 0) | (df_5yr['Close'] < 0)]

# Check for invalid volumes
invalid_volumes_1yr = df_1yr[df_1yr['Volume'] <= 0]
invalid_volumes_5yr = df_5yr[df_5yr['Volume'] <= 0]

print(f"Invalid prices: {len(invalid_prices_1yr)}")
print(f"Invalid prices: {len(invalid_prices_5yr)}")

print(f"Invalid volumes: {len(invalid_volumes_1yr)}")
print(f"Invalid volumes: {len(invalid_volumes_5yr)}")

Invalid prices: 0
Invalid prices: 0
Invalid volumes: 1
Invalid volumes: 4


In [9]:
#Handling Invalid volumes
# Inspect rows with invalid volumes
invalid_volumes_rows_1yr = df_1yr[df_1yr['Volume'] <= 0]
invalid_volumes_rows_5yr = df_5yr[df_5yr['Volume'] <= 0]
print(invalid_volumes_rows_1yr)
print(invalid_volumes_rows_5yr)

# Drop rows where 'Volume' is 0
df_1yr = df_1yr[df_1yr['Volume'] > 0]
df_5yr = df_5yr[df_5yr['Volume'] > 0]

#using drop() with condition
df_1yr.drop(df_1yr[df_1yr['Volume'] == 0].index, inplace=True)
df_5yr.drop(df_5yr[df_5yr['Volume'] == 0].index, inplace=True)


             Date   Open   High    Low  Close  Volume Name
19691  2017-07-26  68.96  69.55  68.38  69.08       0  BHF
              Date   Open   High    Low  Close  Volume Name
97075   2017-07-26  59.24  59.79  58.69  69.08     0.0  BHF
247442  2016-06-16  59.24  59.79  58.69  47.00     0.0  FTV
247444  2016-06-20  59.24  59.79  58.69  50.00     0.0  FTV
587245  2015-06-26  59.24  59.79  58.69  61.90     0.0  WRK


In [10]:
# Verify the drop was successful
print(df_1yr['Volume'].isnull().sum())
print(df_5yr['Volume'].isnull().sum())

0
0


In [11]:
zero_volumes_1yr = df_1yr[df_1yr['Volume'] == 0]
zero_volumes_5yr = df_5yr[df_5yr['Volume'] == 0]
print(zero_volumes_1yr)
print(zero_volumes_5yr)

Empty DataFrame
Columns: [Date, Open, High, Low, Close, Volume, Name]
Index: []
Empty DataFrame
Columns: [Date, Open, High, Low, Close, Volume, Name]
Index: []


In [12]:
# Convert 'Date' column to datetime format
df_1yr['Date'] = pd.to_datetime(df_1yr['Date'], format='%Y-%m-%d', errors='coerce')
df_5yr['Date'] = pd.to_datetime(df_5yr['Date'], format='%Y-%m-%d', errors='coerce')

# Check the types after conversion
print(df_1yr['Date'].dtype)
print(df_5yr['Date'].dtype)

datetime64[ns]
datetime64[ns]


In [13]:

# If needed, convert back to string format 'YYYY-MM-DD'
df_1yr['Date'] = df_1yr['Date'].dt.strftime('%Y-%m-%d')
df_5yr['Date'] = df_5yr['Date'].dt.strftime('%Y-%m-%d')

# Verify the conversion
print(df_1yr['Date'].head())
print(df_5yr['Date'].head())

0    2016-08-12
1    2016-08-15
2    2016-08-16
3    2016-08-17
4    2016-08-18
Name: Date, dtype: object
0    2012-08-13
1    2012-08-14
2    2012-08-15
3    2012-08-16
4    2012-08-17
Name: Date, dtype: object


In [46]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

In [57]:
# Feature engineering for 1-year dataset (using 'Open', 'High', 'Low', 'Close', 'Volume' for prediction)
features_1yr = ['Open', 'High', 'Low', 'Close', 'Volume']
scaler_1yr = MinMaxScaler(feature_range=(0, 1))
df_1yr[features_1yr] = scaler_1yr.fit_transform(df_1yr[features_1yr])

# Feature engineering for 5-year dataset (using 'Open', 'High', 'Low', 'Close', 'Volume' for prediction)
features_5yr = ['Open', 'High', 'Low', 'Close', 'Volume']
scaler_5yr = MinMaxScaler(feature_range=(0, 1))
df_5yr[features_5yr] = scaler_5yr.fit_transform(df_5yr[features_5yr])



In [58]:
# Creating the 'Movement' target variable (1 for price increase, 0 for price decrease)
df_1yr['Movement'] = (df_1yr['Close'].shift(-1) > df_1yr['Close']).astype(int)
df_5yr['Movement'] = (df_5yr['Close'].shift(-1) > df_5yr['Close']).astype(int)


In [59]:
# Now, instead of predicting 'Close', we predict 'Movement'
y_1yr = df_1yr['Movement'].iloc[sequence_length:].values  # Start from index 60 to match the sequence length

# Prepare input sequences for 1-year dataset
x_1yr = np.array([df_1yr.iloc[i-sequence_length:i][features_1yr].values for i in range(sequence_length, len(df_1yr))])


In [60]:
# Now, instead of predicting 'Close', we predict 'Movement'
y_1yr = df_1yr['Movement'].iloc[sequence_length:].values  # Start from index 60 to match the sequence length

# Prepare input sequences for 1-year dataset
x_1yr = np.array([df_1yr.iloc[i-sequence_length:i][features_1yr].values for i in range(sequence_length, len(df_1yr))])


In [61]:
# Now, instead of predicting 'Close', we predict 'Movement'
y_5yr = df_5yr['Movement'].iloc[sequence_length:].values  # Same for 5-year data

# Prepare input sequences for 5-year dataset
x_5yr = np.array([df_5yr.iloc[i-sequence_length:i][features_5yr].values for i in range(sequence_length, len(df_5yr))])


In [62]:
# Split the 1-year data into training and testing sets (80% training, 20% testing)
x_train_1yr, x_test_1yr, y_train_1yr, y_test_1yr = train_test_split(x_1yr, y_1yr, test_size=0.2, random_state=42)


In [63]:
# Split the 5-year data into training and testing sets (80% training, 20% testing)
x_train_5yr, x_test_5yr, y_train_5yr, y_test_5yr = train_test_split(x_5yr, y_5yr, test_size=0.2, random_state=42)


In [65]:
from tensorflow.keras.layers import LSTM

# Define the model for 1-year dataset
model_1yr = Sequential()

# Add LSTM layers
model_1yr.add(LSTM(units=50, return_sequences=True, input_shape=(x_train_1yr.shape[1], x_train_1yr.shape[2])))
model_1yr.add(Dropout(0.2))

model_1yr.add(LSTM(units=50, return_sequences=False))
model_1yr.add(Dropout(0.2))

# Add a dense layer for the output (prediction)
model_1yr.add(Dense(units=1))  # Single value output for regression

# Compile the model
model_1yr.compile(optimizer='adam', loss='mean_squared_error')

# Summary of the model
model_1yr.summary()

# Now let's train the model
history_1yr = model_1yr.fit(x_train_1yr, y_train_1yr, epochs=20, batch_size=32, validation_data=(x_test_1yr, y_test_1yr))


  super().__init__(**kwargs)


Epoch 1/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 25ms/step - loss: 0.2587 - val_loss: 0.2501
Epoch 2/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 26ms/step - loss: 0.2512 - val_loss: 0.2512
Epoch 3/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 26ms/step - loss: 0.2506 - val_loss: 0.2504
Epoch 4/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 26ms/step - loss: 0.2503 - val_loss: 0.2498
Epoch 5/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 26ms/step - loss: 0.2498 - val_loss: 0.2498
Epoch 6/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 26ms/step - loss: 0.2499 - val_loss: 0.2498
Epoch 7/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 26ms/step - loss: 0.2498 - val_loss: 0.2499
Epoch 8/20
[1m3154/3154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 26ms/step - loss: 0.2499 - val_loss: 0.2499
Epoch 9/

In [66]:
# Define the model for 5-year dataset
model_5yr = Sequential()

# Add LSTM layers
model_5yr.add(LSTM(units=50, return_sequences=True, input_shape=(x_train_5yr.shape[1], x_train_5yr.shape[2])))
model_5yr.add(Dropout(0.2))

model_5yr.add(LSTM(units=50, return_sequences=False))
model_5yr.add(Dropout(0.2))

# Add a dense layer for the output (prediction)
model_5yr.add(Dense(units=1))  # Single value output for regression

# Compile the model
model_5yr.compile(optimizer='adam', loss='mean_squared_error')

# Summary of the model
model_5yr.summary()

# Now let's train the model
history_5yr = model_5yr.fit(x_train_5yr, y_train_5yr, epochs=20, batch_size=32, validation_data=(x_test_5yr, y_test_5yr))


  super().__init__(**kwargs)


Epoch 1/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 25ms/step - loss: 0.2527 - val_loss: 0.2497
Epoch 2/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 25ms/step - loss: 0.2498 - val_loss: 0.2497
Epoch 3/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1572s[0m 104ms/step - loss: 0.2498 - val_loss: 0.2497
Epoch 4/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28543s[0m 2s/step - loss: 0.2496 - val_loss: 0.2497
Epoch 5/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59771s[0m 4s/step - loss: 0.2497 - val_loss: 0.2497
Epoch 6/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40998s[0m 3s/step - loss: 0.2497 - val_loss: 0.2497
Epoch 7/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49900s[0m 3s/step - loss: 0.2496 - val_loss: 0.2497
Epoch 8/20
[1m15169/15169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45067s[0m 3s/step - loss: 0.2496 -

In [67]:
# Evaluate the 1-year model
mse_1yr = model_1yr.evaluate(x_test_1yr, y_test_1yr)
print("Mean Squared Error for 1-year dataset:", mse_1yr)

# Evaluate the 5-year model
mse_5yr = model_5yr.evaluate(x_test_5yr, y_test_5yr)
print("Mean Squared Error for 5-year dataset:", mse_5yr)


[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.2499
Mean Squared Error for 1-year dataset: 0.2498241662979126
[1m3793/3793[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - loss: 0.2497
Mean Squared Error for 5-year dataset: 0.24968603253364563


In [76]:
import numpy as np

# Step 1: Generate random data for prediction (5 features: Open, High, Low, Close, Volume)
# Make sure the range of the random data matches the expected feature ranges
random_data_1yr = np.random.rand(1, 5)  # 1 sample, 5 features (Open, High, Low, Close, Volume)
random_data_5yr = np.random.rand(1, 5)  # Similar for 5-year dataset

# Step 2: Scale the random data using the same scaler used for training
scaled_random_data_1yr = scaler_1yr.transform(random_data_1yr)
scaled_random_data_5yr = scaler_5yr.transform(random_data_5yr)

# Step 3: Reshape the data to match the expected input shape for the model (3D shape for LSTM)
scaled_random_data_1yr = scaled_random_data_1yr.reshape(1, 1, 5)  # 1 sample, 1 timestep, 5 features
scaled_random_data_5yr = scaled_random_data_5yr.reshape(1, 1, 5)

# Step 4: Make predictions with the models
prediction_1yr = model_1yr.predict(scaled_random_data_1yr)
prediction_5yr = model_5yr.predict(scaled_random_data_5yr)

# Step 5: Output the predictions
print("1-Year Dataset Prediction:", prediction_1yr)
print("5-Year Dataset Prediction:", prediction_5yr)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
1-Year Dataset Prediction: [[0.4966564]]
5-Year Dataset Prediction: [[0.5240886]]
