In [None]:
!pip3 install yfinance numpy pandas matplotlib scikit-learn statsmodels tensorflow

In [14]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller



In [None]:
# List of stock tickers used for this project
tickers = ["AAPL", "MSFT", "JNJ", "PFE", "JPM", "BAC"]

# Time period for data collection
start_date = "2010-01-01"
end_date = "2023-12-31"

# Closing-price series keyed by ticker
stock_data = {}

# Acquire close prices for each of the stocks
for ticker in tickers:
    print(f"Downloading data for {ticker}...")
    data = yf.download(ticker, start=start_date, end=end_date)['Close']

    # Some yfinance versions hand back a one-column DataFrame here instead of
    # a Series; collapse it so every entry of `stock_data` has the same shape
    # and the concat below yields exactly one flat column per ticker.
    if isinstance(data, pd.DataFrame):
        data = data.squeeze("columns")

    # Fail loudly on an empty download instead of carrying an all-NaN column
    # into every model further down the notebook.
    if data.empty:
        raise ValueError(f"No closing prices were downloaded for {ticker}")

    stock_data[ticker] = data
    data.to_csv(f"{ticker}_closing_data.csv")

# Combine the per-ticker series into one DataFrame (one column per ticker)
close_prices_df = pd.concat(stock_data, axis=1)

# Make sure the columns are labelled with the stock tickers
close_prices_df.columns = tickers
# Save combined data to a CSV
close_prices_df.to_csv("combined_closing_prices.csv")
# Display combined data
print(close_prices_df.head())


In [None]:
# Inspecting the Data

# `data` is another name for the combined closing-price DataFrame
data = close_prices_df

# Convert the index to datetimes so the rows can be used chronologically
data.index = pd.to_datetime(data.index)

# Draw every stock's closing prices on one set of axes and label it
ax = data.plot(figsize=(12, 6))
ax.set_title("Stock Closing Prices")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend(title="Stocks")
plt.show()


In [17]:
# Normalizing the data (for LSTM)

# Rescale every column with a MinMaxScaler configured for the (0, 1) range
scaler = MinMaxScaler(feature_range=(0, 1))
normalized_data = scaler.fit(data).transform(data)

# Wrap the scaled array in a DataFrame carrying the original index and columns
normalized_df = pd.DataFrame(
    data=normalized_data,
    columns=data.columns,
    index=data.index,
)


In [None]:
# LINEAR REGRESSION

# Per-stock container for the train/test splits produced in this cell
train_test_data = dict()


def create_sequences(data, window_size=30):
    """Cut a sequence into sliding windows paired with the value that follows each.

    Args:
        data: Indexable, sliceable sequence of observations (e.g. a 1-D array).
        window_size: Number of consecutive observations in every window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X[k]`` is ``data[k:k + window_size]``
        and ``y[k]`` is ``data[k + window_size]``. Both are empty when ``data``
        has no more than ``window_size`` elements.
    """
    n_samples = len(data) - window_size
    windows = [data[start:start + window_size] for start in range(n_samples)]
    targets = [data[start + window_size] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

# Loop through each stock in the DataFrame and prepare data for it
window_size = 30
for stock in close_prices_df.columns:
    print(f"Preparing data for {stock}...")

    # Get the stock's closing price data; missing values are dropped first
    # because LinearRegression cannot be fitted on NaN inputs
    stock_data = close_prices_df[stock].dropna().values

    # Create sequences for the current stock
    X, y = create_sequences(stock_data, window_size=window_size)

    # Chronological split (first 80% train, last 20% test). The windows overlap
    # heavily, so a shuffled split would put near-duplicates of every test
    # window (and prices from the future) into the training set and make the
    # reported error look far better than a real forecast would be.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Store the train-test data for current stock
    train_test_data[stock] = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }

    print(f"Data prepared for {stock}: Train size: {len(y_train)}, Test size: {len(y_test)}")

    # Train a Linear Regression model for current stock
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate errors
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"{stock} - Mean Squared Error (MSE): {mse}")
    print(f"{stock} - Root Mean Squared Error (RMSE): {rmse}")

    # Plot actual vs predicted prices; the x axis is the position of each
    # sample inside the (time-ordered) test set
    plt.figure(figsize=(10, 6))
    plt.scatter(range(len(y_test)), y_test, label='Actual Prices', color='blue', s=10)
    plt.scatter(range(len(y_pred)), y_pred, label='Predicted Prices', color='red', s=10, marker='x')
    plt.xlabel("Time(index of each test sample)")
    plt.ylabel("Price")
    plt.legend()
    plt.title(f"Linear Regression: Actual vs Predicted Prices for {stock}")
    plt.show()


In [None]:
# Checking if our data is stationary before differencing

# Run the Augmented Dickey-Fuller test on one series and print the outcome
def check_stationarity(series, stock_name):
    """Print the ADF statistic, the p-value and a stationarity verdict.

    Args:
        series: Values passed unchanged to ``adfuller``.
        stock_name: Label printed in the header line of the report.

    The series is reported as likely non-stationary when the p-value is
    greater than 0.05, and as stationary otherwise.
    """
    adf_statistic, p_value = adfuller(series)[:2]

    print(f"--- {stock_name} ---")
    print(f"ADF Statistic: {adf_statistic}")
    print(f"p-value: {p_value}")

    verdict = (
        "The series is likely non-stationary."
        if p_value > 0.05
        else "The series is stationary."
    )
    print(verdict)

# Function to plot ACF and PACF
def plot_acf_pacf(series, stock_name, lags=50):
    """Show the ACF and the PACF of ``series`` as two separate figures.

    Args:
        series: Values handed to ``plot_acf`` / ``plot_pacf``.
        stock_name: Label used in both figure titles.
        lags: Number of lags to draw (defaults to 50, the previous fixed value).
    """
    for plot_func, label in ((plot_acf, "ACF"), (plot_pacf, "PACF")):
        # Hand statsmodels an explicit Axes. Without `ax` it opens a figure of
        # its own, so a preceding plt.figure(figsize=...) is shown as a blank
        # figure and the requested size never reaches the real plot.
        _, ax = plt.subplots(figsize=(12, 6))
        plot_func(series, lags=lags, ax=ax)
        ax.set_title(f"{label} Plot for {stock_name}")
        plt.show()

# Run the stationarity report and the correlograms for every stock column
for stock, prices in data.items():
    print(f"Processing {stock}...")

    # Missing prices are removed before the analysis
    stock_series = prices.dropna()

    # ADF test first, then the ACF/PACF figures
    check_stationarity(stock_series, stock)
    plot_acf_pacf(stock_series, stock)



In [None]:
# Checking if all our data is stationary after differencing, which it should be
for stock in data.columns:

    # Get the stock's data
    stock_data = data[stock].dropna()

    # Apply first-order differencing to make the series stationary
    stock_data_diff = stock_data.diff().dropna()

    # Check stationarity again after differencing
    check_stationarity(stock_data_diff, stock)

    # Create ACF and PACF plots for the differenced data. Each plot gets an
    # explicit Axes: without `ax`, statsmodels opens its own figure, so a
    # preceding plt.figure(figsize=...) would be displayed as a blank figure.
    for plot_func, label in ((plot_acf, "ACF"), (plot_pacf, "PACF")):
        _, ax = plt.subplots(figsize=(12, 6))
        plot_func(stock_data_diff, lags=50, ax=ax)
        ax.set_title(f"{label} Plot for Differenced {stock}")
        plt.show()

In [None]:
# ARIMA

# Define a dictionary to store train-test data for each stock
train_test_data = {}

# Loop through each stock in the DataFrame
for stock in data.columns:
    print(f"Preparing ARIMA model for {stock}...")

    # Get the stock's closing price data; missing values are dropped so the
    # error metrics below never receive NaN
    stock_data = data[stock].dropna()

    # Plot the stock price for visualization purposes
    stock_data.plot(figsize=(20, 8))
    plt.title(f"Stock Price for {stock}")
    plt.ylabel("Price")
    plt.show()

    # Chronological split: first 80% for training, remaining 20% for testing
    train_size = int(len(stock_data) * 0.8)
    train, test = stock_data.iloc[:train_size], stock_data.iloc[train_size:]

    # Keep the split so it can be inspected after the loop
    train_test_data[stock] = {"train": train, "test": test}

    # Fit ARIMA model and set the p, d, and q values. d=1 lets the model
    # difference the prices itself: the stationarity checks earlier in this
    # notebook rely on first differencing to obtain a stationary series.
    model = ARIMA(train, order=(2, 1, 1))
    model_fit = model.fit()

    # Forecast one value per test observation and label the forecasts with the
    # test dates, so the metrics and the plot line up explicitly by position
    forecast = model_fit.forecast(steps=len(test))
    predictions = pd.Series(np.asarray(forecast), index=test.index, name="predicted_mean")
    print(predictions)

    # Evaluate the model using errors
    mse = mean_squared_error(test, predictions)
    rmse = np.sqrt(mse)
    print(f"{stock} - Mean Squared Error (MSE): {mse}")
    print(f"{stock} - Root Mean Squared Error (RMSE): {rmse}")

    # Plot actual vs predicted prices
    plt.figure(figsize=(20, 8))
    plt.plot(test.index, test, label='Test Actual Prices', color='blue')
    plt.plot(test.index, predictions, label='Test Predicted Prices', color='red', linestyle='--')
    plt.plot(train.index, train, label='Training Actual Prices', color='purple')

    plt.title(f"ARIMA: Actual vs Predicted Prices for {stock}")
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.legend()
    plt.show()


In [None]:
PrevDays = 60  # Number of previous days used for prediction


# Defined once, outside the loop, instead of being re-created for every stock.
# NOTE: this intentionally keeps the name `create_sequences`, so it replaces
# the windowing helper of the linear-regression cell once this cell has run.
def create_sequences(data, lookback):
    """Build LSTM samples from a 2-D array whose values sit in column 0.

    Args:
        data: 2-D array; only column 0 is read.
        lookback: Number of preceding rows that make up one input sample.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays where ``X[k]`` holds the ``lookback``
        values before row ``lookback + k`` and ``y[k]`` is the value of that row.
    """
    X, y = [], []
    for i in range(lookback, len(data)):
        X.append(data[i - lookback:i, 0])
        y.append(data[i, 0])
    return np.array(X), np.array(y)


# Loop through each stock in the DataFrame
for stock in data.columns:
    print(f"Processing LSTM for {stock}...")

    # Get the stock's closing price data as a single-column 2-D array
    stock_data = data[stock].dropna().values.reshape(-1, 1)

    # Work out the split before scaling: one sample exists for every row after
    # the first `PrevDays` rows, and the first 80% of the samples are training
    n_sequences = len(stock_data) - PrevDays
    train_size = int(n_sequences * 0.8)
    if train_size < 1:
        print(f"Skipping {stock}: not enough data for a {PrevDays}-day lookback.")
        continue

    # Fit the scaler only on the rows the training samples can see (inputs and
    # targets of the first `train_size` samples). Fitting it on the whole
    # series would leak the test period's min/max into training.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(stock_data[:PrevDays + train_size])
    stock_data_scaled = scaler.transform(stock_data)

    # Create sequences
    X, y = create_sequences(stock_data_scaled, PrevDays)

    # Reshape X to (samples, timesteps, 1) for the LSTM
    X = X.reshape(X.shape[0], X.shape[1], 1)

    # Chronological train/test split
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(units=100, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=100, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=1))

    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model; the test set is passed as validation data, so the
    # per-epoch validation loss shown is the loss on the test samples
    history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=1)

    # Make predictions and map predictions and targets back to price units
    predictions_scaled = model.predict(X_test)
    predictions = scaler.inverse_transform(predictions_scaled)
    y_test = scaler.inverse_transform(y_test.reshape(-1, 1))

    # Evaluate the model using errors
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    print(f"{stock} - Mean Squared Error (MSE): {mse}")
    print(f"{stock} - Root Mean Squared Error (RMSE): {rmse}")

    # Plot actual vs predicted prices; predictions cover the tail of the series
    plt.figure(figsize=(12, 6))
    plt.plot(range(len(stock_data)), stock_data, label='Actual Prices', color='blue')
    plt.plot(range(len(stock_data) - len(predictions), len(stock_data)), predictions, label='Predicted Prices', color='red')
    plt.title(f"LSTM: Actual vs Predicted Prices for {stock}")
    plt.xlabel("Time")
    plt.ylabel("Price")
    plt.legend()
    plt.show()

    print(f"Completed LSTM model for {stock}.\n")

