# <center> **STRUCTURING AND IMPORTING THE NECESSARY LIBRARIES**  </center>

In [1]:
#IMPORT ALL THE NECESSARY LIBRARIES 

# Data Processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Data Visualization
import matplotlib.pyplot as plt
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.transforms import Affine2D
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf , plot_pacf
import mplcursors  # Import mplcursors for interactive plotting
import itertools


#We use the glob () function since we are working from the same directory 
import os
import glob 

# Statistical Computation
from statsmodels.tsa.stattools import adfuller

# Prophet model for Time Series
from prophet import Prophet

# Random Forest and Linear Regression for Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.callbacks import Callback

# Bi-LSTM For Machine Learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout
import contextlib
import sys

# ARIMA For Time Series
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA

# Training and Testing 
from sklearn.model_selection import train_test_split

# Metrics Computation
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

#Importing Warnings and Hiding them
import warnings                
warnings.filterwarnings('ignore')  

ImportError: cannot import name 'formatargspec' from 'inspect' (/Users/CHARLES/anaconda3/lib/python3.11/inspect.py)

# <center>**IMPORTING AND REVIEWING THE DATASET**</center>
#### <u>CLOSING PRICE FOR HSBC, BAC, BARC AND JPM</u>

In [None]:
# Read the wide price table and coerce 'Dates' to datetime in one chained expression
df_stock = (
    pd.read_csv('stock-yy-mm-dd.csv', parse_dates=['Dates'])
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates'], format='%Y-%m-%d'))
)

# Summarise the columns, dtypes and non-null counts
df_stock.info()

# Show up to the first 1044 rows as the cell's output
df_stock.head(1044)

#### <u>SCALING THE DATA</u>

In [None]:
# Only float64/int64 columns are scaled; every other column is left untouched
numeric_cols = df_stock.select_dtypes(include=['float64', 'int64']).columns

# Fit the scaler (default feature range) on the numeric columns and transform them
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_stock[numeric_cols])

# Store the scaled values in a copy so df_stock keeps the unscaled prices
df_stock_scaled = df_stock.copy()
df_stock_scaled[numeric_cols] = scaled_values


# <center>**DATA EXPLORATORY ANALYSIS**</center>

#### <u>PLOTTING THE TREND FOR THE DATASET TO IDENTIFY PATTERNS</u>

In [None]:
# One fixed colour per equity; the dict order also fixes the order of the plotted lines
color_map = {
    'HSBA LN Equity': 'blue',
    'BARC LN Equity': 'green',
    'JPM UN Equity': 'red',
    'BAC UN Equity': 'orange',
}

# Grid settings shared by both axes
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')

fig = px.line(
    df_stock,
    x='Dates',
    y=list(color_map),
    title='Stock Price Prediction',
    color_discrete_map=color_map,
)

# Vertical tick labels, no range slider, thin lines
fig.update_xaxes(rangeslider_visible=False, tickangle=90)
fig.update_traces(line=dict(width=1))

# Legend, centred title, fixed size/margins, white background, grid and zoom in one layout update
fig.update_layout(
    showlegend=True,
    title=dict(x=0.5, y=0.9, xanchor='center', yanchor='top'),
    autosize=False,
    width=1000,
    height=600,
    margin=dict(l=50, r=50, t=100, b=50),
    plot_bgcolor='white',
    xaxis=grid_style,
    yaxis=grid_style,
    dragmode='zoom',
)

fig.show()


#### <u>DECOMPOSING OUR DATASET USING TIME SERIES ANALYSIS</u>

In [None]:
# Colour used for each equity's decomposed components; the key order drives every loop below
color_map = {'HSBA LN Equity': 'blue',
             'BARC LN Equity': 'green',
             'JPM UN Equity': 'red',
             'BAC UN Equity': 'orange'}
price_columns = list(color_map)

# Additive decomposition of every series (period=12 is an assumed seasonal length)
decomposition_results = {
    column: seasonal_decompose(df_stock[column], model='additive', period=12)
    for column in price_columns
}

# Extract the three components for each time series
trend = {column: result.trend for column, result in decomposition_results.items()}
seasonal = {column: result.seasonal for column, result in decomposition_results.items()}
residual = {column: result.resid for column, result in decomposition_results.items()}

# Plot the original time series
fig = px.line(df_stock, x='Dates', y=price_columns,
              title='Stock Price Prediction Decomposition')

fig.update_xaxes(rangeslider_visible=False, tickangle=90)  # Rotate x-axis ticks vertically

# Original prices: thin black lines. Each trace keeps its equity in the legend name;
# a single shared name ('Original') would make the four lines indistinguishable.
fig.for_each_trace(
    lambda trace: trace.update(line=dict(width=1, color='black'),
                               name=f'Original - {trace.name}')
)

# Legend, centred title, fixed size/margins, white background, grid and zoom
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_layout(
    showlegend=True,
    title=dict(x=0.5, y=0.9, xanchor='center', yanchor='top'),
    autosize=False,
    width=1000,
    height=600,
    margin=dict(l=50, r=50, t=100, b=50),
    plot_bgcolor='white',
    xaxis=grid_style,
    yaxis=grid_style,
    dragmode='zoom',
)

# Components share their equity's colour, so the dash pattern tells them apart
component_styles = (
    ('Trend', trend, 'solid'),
    ('Seasonal', seasonal, 'dash'),
    ('Residual', residual, 'dot'),
)
for column in price_columns:
    for label, component, dash in component_styles:
        fig.add_trace(go.Scatter(x=df_stock['Dates'], y=component[column], mode='lines',
                                 name=f'{label} - {column}',
                                 line=dict(color=color_map[column], dash=dash)))

fig.show()

#### <u>SEPARATING THE DECOMPOSITION FOR EACH BANK [HSBA, BARC, JPM AND BAC]</u>

In [None]:
# Line colour for each equity; the dict order is the order the figures are produced in
color_map = {
    'HSBA LN Equity': 'blue',
    'BARC LN Equity': 'green',
    'JPM UN Equity': 'red',
    'BAC UN Equity': 'orange',
}

# Shared x values and axis grid styling for every figure below
dates = df_stock['Dates']
grid_axis = dict(showgrid=True, gridwidth=1, gridcolor='black')

# Four separate figures per equity: original series, trend, seasonal and residual
for column, line_color in color_map.items():
    line_style = dict(color=line_color)

    # Original time series
    fig_original = go.Figure()
    fig_original.add_trace(go.Scatter(x=dates, y=df_stock[column], mode='lines',
                                      name=f'Original - {column}', line=line_style))
    fig_original.update_layout(title=f'Original Time Series of {column}',
                               xaxis_title='Dates', yaxis_title='Last Price',
                               plot_bgcolor='white', xaxis=grid_axis, yaxis=grid_axis)
    fig_original.show()

    # Trend component
    fig_trend = go.Figure()
    fig_trend.add_trace(go.Scatter(x=dates, y=trend[column], mode='lines',
                                   name=f'Trend - {column}', line=line_style))
    fig_trend.update_layout(title=f'Trend Component of {column}',
                            xaxis_title='Dates', yaxis_title='Last Price',
                            plot_bgcolor='white', xaxis=grid_axis, yaxis=grid_axis)
    fig_trend.show()

    # Seasonal component
    fig_seasonal = go.Figure()
    fig_seasonal.add_trace(go.Scatter(x=dates, y=seasonal[column], mode='lines',
                                      name=f'Seasonal - {column}', line=line_style))
    fig_seasonal.update_layout(title=f'Seasonal Component of {column}',
                               xaxis_title='Dates', yaxis_title='Last Price',
                               plot_bgcolor='white', xaxis=grid_axis, yaxis=grid_axis)
    fig_seasonal.show()

    # Residual component
    fig_residual = go.Figure()
    fig_residual.add_trace(go.Scatter(x=dates, y=residual[column], mode='lines',
                                      name=f'Residual - {column}', line=line_style))
    fig_residual.update_layout(title=f'Residual Component of {column}',
                               xaxis_title='Dates', yaxis_title='Last Price',
                               plot_bgcolor='white', xaxis=grid_axis, yaxis=grid_axis)
    fig_residual.show()

#### <u>CHECKING FOR OUTLIERS IN THE DATASET</u> 

In [None]:
# Set up the figure size
plt.figure(figsize=(12, 8))

# One vertical box per column after the first (the first column is skipped by position).
# showfliers=True draws the points beyond the whiskers: this cell is the outlier check,
# so hiding them would remove exactly what it is meant to show.
ax = sns.boxplot(data=df_stock.iloc[:, 1:], orient="v", palette="Set2", showfliers=True)

# Rotate the x-axis labels
ax.tick_params(axis='x', rotation=45)

plt.title('Boxplot of Closing Prices for Each Stock')
plt.xlabel('Stock')
plt.ylabel('Closing Price')

plt.show()


#### <u>CHECKING FOR CORRELATION BETWEEN THE STOCKS [HSBA, BAC, BARC AND JPM]</u>

In [None]:
# Pairwise correlation of every column after the first
price_values = df_stock.iloc[:, 1:]
corr = price_values.corr()

# Annotated heatmap of the correlation matrix (two-decimal labels)
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=corr,
    annot=True,
    fmt=".2f",
    cmap='viridis',
    annot_kws={"size": 10},
)
plt.title('Correlation Matrix of Closing Prices')
plt.show()


#### <u>ANALYZING AUTOCORRELATION FOR HSBA, BARC, JPM AND BAC</u>

In [None]:
# Columns whose autocorrelation is inspected
acf_columns = ['HSBA LN Equity', 'BARC LN Equity', 'JPM UN Equity', 'BAC UN Equity']
time_series = df_stock[acf_columns]

# One stacked subplot per column
fig, axes = plt.subplots(nrows=len(time_series.columns), figsize=(10, 6))

# Pair each axis with its column and draw the ACF for the first 20 lags
for axis, column in zip(axes, time_series.columns):
    plot_acf(time_series[column], lags=20, ax=axis)
    axis.set_title(f'Autocorrelation Function for {column}')
    axis.set_xlabel('Lag')
    axis.set_ylabel('Autocorrelation')

plt.tight_layout()
plt.show()

#### <u> CONDUCTING DICKEY-FULLER TEST ON STOCK TIME SERIES FOR STATIONARITY</u>

In [None]:
# Run the augmented Dickey-Fuller test on every price column and print its key outputs
adf_columns = ['HSBA LN Equity', 'BARC LN Equity', 'JPM UN Equity', 'BAC UN Equity']

for column in adf_columns:
    time_series = df_stock[column]
    result = adfuller(time_series)

    # The code reads element 0 as the test statistic, 1 as the p-value and 4 as the critical-value dict
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    print(f'ADF Statistic for {column}:', adf_statistic)
    print(f'p-value for {column}:', p_value)

    print(f'Critical Values for {column}:')
    for key in critical_values:
        print(f'{key}: {critical_values[key]}')

    # Blank line between equities
    print()

# <center>**STOCK PRICE PREDICTION WITH PROPHET**</center>
### <center><u>**DATA PRE-PROCESSING**</u></center>
#### <u>TRANSFORM THE DATASET INTO A MULTIPLE TIME SERIES MODEL DATASET</u>

In [None]:
tickers_to_check = ['HSBA LN Equity', 'BARC LN Equity', 'JPM UN Equity', 'BAC UN Equity']

# Reshape only while df_stock is still wide (it still has a 'Dates' column), so re-running
# this cell is a no-op instead of a KeyError. 'Dates' is read as a regular column, so no
# reset_index() is needed. Note that this cell overwrites df_stock with the long-format frame.
if 'Dates' in df_stock.columns:
    df_stock = (
        pd.melt(df_stock, id_vars='Dates', value_vars=tickers_to_check,
                var_name='ticker', value_name='y')
        .rename(columns={'Dates': 'ds'})  # resulting columns: ds, ticker, y
    )

    # Round the 'y' values to two decimal places
    df_stock['y'] = df_stock['y'].round(2)

# Check that every expected ticker is present in the long-format frame
present_tickers = set(df_stock['ticker'].unique())
for ticker in tickers_to_check:
    if ticker in present_tickers:
        print(f"{ticker} is present in the DataFrame.")
    else:
        print(f"{ticker} is not present in the DataFrame.")

# Print the first few rows of the DataFrame, then its structure
print(df_stock.head())
df_stock.info()

#### GROUPING THE DATA BY TICKER FOR BETTER ANALYSIS AND VISUALIZATION

In [None]:
# One group of rows per ticker
groups_by_ticker = df_stock.groupby('ticker')

# Look up the group labels (the value is not displayed because this is not the cell's last line)
groups_by_ticker.groups.keys()

# Map every ticker to the (rows, columns) shape of its group
group_shapes = dict(
    (ticker_name, ticker_frame.shape) for ticker_name, ticker_frame in groups_by_ticker
)

# Report the shape of each group
for ticker in group_shapes:
    shape = group_shapes[ticker]
    print(f"Shape of data for {ticker}: {shape}")

# <center> **MODELING WITH PROPHET**</center>
#### <u>SPLITTING THE DATA</u>

In [None]:
# Running totals of rows assigned to the training and testing sets
total_train_rows = 0
total_test_rows = 0

# Iterate over each ticker group
for ticker, group_data in groups_by_ticker:
    # Positional 80/20 split: the first 80% of the group's rows train, the rest test.
    # NOTE(review): this is only a chronological split if each group is ordered by 'ds' - confirm.
    split_index = int(len(group_data) * 0.8)
    group_train = group_data.iloc[:split_index]
    group_test = group_data.iloc[split_index:]

    # Add the number of rows in the training and testing sets to the totals
    total_train_rows += len(group_train)
    total_test_rows += len(group_test)

    print(f"Shape of data for {ticker}:")
    print("Training set:", group_train.shape)
    print("Testing set:", group_test.shape)

# Compare against the actual size of the long-format frame instead of a hard-coded row count,
# so the check stays meaningful when the input data changes.
expected_total_rows = len(df_stock)
if total_train_rows + total_test_rows == expected_total_rows:
    print("The sum of rows in the training and testing sets for each ticker group tallies with the total processed data shape.")
else:
    print("The sum of rows in the training and testing sets for each ticker group does not tally with the total processed data shape.")


#### <u>TRAINING AND FORECASTING</u>

In [None]:
from prophet import Prophet

def forecast_prophet(groups_by_ticker, train_fraction=0.8, horizon_days=365):
    """Fit one Prophet model per ticker group and forecast beyond its training data.

    Parameters
    ----------
    groups_by_ticker : iterable of (ticker, DataFrame)
        Pairs such as those produced by iterating a ``DataFrame.groupby``. Each frame is
        passed to ``Prophet.fit`` as-is (presumably with 'ds' and 'y' columns - verify
        against the caller).
    train_fraction : float, default 0.8
        Share of each group's leading rows used for training; the remaining rows are
        kept as the test subset.
    horizon_days : int, default 365
        Number of periods passed to ``make_future_dataframe`` after the training data.

    Returns
    -------
    all_forecasts : DataFrame
        Columns 'ds', 'ticker', 'yhat', 'yhat_upper', 'yhat_lower' for all tickers,
        concatenated. Empty (with these columns) when no groups were supplied.
    observed_data : list of (ticker, train_frame, test_frame)
    trained_models : dict mapping ticker to its fitted Prophet model
    """
    forecast_columns = ['ds', 'ticker', 'yhat', 'yhat_upper', 'yhat_lower']
    forecasts = []       # per-ticker forecast frames
    observed_data = []   # per-ticker (ticker, train, test) tuples
    trained_models = {}  # per-ticker fitted models

    for ticker, group_data in groups_by_ticker:
        # Positional split: leading rows train, trailing rows test
        split_index = int(len(group_data) * train_fraction)
        group_train = group_data[:split_index]
        group_test = group_data[split_index:]
        observed_data.append((ticker, group_train, group_test))

        # Fit on the training rows only
        model = Prophet()
        model.fit(group_train)
        trained_models[ticker] = model

        # Predict over the training dates plus the forecast horizon. assign() returns a
        # new frame, which avoids setting a column on a slice of the prediction output.
        future = model.make_future_dataframe(periods=horizon_days)
        forecast = model.predict(future).assign(ticker=ticker)
        forecasts.append(forecast[forecast_columns])

    # pd.concat raises on an empty list, so return an empty frame with the expected columns
    if not forecasts:
        return pd.DataFrame(columns=forecast_columns), observed_data, trained_models

    all_forecasts = pd.concat(forecasts)
    return all_forecasts, observed_data, trained_models

# Fit and forecast every ticker group, keeping the forecasts, the splits and the fitted models
all_forecasts, observed_data, trained_models = forecast_prophet(groups_by_ticker=groups_by_ticker)

# Heading followed by the first rows of the combined forecast frame
print("\nINFORMATION ABOUT THE DATAFRAME:", all_forecasts.head(), sep="\n")


#### <u>REVIEWING THE SHAPE OF THE TRAIN AND TEST DATA TO CHECK FOR DISCREPANCIES</u>

In [None]:
# Accumulate the row counts of the training and testing subsets over all ticker groups
total_train_size = 0
total_test_size = 0
for _ticker_name, train_part, test_part in observed_data:
    total_train_size += train_part.shape[0]
    total_test_size += test_part.shape[0]

# Print the total sizes
print("Total size of training sets:", total_train_size)
print("Total size of testing sets:", total_test_size)


# <center> **EVALUATING THE PERFORMANCE OF PROPHET**</center>
#### <u>PROPHET PERFORMANCE BASED ON TICKERS</u>

In [None]:
# Define a function to calculate MAPE and sMAPE
def mape(actual, predicted):
    """Mean absolute percentage error, in percent.

    Returns ``float('inf')`` when ``actual`` contains no non-zero value
    (which includes empty input).
    """
    truth = np.array(actual)
    estimate = np.array(predicted)

    # Guard first: with no non-zero actual value the ratio is never computed
    if not np.any(truth):
        return float('inf')

    relative_error = np.abs((truth - estimate) / truth)
    return np.mean(relative_error) * 100

def smape(actual, predicted):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``|actual - predicted| / (|actual| + |predicted|)``; the mean of the
    terms is multiplied by 200. A term whose denominator is zero (actual and predicted
    both zero, i.e. an exact match) contributes 0 instead of turning the whole score
    into NaN. Returns ``float('inf')`` for empty input.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    # No observations: keep the sentinel used for "cannot be computed"
    if actual.size == 0:
        return float('inf')

    abs_error = np.abs(actual - predicted)
    denominator = np.abs(actual) + np.abs(predicted)

    # Divide only where the denominator is non-zero; the remaining terms stay at 0
    ratio = np.divide(abs_error, denominator,
                      out=np.zeros_like(abs_error), where=denominator != 0)
    return 2 * np.mean(ratio) * 100

# Per-ticker metric records. The list is deliberately NOT called `performance_metrics`:
# that name is the function imported from prophet.diagnostics and must stay callable.
ticker_metrics = []

# all_forecasts and observed_data come from the forecast_prophet call
for ticker, group_train, group_test in observed_data:
    # Pair each test row with the forecast for the same date ('ds') of the same ticker
    forecast = all_forecasts[all_forecasts['ticker'] == ticker]
    actual_vs_predicted = pd.merge(group_test, forecast, on='ds', how='inner')

    y_true = actual_vs_predicted['y']
    y_pred = actual_vs_predicted['yhat']

    # Calculate performance metrics
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape_score = mape(y_true, y_pred)
    smape_score = smape(y_true, y_pred)

    # Store metrics in a dictionary for each ticker
    ticker_metrics.append({
        'Ticker': ticker,
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'RMSE': rmse,
        'MAPE': mape_score,
        'sMAPE': smape_score
    })

# Convert list of metrics to DataFrame for easier viewing
metrics_df = pd.DataFrame(ticker_metrics)
print(metrics_df)


# <center>**VISUALIZING OUR FORECAST**</center>

#### <u>PLOTTING THE FORECAST WITH ACTUAL AND UNCERTAINTY INTERVALS BY TICKER</u>

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Initialize the figure
fig = go.Figure()

# Distinct colours per ticker for the train, test and predicted traces and the interval band
colors = {
    'HSBA LN Equity': {'train': 'navy', 'test': 'black', 'predicted': 'brown', 'interval': 'rgba(173, 216, 230, 0.4)'},
    'BARC LN Equity': {'train': 'purple', 'test': 'red', 'predicted': 'green', 'interval': 'rgba(255, 182, 193, 0.4)'},
    'JPM UN Equity': {'train': 'orange', 'test': 'green', 'predicted': 'red', 'interval': 'rgba(255, 99, 71, 0.4)'},
    'BAC UN Equity': {'train': 'maroon', 'test': 'crimson', 'predicted': 'black', 'interval': 'rgba(240, 128, 128, 0.4)'}
}

for ticker, group_train, group_test in observed_data:
    # Pair each test row with the forecast for the same date ('ds') of the same ticker
    forecast = all_forecasts[all_forecasts['ticker'] == ticker]
    actual_vs_predicted = pd.merge(group_test, forecast, on='ds', how='inner')

    # Training data as markers
    fig.add_trace(go.Scatter(x=group_train['ds'], y=group_train['y'],
                             mode='markers', name=f'{ticker} Train',
                             marker=dict(size=5, color=colors[ticker]['train'])))

    # Test data
    fig.add_trace(go.Scatter(x=group_test['ds'], y=group_test['y'],
                             mode='lines', name=f'{ticker} Test',
                             line=dict(color=colors[ticker]['test'])))

    # Predicted values on the test dates
    fig.add_trace(go.Scatter(x=actual_vs_predicted['ds'], y=actual_vs_predicted['yhat'],
                             mode='lines', name=f'{ticker} Predicted',
                             line=dict(color=colors[ticker]['predicted'])))

    # Uncertainty band: invisible upper bound first, then the lower bound filled up to it
    interval_name = f'{ticker} Uncertainty Interval'
    fig.add_trace(go.Scatter(x=actual_vs_predicted['ds'], y=actual_vs_predicted['yhat_upper'],
                             mode='lines', name=interval_name,
                             line=dict(width=0),
                             showlegend=False))

    fig.add_trace(go.Scatter(x=actual_vs_predicted['ds'], y=actual_vs_predicted['yhat_lower'],
                             mode='lines', line=dict(width=0),
                             fill='tonexty',  # fill towards the previously added trace
                             fillcolor=colors[ticker]['interval'],
                             showlegend=True, legendgroup=interval_name, name=interval_name))

    # Dashed vertical line at the first test date of this ticker
    fig.add_vline(x=group_test['ds'].iloc[0], line_width=1, line_dash="dash", line_color=colors[ticker]['test'])

# Update layout to enhance aesthetics and functionality
fig.update_layout(
    title='Training, Test, and Predicted Data with Uncertainty Intervals for Each Ticker',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title="Legend",
    plot_bgcolor='white',
    xaxis=dict(showgrid=True, gridcolor='lightgrey'),
    yaxis=dict(showgrid=True, gridcolor='lightgrey'),
    height=600,  # Adjust height based on your preference
    width=1200,  # Adjust width based on your preference
)

# Save the Plotly figure itself. plt.savefig would write Matplotlib's current (empty) canvas,
# not this figure. Static PNG export needs Plotly's optional image engine, so fall back to a
# standalone HTML file when that export is unavailable.
figure_stem = 'TRAINING, TEST AND PREDICTION WITH UNCERTAINTY INTERVALS'
try:
    fig.write_image(f'{figure_stem}.png')
except (ValueError, ImportError, RuntimeError) as export_error:
    fig.write_html(f'{figure_stem}.html')
    print(f"PNG export unavailable ({export_error}); saved '{figure_stem}.html' instead.")

# Show the plot
fig.show()


#### <u>SEPARATING THE PREDICTIONS WITH THE ACTUALS AND UNCERTAINTY LEVEL FOR EACH TICKER</u>

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Distinct colours per ticker for the train, test and predicted traces and the interval band
colors = {
    'HSBA LN Equity': {'train': 'navy', 'test': 'black', 'predicted': 'brown', 'interval': 'rgba(173, 216, 230, 0.4)'},
    'BARC LN Equity': {'train': 'purple', 'test': 'red', 'predicted': 'green', 'interval': 'rgba(255, 182, 193, 0.4)'},
    'JPM UN Equity': {'train': 'orange', 'test': 'green', 'predicted': 'red', 'interval': 'rgba(255, 99, 71, 0.4)'},
    'BAC UN Equity': {'train': 'maroon', 'test': 'crimson', 'predicted': 'black', 'interval': 'rgba(240, 128, 128, 0.4)'}
}

for ticker, group_train, group_test in observed_data:
    # A fresh figure for each ticker
    fig = go.Figure()

    # Pair each test row with the forecast for the same date ('ds') of the same ticker
    forecast = all_forecasts[all_forecasts['ticker'] == ticker]
    actual_vs_predicted = pd.merge(group_test, forecast, on='ds', how='inner')

    # Training data as markers
    fig.add_trace(go.Scatter(x=group_train['ds'], y=group_train['y'],
                             mode='markers', name='Train',
                             marker=dict(size=5, color=colors[ticker]['train'])))

    # Test data
    fig.add_trace(go.Scatter(x=group_test['ds'], y=group_test['y'],
                             mode='lines', name='Actual',
                             line=dict(color=colors[ticker]['test'])))

    # Predicted values on the test dates
    fig.add_trace(go.Scatter(x=actual_vs_predicted['ds'], y=actual_vs_predicted['yhat'],
                             mode='lines', name='Predicted',
                             line=dict(color=colors[ticker]['predicted'])))

    # Uncertainty band: invisible upper bound first, then the lower bound filled up to it
    fig.add_trace(go.Scatter(x=actual_vs_predicted['ds'], y=actual_vs_predicted['yhat_upper'],
                             mode='lines', line=dict(width=0),
                             showlegend=False))

    fig.add_trace(go.Scatter(x=actual_vs_predicted['ds'], y=actual_vs_predicted['yhat_lower'],
                             mode='lines', line=dict(width=0),
                             fill='tonexty',  # fill towards the previously added trace
                             fillcolor=colors[ticker]['interval'],
                             showlegend=True, name='Uncertainty Interval'))

    # Dashed vertical line at the first test date of this ticker
    fig.add_vline(x=group_test['ds'].iloc[0], line_width=1, line_dash="dash", line_color=colors[ticker]['test'])

    # Update layout to enhance aesthetics and functionality
    fig.update_layout(
        title=f'Training, Test, and Predicted Data with Uncertainty for {ticker}',
        xaxis_title='Date',
        yaxis_title='Value',
        legend_title="Legend",
        plot_bgcolor='white',
        xaxis=dict(showgrid=True, gridcolor='lightgrey'),
        yaxis=dict(showgrid=True, gridcolor='lightgrey'),
        height=600,  # Adjust height based on your preference
        width=1200,  # Adjust width based on your preference
    )

    # Save the Plotly figure itself, one file per ticker. plt.savefig would write Matplotlib's
    # empty canvas, and a single file name would be overwritten on every iteration.
    # Static PNG export needs Plotly's optional image engine, so fall back to HTML without it.
    figure_stem = f'TRAINING, TEST AND PREDICTION WITH UNCERTAINTY INTERVALS FOR {ticker}'
    try:
        fig.write_image(f'{figure_stem}.png')
    except (ValueError, ImportError, RuntimeError) as export_error:
        fig.write_html(f'{figure_stem}.html')
        print(f"PNG export unavailable ({export_error}); saved '{figure_stem}.html' instead.")

    # Show the plot
    fig.show()


#### <u>PLOTTING THE PROPHET COMPONENT FOR TREND, SEASONALITY AND HOLIDAY USING THE FORECAST</u>

In [None]:
# Get unique ticker symbols
tickers = all_forecasts['ticker'].unique()

# Plot the model components for each ticker.
# all_forecasts only holds yhat / yhat_lower / yhat_upper (the point forecast and its interval
# bounds), which are not trend, seasonality or holiday effects. The components are therefore
# obtained by re-predicting with the fitted model stored for each ticker.
for ticker in tickers:
    model = trained_models[ticker]

    # Reuse exactly the dates that were forecast for this ticker
    forecast_dates = all_forecasts.loc[all_forecasts['ticker'] == ticker, ['ds']]
    forecast_ticker = model.predict(forecast_dates)

    # One panel per component that the fitted model contains
    components_fig = model.plot_components(forecast_ticker)
    components_fig.suptitle(f'Components for Ticker: {ticker}', y=1.02)

    # Show plot
    plt.show()


# <center>**STOCK PRICE PREDICTION USING RANDOM FOREST**</center>

#### <u>Data Review</u>

In [None]:
# The conversion below declares the dates as day/month/year. dayfirst=True makes read_csv
# parse them the same way; otherwise an ambiguous date such as 03/04/2020 could be read
# month-first, and the explicit format would no longer correct it because the column would
# already be datetime.
df = pd.read_csv('BAC_STOCK_DATA.csv', parse_dates=['Dates'], dayfirst=True, encoding='utf-8')

# Convert 'Dates' column to datetime format (covers the case where read_csv left it as text)
df['Dates'] = pd.to_datetime(df['Dates'], format='%d/%m/%Y')

# Show up to the first 144 rows
df.head(144)

# <center>**MODELING WITH RANDOM FOREST FOR BAC**</center>

#### <u>Splitting and Training</u>

In [None]:
# The date is the only feature; the BAC price is the target
X = df[['Dates']]
y = df['BAC UN Equity']

# Two successive 80/20 splits: hold out the test set first, then carve validation out of the rest.
# NOTE(review): train_test_split is called without shuffle=False, so rows are assigned to the
# splits at random rather than by date - confirm this is intended for a forecasting task.
split_options = dict(test_size=0.2, random_state=42)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, **split_options)

# Random forest with 100 trees and a fixed seed, fitted on the training split only
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)



#### <u>RANDOM FOREST PERFORMANCE EVALUATION BASED ON BAC</u>

In [None]:
# Predict on each split with the fitted model
y_train_pred = rf_model.predict(X_train)
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

# Calculate evaluation metrics on the training set
r2_train = r2_score(y_train, y_train_pred)
explained_variance_train = explained_variance_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mae_train = mean_absolute_error(y_train, y_train_pred)

# Calculate evaluation metrics on the validation set
r2_val = r2_score(y_val, y_val_pred)
explained_variance_val = explained_variance_score(y_val, y_val_pred)
mape_val = mean_absolute_percentage_error(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = mse_val ** 0.5
mae_val = mean_absolute_error(y_val, y_val_pred)

# Calculate evaluation metrics on the test set
r2_test = r2_score(y_test, y_test_pred)
explained_variance_test = explained_variance_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)

# Print each split's report exactly once
metric_report = [
    ("Training Set Metrics:",
     r2_train, explained_variance_train, mape_train, mse_train, rmse_train, mae_train),
    ("\nValidation Set Metrics:",
     r2_val, explained_variance_val, mape_val, mse_val, rmse_val, mae_val),
    ("\nTest Set Metrics:",
     r2_test, explained_variance_test, mape_test, mse_test, rmse_test, mae_test),
]
for heading, split_r2, split_ev, split_mape, split_mse, split_rmse, split_mae in metric_report:
    print(heading)
    print(f"\nR-squared: {split_r2:.2f}")
    print(f"Explained Variation: {split_ev:.2f}")
    print(f"MAPE: {split_mape:.2f}")
    print(f"MSE: {split_mse:.2f}")
    print(f"RMSE: {split_rmse:.2f}")
    print(f"MAE: {split_mae:.2f}")



#### <u>VISUALIZING OUR FORECAST FOR BAC</u>

In [None]:
# Scatter the training targets, the test targets and the test predictions against their dates
plt.figure(figsize=(12, 6))

scatter_layers = (
    (X_train['Dates'], y_train, 'Training Data'),
    (X_test['Dates'], y_test, 'Test Data'),
    (X_test['Dates'], y_test_pred, 'Predicted Data'),
)
for layer_dates, layer_values, layer_label in scatter_layers:
    plt.scatter(layer_dates, layer_values, label=layer_label)

plt.title('Random Forest Regression for BAC')
plt.xlabel('Date')
plt.ylabel('PX_LAST')
plt.legend()
plt.show()

#### <u>DATA REVIEW FOR BARC</u>

In [None]:
# The conversion below declares the dates as day/month/year. dayfirst=True makes read_csv
# parse them the same way; otherwise an ambiguous date such as 03/04/2020 could be read
# month-first, and the explicit format would no longer correct it because the column would
# already be datetime.
df = pd.read_csv('BARC.csv', parse_dates=['Dates'], dayfirst=True, encoding='utf-8')

# Convert 'Dates' column to datetime format (covers the case where read_csv left it as text)
df['Dates'] = pd.to_datetime(df['Dates'], format='%d/%m/%Y')

# Show up to the first 144 rows
df.head(144)

#### <u>SPLITTING AND TRAINING</u>

In [None]:
# Features: the single 'Dates' column. Target: the 'BARC LN Equity' column.
X, y = df[['Dates']], df['BARC LN Equity']

# Two successive seeded 80/20 splits: first hold out the test set, then split
# the remainder into training and validation sets.
# NOTE(review): train_test_split shuffles rows by default, so the training set
# contains dates later than some test dates - confirm this is intended for a
# forecasting comparison.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

# Fit a seeded 100-tree random forest regressor on the training split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

#### <u>RANDOM FOREST PERFORMANCE EVALUATION BASED ON BARC</u>

In [None]:
# Predict each split with the forest fitted on the BARC training data
y_train_pred = rf_model.predict(X_train)
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

# Calculate evaluation metrics on the training set
r2_train = r2_score(y_train, y_train_pred)
explained_variance_train = explained_variance_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mae_train = mean_absolute_error(y_train, y_train_pred)

# Calculate evaluation metrics on the validation set
r2_val = r2_score(y_val, y_val_pred)
explained_variance_val = explained_variance_score(y_val, y_val_pred)
mape_val = mean_absolute_percentage_error(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = mse_val ** 0.5
mae_val = mean_absolute_error(y_val, y_val_pred)

# Calculate evaluation metrics on the test set.
# This step was missing: the test report below used to print whatever
# r2_test, mse_test, ... an earlier section had left in the kernel rather
# than the BARC results.
r2_test = r2_score(y_test, y_test_pred)
explained_variance_test = explained_variance_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)

# The training metrics were computed but never reported; print them in the
# same layout the HSBA and JPM evaluation cells use.
print("Training Set Metrics:")
print(f"\nR-squared: {r2_train:.2f}")
print(f"Explained Variation: {explained_variance_train:.2f}")
print(f"MAPE: {mape_train:.2f}")
print(f"MSE: {mse_train:.2f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.2f}")

print("\nValidation Set Metrics:")
print(f"\nR-squared: {r2_val:.2f}")
print(f"Explained Variation: {explained_variance_val:.2f}")
print(f"MAPE: {mape_val:.2f}")
print(f"MSE: {mse_val:.2f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.2f}")

print("\nTest Set Metrics:")

print(f"\nR-squared: {r2_test:.2f}")
print(f"Explained Variation: {explained_variance_test:.2f}")
print(f"MAPE: {mape_test:.2f}")
print(f"MSE: {mse_test:.2f}")
print(f"RMSE: {rmse_test:.2f}")
print(f"MAE: {mae_test:.2f}")


#### <u>VISUALIZING OUR FORECAST FOR BARC</u>

In [None]:
# Overlay the training targets, the test targets and the forest's test-set
# predictions, each against its 'Dates' values, on one 12x6 inch figure.
plt.figure(figsize=(12, 6))
for _dates, _values, _label in (
    (X_train['Dates'], y_train, 'Training Data'),
    (X_test['Dates'], y_test, 'Test Data'),
    (X_test['Dates'], y_test_pred, 'Predicted Data'),
):
    plt.scatter(_dates, _values, label=_label)
plt.xlabel('Date')
plt.ylabel('PX_LAST')
plt.title('Random Forest Regression for BARC')
plt.legend()
plt.show()

#### <u>DATA REVIEW FOR HSBA</u>

In [None]:
def import_csv(file):
    """Read a CSV file into a DataFrame, skipping its first five lines.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. No column is converted here.
    """
    data = pd.read_csv(file, skiprows=5)
    return data


# A 'Dates' conversion used to sit here, before `df` was reloaded, so it ran
# against whatever frame was already in the kernel. It has been removed; the
# HSBA frame is converted in the following cell, after it is loaded.
file_path = 'hsbcbank.csv'
df = import_csv(file_path)

df

#### <u>Splitting and Training</u>

In [None]:
# Parse the 'Dates' column using the explicit DD/MM/YYYY format and store the
# result back on the frame as datetimes
df['Dates'] = pd.to_datetime(df['Dates'], format='%d/%m/%Y')

In [None]:
# Features (X): a one-column DataFrame holding 'Dates'.
# Target (y): the 'PX_LAST' column as a Series.
X = df[['Dates']]
y = df['PX_LAST']

In [None]:
# Two successive seeded 80/20 splits: first hold out the test set, then split
# the remainder into training and validation sets.
# NOTE(review): train_test_split shuffles rows by default, so the training set
# contains dates later than some test dates - confirm this is intended for a
# forecasting comparison.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

# Fit a seeded 100-tree random forest regressor on the training split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

#### <u>RANDOM FOREST PERFORMANCE EVALUATION BASED ON HSBA</u>

In [None]:
# Predictions of the fitted forest for the train, validation and test splits
y_train_pred, y_val_pred, y_test_pred = (
    rf_model.predict(_part) for _part in (X_train, X_val, X_test)
)

# Training-set metrics
r2_train = r2_score(y_train, y_train_pred)
explained_variance_train = explained_variance_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5  # RMSE is the square root of the MSE
mae_train = mean_absolute_error(y_train, y_train_pred)

# Validation-set metrics
r2_val = r2_score(y_val, y_val_pred)
explained_variance_val = explained_variance_score(y_val, y_val_pred)
mape_val = mean_absolute_percentage_error(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = mse_val ** 0.5
mae_val = mean_absolute_error(y_val, y_val_pred)

# Test-set metrics
r2_test = r2_score(y_test, y_test_pred)
explained_variance_test = explained_variance_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)

# One print call per split; sep="\n" keeps one metric per output line.
print(
    "Training Set Metrics:",
    f"\nR-squared: {r2_train:.2f}",
    f"Explained Variation: {explained_variance_train:.2f}",
    f"MAPE: {mape_train:.2f}",
    f"MSE: {mse_train:.2f}",
    f"RMSE: {rmse_train:.2f}",
    f"MAE: {mae_train:.2f}",
    sep="\n",
)
print(
    "\nValidation Set Metrics:",
    f"\nR-squared: {r2_val:.2f}",
    f"Explained Variation: {explained_variance_val:.2f}",
    f"MAPE: {mape_val:.2f}",
    f"MSE: {mse_val:.2f}",
    f"RMSE: {rmse_val:.2f}",
    f"MAE: {mae_val:.2f}",
    sep="\n",
)
print(
    "\nTest Set Metrics:",
    f"\nR-squared: {r2_test:.2f}",
    f"Explained Variation: {explained_variance_test:.2f}",
    f"MAPE: {mape_test:.2f}",
    f"MSE: {mse_test:.2f}",
    f"RMSE: {rmse_test:.2f}",
    f"MAE: {mae_test:.2f}",
    sep="\n",
)




#### **VISUALIZING OUR FORECAST FOR HSBA**

In [None]:
# Overlay the training targets, the test targets and the forest's test-set
# predictions, each against its 'Dates' values, on one 12x6 inch figure.
plt.figure(figsize=(12, 6))
for _dates, _values, _label in (
    (X_train['Dates'], y_train, 'Training Data'),
    (X_test['Dates'], y_test, 'Test Data'),
    (X_test['Dates'], y_test_pred, 'Predicted Data'),
):
    plt.scatter(_dates, _values, label=_label)
plt.xlabel('Date')
plt.ylabel('PX_LAST')
plt.title('Random Forest Regression for HSBC')
plt.legend()
plt.show()

#### <u>Data Review For JPM</u>

In [None]:
# Load the JPM price history. 'Dates' is read as plain text and parsed once,
# below, with the explicit day-first format. Letting read_csv pre-parse it
# (parse_dates=['Dates'] with the default dayfirst=False) can read ambiguous
# values such as 01/02/2020 month-first, and the explicit format is then never
# applied because the column is already datetime.
df = pd.read_csv('JPM_STOCK_DATA.csv', encoding='utf-8')

# Convert 'Dates' from DD/MM/YYYY text to datetime
df['Dates'] = pd.to_datetime(df['Dates'], format='%d/%m/%Y')

# Display the first 144 rows
df.head(144)

#### <u>Splitting and Training</u>

In [None]:
# Features: the single 'Dates' column. Target: the 'JPM UN Equity' column.
X, y = df[['Dates']], df['JPM UN Equity']

# Two successive seeded 80/20 splits: first hold out the test set, then split
# the remainder into training and validation sets.
# NOTE(review): train_test_split shuffles rows by default, so the training set
# contains dates later than some test dates - confirm this is intended for a
# forecasting comparison.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

# Fit a seeded 100-tree random forest regressor on the training split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


#### <u>RANDOM FOREST PERFORMANCE EVALUATION BASED ON JPM</u>

In [None]:

# Predict each split with the forest fitted on the JPM training data
y_train_pred = rf_model.predict(X_train)
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

# Calculate evaluation metrics on the training set
r2_train = r2_score(y_train, y_train_pred)
explained_variance_train = explained_variance_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mae_train = mean_absolute_error(y_train, y_train_pred)

# Calculate evaluation metrics on the validation set
r2_val = r2_score(y_val, y_val_pred)
explained_variance_val = explained_variance_score(y_val, y_val_pred)
mape_val = mean_absolute_percentage_error(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = mse_val ** 0.5
mae_val = mean_absolute_error(y_val, y_val_pred)

# Calculate evaluation metrics on the test set
r2_test = r2_score(y_test, y_test_pred)
explained_variance_test = explained_variance_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)

print("Training Set Metrics:")
print(f"\nR-squared: {r2_train:.2f}")
print(f"Explained Variation: {explained_variance_train:.2f}")
print(f"MAPE: {mape_train:.2f}")
print(f"MSE: {mse_train:.2f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.2f}")

# The validation report used to be printed three times, with an identical
# recomputation of the validation metrics in between; it is printed once now.
print("\nValidation Set Metrics:")
print(f"\nR-squared: {r2_val:.2f}")
print(f"Explained Variation: {explained_variance_val:.2f}")
print(f"MAPE: {mape_val:.2f}")
print(f"MSE: {mse_val:.2f}")
print(f"RMSE: {rmse_val:.2f}")
print(f"MAE: {mae_val:.2f}")

print("\nTest Set Metrics:")

print(f"\nR-squared: {r2_test:.2f}")
print(f"Explained Variation: {explained_variance_test:.2f}")
print(f"MAPE: {mape_test:.2f}")
print(f"MSE: {mse_test:.2f}")
print(f"RMSE: {rmse_test:.2f}")
print(f"MAE: {mae_test:.2f}")

#### **VISUALIZING OUR FORECAST FOR JPM**

In [None]:
# Overlay the training targets, the test targets and the forest's test-set
# predictions, each against its 'Dates' values, on one 12x6 inch figure.
plt.figure(figsize=(12, 6))
for _dates, _values, _label in (
    (X_train['Dates'], y_train, 'Training Data'),
    (X_test['Dates'], y_test, 'Test Data'),
    (X_test['Dates'], y_test_pred, 'Predicted Data'),
):
    plt.scatter(_dates, _values, label=_label)
plt.xlabel('Date')
plt.ylabel('PX_LAST')
plt.title('Random Forest Regression for JPM')
plt.legend()
plt.show()

# <center>**STOCK PRICE PREDICTION USING LINEAR REGRESSION**</center>

#### <u>Review of the Original Dataset for BAC</u>

In [None]:
# Read Stockprice_four.csv (the BAC data set for this section) into a pandas
# DataFrame named stock_price.

stock_price = pd.read_csv("Stockprice_four.csv")

In [None]:
# Parse the 'Dates' column with pandas' to_datetime.
# format='%d/%m/%Y' tells the parser the values are written as DD/MM/YYYY.

stock_price['Dates'] = pd.to_datetime(stock_price['Dates'], format='%d/%m/%Y')

# Show the first ten rows
stock_price.head(10)

In [None]:
# Make 'Dates' the DataFrame index (set_index returns a new frame, which is
# rebound to stock_price), then display the result.

stock_price = stock_price.set_index('Dates')
stock_price 

In [None]:
# Turn the datetime index into a numeric feature: whole days elapsed since the
# earliest date in the index, reshaped into a single-column 2-D array.
reference_date = stock_price.index.min()
X = (stock_price.index - reference_date).days.values.reshape(-1, 1)
# Number of observations
len(X)

#### <u>Splitting and Training</u>

In [None]:
# Target: the 'BAC UN EQUITY' column as a NumPy array (via .values)
y = stock_price['BAC UN EQUITY'].values

# Positional 80/20 split with no shuffling: the first 80% of the rows become
# the training set and the remaining rows the test set. If the rows are in
# date order, the test set is the most recent stretch of the series.
train_size = int(len(X) * 0.8)
tests_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

X_train


#### <u>Training The Model</u>

In [None]:
# Create a LinearRegression model and fit it to the training split
# (fit returns the fitted estimator, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the training and test splits; this section has no
# validation split.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


#### <u>PERFORMANCE EVALUATION BASED ON BAC</u>

In [None]:
# Score the fitted linear regression on the training and test splits.
# Metrics (all from sklearn.metrics):
#   r2_score                       - R-squared; 1 is a perfect fit, and the value can be
#                                    negative when the model does worse than predicting the mean.
#   explained_variance_score       - share of the target's variance explained by the model.
#   mean_absolute_percentage_error - MAPE, reported as a fraction rather than a percentage.
#   mean_squared_error             - MSE, the mean squared prediction error.
#   mse ** 0.5                     - RMSE, the square root of the MSE.
#   mean_absolute_error            - MAE, the mean absolute prediction error.

# Training-set metrics
r2_train = r2_score(y_train, y_train_pred)
explained_variance_train = explained_variance_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mae_train = mean_absolute_error(y_train, y_train_pred)

# Test-set metrics
r2_test = r2_score(y_test, y_test_pred)
explained_variance_test = explained_variance_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)

# One print call per split; sep="\n" keeps one metric per output line.
print(
    "Training Set Metrics:",
    f"\nR-squared: {r2_train:.2f}",
    f"Explained Variation: {explained_variance_train:.2f}",
    f"MAPE: {mape_train:.2f}",
    f"MSE: {mse_train:.2f}",
    f"RMSE: {rmse_train:.2f}",
    f"MAE: {mae_train:.2f}",
    sep="\n",
)
print(
    "\nTest Set Metrics:",
    f"\nR-squared: {r2_test:.2f}",
    f"Explained Variation: {explained_variance_test:.2f}",
    f"MAPE: {mape_test:.2f}",
    f"MSE: {mse_test:.2f}",
    f"RMSE: {rmse_test:.2f}",
    f"MAE: {mae_test:.2f}",
    sep="\n",
)

#### **VISUALIZING OUR FORECAST FOR BAC**

In [None]:
# Plot the train/test split together with the linear model's test-set
# predictions on one 12x6 inch figure, all as semi-transparent scatter points:
#   black  - every observation (X, y); the train and test points are drawn over it
#   red    - training portion
#   green  - test portion
#   orange - model predictions for the test portion
# The y-axis label previously read 'BAC LN Equity'; the plotted target is the
# 'BAC UN EQUITY' column, so the label now says 'BAC UN Equity'.

plt.figure(figsize=(12, 6))
plt.scatter(X, y, label='Full', alpha=0.3, color='black')
plt.scatter(X_train, y_train, label='Train', alpha=0.3, color='red')
plt.scatter(X_test, y_test, label='Test', alpha=0.3, color='green')
plt.scatter(X_test, y_test_pred, label='Predicted', alpha=0.3, color='orange')
plt.xlabel('Date')
plt.ylabel('BAC UN Equity')
plt.title('Linear Regression Model')
plt.legend()
plt.show()

In [None]:
# Test portion only: actual values against the model's predictions for the
# same X values. The y-axis label previously read 'BAC LN Equity'; the plotted
# target is the 'BAC UN EQUITY' column.
plt.figure(figsize=(12, 6))
plt.scatter(X_test, y_test, label='Test Data', alpha=0.3)
plt.scatter(X_test, model.predict(X_test), label='Predicted')
plt.xlabel('Date')
plt.ylabel('BAC UN Equity')
plt.title('Linear Regression Model')
plt.legend()
plt.show()

#### <u>Review of the Original Dataset for BARC</u>

In [None]:
# Read test_csv.csv (the BARC data set for this section) into a pandas
# DataFrame named stock_price.

stock_price = pd.read_csv("test_csv.csv")

# Parse the 'Dates' column with pandas' to_datetime.
# format='%d/%m/%Y' tells the parser the values are written as DD/MM/YYYY.

stock_price['Dates'] = pd.to_datetime(stock_price['Dates'], format='%d/%m/%Y')

# Show the first ten rows
stock_price.head(10)

In [None]:
# Make 'Dates' the DataFrame index (set_index returns a new frame, which is
# rebound to stock_price), then display the result.

stock_price = stock_price.set_index('Dates')
stock_price 

In [None]:
# Turn the datetime index into a numeric feature: whole days elapsed since the
# earliest date in the index, reshaped into a single-column 2-D array.
reference_date = stock_price.index.min()
X = (stock_price.index - reference_date).days.values.reshape(-1, 1)
# Number of observations
len(X)

#### <u>Splitting and Training</u>

In [None]:
# Target: the 'BARC LN Equity' column as a NumPy array (via .values)
y = stock_price['BARC LN Equity'].values

# Positional 80/20 split with no shuffling: the first 80% of the rows become
# the training set and the remaining rows the test set. If the rows are in
# date order, the test set is the most recent stretch of the series.
train_size = int(len(X) * 0.8)
tests_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

X_train


#### <u>Training The Model</u>

In [None]:
# Create a LinearRegression model and fit it to the training split
# (fit returns the fitted estimator, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the training and test splits; this section has no
# validation split.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


#### <u>PERFORMANCE EVALUATION BASED ON BARC</u>

In [None]:
# Score the fitted linear regression on the training and test splits.
# Metrics (all from sklearn.metrics):
#   r2_score                       - R-squared; 1 is a perfect fit, and the value can be
#                                    negative when the model does worse than predicting the mean.
#   explained_variance_score       - share of the target's variance explained by the model.
#   mean_absolute_percentage_error - MAPE, reported as a fraction rather than a percentage.
#   mean_squared_error             - MSE, the mean squared prediction error.
#   mse ** 0.5                     - RMSE, the square root of the MSE.
#   mean_absolute_error            - MAE, the mean absolute prediction error.

# Training-set metrics
r2_train = r2_score(y_train, y_train_pred)
explained_variance_train = explained_variance_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mae_train = mean_absolute_error(y_train, y_train_pred)

# Test-set metrics
r2_test = r2_score(y_test, y_test_pred)
explained_variance_test = explained_variance_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)

# One print call per split; sep="\n" keeps one metric per output line.
print(
    "Training Set Metrics:",
    f"\nR-squared: {r2_train:.2f}",
    f"Explained Variation: {explained_variance_train:.2f}",
    f"MAPE: {mape_train:.2f}",
    f"MSE: {mse_train:.2f}",
    f"RMSE: {rmse_train:.2f}",
    f"MAE: {mae_train:.2f}",
    sep="\n",
)
print(
    "\nTest Set Metrics:",
    f"\nR-squared: {r2_test:.2f}",
    f"Explained Variation: {explained_variance_test:.2f}",
    f"MAPE: {mape_test:.2f}",
    f"MSE: {mse_test:.2f}",
    f"RMSE: {rmse_test:.2f}",
    f"MAE: {mae_test:.2f}",
    sep="\n",
)

#### **VISUALIZING OUR FORECAST FOR BARC**

In [None]:
# Plot the train/test split together with the linear model's test-set
# predictions on one 12x6 inch figure, all as semi-transparent scatter points:
#   black  - every observation (X, y); the train and test points are drawn over it
#   red    - training portion
#   green  - test portion
#   orange - model predictions for the test portion
plt.figure(figsize=(12, 6))
for _xs, _ys, _label, _color in (
    (X, y, 'Full', 'black'),
    (X_train, y_train, 'Train', 'red'),
    (X_test, y_test, 'Test', 'green'),
    (X_test, y_test_pred, 'Predicted', 'orange'),
):
    plt.scatter(_xs, _ys, label=_label, alpha=0.3, color=_color)
plt.xlabel('Date')
plt.ylabel('BARC LN Equity')
plt.title('Linear Regression Model')
plt.legend()
plt.show()

In [None]:
# Test portion only: actual values (semi-transparent) against the model's
# predictions for the same X values.
plt.figure(figsize=(12, 6))
for _ys, _style in (
    (y_test, dict(label='Test Data', alpha=0.3)),
    (model.predict(X_test), dict(label='Predicted')),
):
    plt.scatter(X_test, _ys, **_style)
plt.xlabel('Date')
plt.ylabel('BARC LN Equity')
plt.title('Linear Regression Model')
plt.legend()
plt.show()

#### <u>Review of the Original Dataset for JPM</u>

In [None]:
# Read Stockprice_three.csv (the JPM data set for this section) into a pandas
# DataFrame named stock_price.

stock_price = pd.read_csv("Stockprice_three.csv")

# Parse the 'Dates' column with pandas' to_datetime.
# format='%d/%m/%Y' tells the parser the values are written as DD/MM/YYYY.

stock_price['Dates'] = pd.to_datetime(stock_price['Dates'], format='%d/%m/%Y')

# Show the first ten rows
stock_price.head(10)

In [None]:
# Make 'Dates' the DataFrame index (set_index returns a new frame, which is
# rebound to stock_price), then display the result.

stock_price = stock_price.set_index('Dates')
stock_price 

In [None]:
# Turn the datetime index into a numeric feature: whole days elapsed since the
# earliest date in the index, reshaped into a single-column 2-D array.
reference_date = stock_price.index.min()
X = (stock_price.index - reference_date).days.values.reshape(-1, 1)
# Number of observations
len(X)

#### <u>Splitting and Training</u>

In [None]:
# Target: the 'JPM UN Equity' column as a NumPy array (via .values)
y = stock_price['JPM UN Equity'].values

# Positional 80/20 split with no shuffling: the first 80% of the rows become
# the training set and the remaining rows the test set. If the rows are in
# date order, the test set is the most recent stretch of the series.
train_size = int(len(X) * 0.8)
tests_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

X_train

#### <u>Training The Model</u>

In [None]:
# Create a LinearRegression model and fit it to the training split
# (fit returns the fitted estimator, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the training and test splits; this section has no
# validation split.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


#### <u>PERFORMANCE EVALUATION BASED ON JPM</u>

In [None]:
# Score the fitted linear regression on the training and test splits.
# Metrics (all from sklearn.metrics):
#   r2_score                       - R-squared; 1 is a perfect fit, and the value can be
#                                    negative when the model does worse than predicting the mean.
#   explained_variance_score       - share of the target's variance explained by the model.
#   mean_absolute_percentage_error - MAPE, reported as a fraction rather than a percentage.
#   mean_squared_error             - MSE, the mean squared prediction error.
#   mse ** 0.5                     - RMSE, the square root of the MSE.
#   mean_absolute_error            - MAE, the mean absolute prediction error.

# Training-set metrics
r2_train = r2_score(y_train, y_train_pred)
explained_variance_train = explained_variance_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mae_train = mean_absolute_error(y_train, y_train_pred)

# Test-set metrics
r2_test = r2_score(y_test, y_test_pred)
explained_variance_test = explained_variance_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)

# One print call per split; sep="\n" keeps one metric per output line.
print(
    "Training Set Metrics:",
    f"\nR-squared: {r2_train:.2f}",
    f"Explained Variation: {explained_variance_train:.2f}",
    f"MAPE: {mape_train:.2f}",
    f"MSE: {mse_train:.2f}",
    f"RMSE: {rmse_train:.2f}",
    f"MAE: {mae_train:.2f}",
    sep="\n",
)
print(
    "\nTest Set Metrics:",
    f"\nR-squared: {r2_test:.2f}",
    f"Explained Variation: {explained_variance_test:.2f}",
    f"MAPE: {mape_test:.2f}",
    f"MSE: {mse_test:.2f}",
    f"RMSE: {rmse_test:.2f}",
    f"MAE: {mae_test:.2f}",
    sep="\n",
)

#### **VISUALIZING OUR FORECAST FOR JPM**

In [None]:
# Plot the train/test split together with the linear model's test-set
# predictions on one 12x6 inch figure, all as semi-transparent scatter points:
#   black  - every observation (X, y); the train and test points are drawn over it
#   red    - training portion
#   green  - test portion
#   orange - model predictions for the test portion
plt.figure(figsize=(12, 6))
for _xs, _ys, _label, _color in (
    (X, y, 'Full', 'black'),
    (X_train, y_train, 'Train', 'red'),
    (X_test, y_test, 'Test', 'green'),
    (X_test, y_test_pred, 'Predicted', 'orange'),
):
    plt.scatter(_xs, _ys, label=_label, alpha=0.3, color=_color)
plt.xlabel('Date')
plt.ylabel('JPM UN Equity')
plt.title('Linear Regression Model')
plt.legend()
plt.show()

In [None]:
# Test window only: observed prices against the fitted model's predictions
# for the same X_test inputs.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X_test, y_test, label='Test Data', alpha=0.3)
ax.scatter(X_test, model.predict(X_test), label='Predicted')
ax.set(xlabel='Date', ylabel='JPM UN Equity', title='Linear Regression Model')
ax.legend()
plt.show()

#### <u>Review of the Original Dataset for HSBA</u>

In [None]:
# Read Stockprice_two.csv into a DataFrame and parse its 'Dates' column from
# 'DD/MM/YYYY' strings into pandas datetimes in the same step.
stock_price = pd.read_csv("Stockprice_two.csv").assign(
    Dates=lambda frame: pd.to_datetime(frame['Dates'], format='%d/%m/%Y')
)

# Preview the first ten rows.
stock_price.head(10)

In [None]:
# Promote the parsed 'Dates' column to the DataFrame index so that later cells
# can do date arithmetic on stock_price.index.
# NOTE(review): running this cell twice in a row fails, because 'Dates' is no longer
# a column once it has become the index; re-run the loading cell first.

stock_price = stock_price.set_index('Dates')
stock_price 

In [None]:
# Encode every date as the number of days elapsed since the earliest date in
# the index, shaped as a single-column feature matrix.
reference_date = stock_price.index.min()
days_since_start = (stock_price.index - reference_date).days
X = days_since_start.to_numpy().reshape(-1, 1)

# Regression target: the 'HSBC Equity' column as a NumPy array.
y = stock_price['HSBC Equity'].to_numpy()

#### <u>Splitting and Training</u>

In [None]:
# Target variable: the 'HSBC Equity' column as a NumPy array.
y = stock_price['HSBC Equity'].values

# Ordered hold-out split: the first 80% of the rows form the training set and
# the remaining rows the test set. Nothing is shuffled, so the two partitions
# keep the row order of X and y.
train_size = int(0.8 * len(X))
tests_size = len(X) - train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Display the training feature matrix.
X_train


#### <u>Training The Model</u>

In [None]:
# Fit a LinearRegression on the training partition. fit() returns the
# estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the training partition and for the held-out test partition.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)


#### <u>PERFORMANCE EVALUATION BASED ON HSBA</u>

In [None]:
# Score the fitted model on each partition with five scikit-learn metrics:
#   r2_score                       - coefficient of determination; 1.0 is a perfect fit
#                                    and the value can be negative for a poor fit
#   explained_variance_score       - share of the target's variance captured by the predictions
#   mean_absolute_percentage_error - mean of |error| / |actual|, returned as a fraction
#   mean_squared_error             - mean of the squared errors
#   mean_absolute_error            - mean of the absolute errors
# RMSE is derived afterwards as the square root of the MSE.
scorers = (
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    mean_absolute_error,
)

# Training partition.
r2_train, explained_variance_train, mape_train, mse_train, mae_train = (
    scorer(y_train, y_train_pred) for scorer in scorers
)
rmse_train = mse_train ** 0.5

# Test partition.
r2_test, explained_variance_test, mape_test, mse_test, mae_test = (
    scorer(y_test, y_test_pred) for scorer in scorers
)
rmse_test = mse_test ** 0.5

# Report every metric rounded to two decimals, training block first.
print("Training Set Metrics:")
print(f"\nR-squared: {r2_train:.2f}")
print(f"Explained Variation: {explained_variance_train:.2f}")
print(f"MAPE: {mape_train:.2f}")
print(f"MSE: {mse_train:.2f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAE: {mae_train:.2f}")

print("\nTest Set Metrics:")
print(f"\nR-squared: {r2_test:.2f}")
print(f"Explained Variation: {explained_variance_test:.2f}")
print(f"MAPE: {mape_test:.2f}")
print(f"MSE: {mse_test:.2f}")
print(f"RMSE: {rmse_test:.2f}")
print(f"MAE: {mae_test:.2f}")

#### **VISUALIZING OUR FORECAST FOR HSBA**

In [None]:
# Overlay four scatter layers on one set of axes: the full series, the training
# partition, the test partition and the model's predictions for the test inputs.
fig, ax = plt.subplots(figsize=(12, 6))
scatter_layers = (
    (X, y, 'Full', 'black'),
    (X_train, y_train, 'Train', 'red'),
    (X_test, y_test, 'Test', 'green'),
    (X_test, y_test_pred, 'Predicted', 'orange'),
)
for x_vals, y_vals, layer_label, layer_color in scatter_layers:
    ax.scatter(x_vals, y_vals, label=layer_label, alpha=0.3, color=layer_color)

# Axis labels, title and legend, then render the figure.
ax.set_xlabel('Date')
ax.set_ylabel('HSBA Equity')
ax.set_title('Linear Regression Model')
ax.legend()
plt.show()

In [None]:
# Test window only: observed prices against the fitted model's predictions
# for the same X_test inputs.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X_test, y_test, label='Test Data', alpha=0.3)
ax.scatter(X_test, model.predict(X_test), label='Predicted')
ax.set(xlabel='Date', ylabel='HSBA Equity', title='Linear Regression Model')
ax.legend()
plt.show()

# <center>**STOCK PRICE PREDICTION USING ARIMA MODEL**</center>

#### <u>Review of the Original Dataset for HSBA</u>

In [None]:
# Load the raw price table from 'stockprice_two.csv' and preview its first ten rows.
# NOTE(review): the linear-regression section reads "Stockprice_two.csv" (capital S);
# on a case-sensitive filesystem only one spelling can match - confirm the file name.
df_1 = pd.read_csv('stockprice_two.csv')
df_1.head(10)

In [None]:

# Same load-and-preview step as the cell directly above: re-reads
# 'stockprice_two.csv' into df_1 and shows the first ten rows.
df_1 = pd.read_csv('stockprice_two.csv')
df_1.head(10)

In [None]:
# Build the date-indexed working frame as a new object: assign() returns a new
# DataFrame, so df_1 keeps its raw string dates and this cell can be re-run
# without depending on what an earlier run did to df_1.
df_2 = (
    df_1
    .assign(Dates=pd.to_datetime(df_1['Dates'], format='%d/%m/%Y'))
    .set_index('Dates')
)
# pd.to_datetime yields a datetime column (or raises), so set_index('Dates')
# already produces a DatetimeIndex; no further index conversion is needed and
# no frequency is forced onto the index.

# Check for nulls: True when any column contains at least one missing value.
df_2.isnull().sum().any()

#### **VISUALIZING OUR FORECAST FOR HSBA**

In [None]:
# Line chart of the 'HSBC Equity' series against its date index.
year = df_2.index
stock_price = df_2['HSBC Equity']

fig, ax = plt.subplots(figsize=(25, 8))
ax.plot(year, stock_price)
ax.set(title='stock_price prediction', xlabel='year', ylabel='stock_price')
plt.xticks(rotation=360, ha='right')

plt.show()

# Augmented Dickey-Fuller Test

In [None]:
# Augmented Dickey-Fuller test on the price series.
result = adfuller(stock_price)

# Positions 0, 1 and 4 of the result are used as the test statistic, the
# p-value and the mapping of critical values respectively.
adf_statistic, p_value = result[0], result[1]
critical_values = result[4]

print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')
print('Critical Values:')
for level, threshold in critical_values.items():
    print(f'   {level}: {threshold}')

# Verdict at the 5% significance level.
verdict = (
    "Reject the null hypothesis. The data is likely stationary."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis. The data may not be stationary."
)
print(verdict)


# Autocorrelation Function

In [None]:

# Correlograms of the price series up to lag 30: the autocorrelation function
# first, then the partial autocorrelation function, each shown as its own figure.
correlograms = (
    (plot_acf, 'Autocorrelation Function (ACF)'),
    (plot_pacf, 'Partial Autocorrelation Function (PACF)'),
)
for draw_correlogram, heading in correlograms:
    draw_correlogram(stock_price, lags=30, title=heading)
    plt.show()


# Moving average

In [None]:
def moving_average(data, window_size):
    """Centered moving average of a 1-D series, same length as the input.

    Each output value is the mean of the samples that actually fall inside the
    window. Near the two ends the window is truncated, so the mean is taken
    over fewer samples instead of treating the missing neighbours as zeros
    (which would drag the first and last half-window toward zero).

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sequence (list, NumPy array, pandas Series).
    window_size : int
        Number of samples in the averaging window; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with the smoothed values.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    values = np.asarray(data, dtype=float)
    width = int(window_size)
    if width < 1:
        raise ValueError("window_size must be at least 1")

    kernel = np.ones(width)
    # Sum of the samples under each window position ('same' pads with zeros).
    window_sums = np.convolve(values, kernel, 'same')
    # Number of real samples under each window position: convolving a vector
    # of ones counts exactly the non-padded entries.
    window_counts = np.convolve(np.ones(len(values)), kernel, 'same')
    return window_sums / window_counts

# Smooth the price series with a 30-observation window, print the smoothed
# values and draw them together with the raw prices.
window_size = 30

ma = moving_average(stock_price, window_size)
print(ma)

fig, ax = plt.subplots(figsize=(24, 8))
for curve in (ma, stock_price):
    ax.plot(year, curve)
ax.set_title('Stock Price Moving Average')



# Decomposition

In [None]:
# Additive seasonal decomposition of the price series with a period of 365 observations.
# NOTE(review): period=365 presumes roughly one observation per calendar day - confirm
# against the actual sampling of the data.
decomposition = seasonal_decompose(df_2['HSBC Equity'], model='additive', period=365)

# One stacked panel per component, top to bottom.
fig, ax = plt.subplots(4, 1, figsize=(10, 8))
panels = (
    ('Original Series', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonality', decomposition.seasonal),
    ('Residuals', decomposition.resid),
)
for panel, (panel_title, component) in zip(ax, panels):
    panel.plot(df_2.index, component)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Ordered 80/20 split of the frame; shuffle=False keeps the rows in their original order.
train_data, test_data = train_test_split(df_2, test_size=0.2, shuffle=False)

# Let auto_arima select a seasonal model (m=12) on the training rows only.
# NOTE(review): confirm that a seasonal period of 12 suits this series' sampling frequency.
model = auto_arima(train_data['HSBC Equity'], seasonal=True, m=12, suppress_warnings=True)

# Forecast one step per held-out row and score the forecast against the observed prices.
horizon = len(test_data)
forecast = model.predict(n_periods=horizon)
mse = mean_squared_error(test_data['HSBC Equity'], forecast)
print(f"Mean Squared Error (MSE): {mse}")


# Auto ARIMA

In [None]:
# Both names are already imported in the notebook's first cell; they are
# repeated here, which lets this cell run without it.
import pandas as pd
from pmdarima import auto_arima

# Run the auto_arima search on the complete 'HSBC Equity' series (no hold-out),
# with the seasonal search enabled and a seasonal period of 12.
model = auto_arima(df_2['HSBC Equity'], seasonal=True, m=12, suppress_warnings=True)

# Report the non-seasonal (p, d, q) order of the selected model.
selected_order = model.order
print("Selected ARIMA Order:", selected_order)

# AIC

In [None]:
# Raw price series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['HSBC Equity'], label='Original Data', marker='o')
ax.set(title='Original HSBC Equity', xlabel='Dates', ylabel='HSBC Equity')
ax.legend()
plt.show()

# First-order differencing: each row minus the previous row, stored as a new
# column of df_2 (the first row has no predecessor and is therefore NaN).
df_2['DifferencedHSBC Equity'] = df_2['HSBC Equity'].diff()

# Differenced series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['DifferencedHSBC Equity'],
        label='Differenced Data', marker='o', color='orange')
ax.set(title='Differenced HSBC Equity', xlabel='Dates', ylabel='Differenced HSBC Equity')
ax.legend()
plt.show()


In [None]:
df_2['DifferencedHSBC Equity'] # display the first-differenced series stored in df_2

# Second-Order Differencing

In [None]:
# Raw price series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['HSBC Equity'], label='Original Data', marker='o')
ax.set(title='Original HSBC Equity', xlabel='Dates', ylabel='HSBC Equity')
ax.legend()
plt.show()

# First difference, read from the 'DifferencedHSBC Equity' column that must
# already exist in df_2.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['DifferencedHSBC Equity'],
        label='1st Differenced Data', marker='o', color='orange')
ax.set(title='1st Differenced HSBC Equity', xlabel='Dates',
       ylabel='1st Differenced HSBC Equity')
ax.legend()
plt.show()

# Second difference: difference the already-differenced column and store it.
df_2['SecondDifferencedHSBC Equity'] = df_2['DifferencedHSBC Equity'].diff()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['SecondDifferencedHSBC Equity'],
        label='2nd Differenced Data', marker='o', color='green')
ax.set(title='2nd DifferencedHSBC Equity', xlabel='Dates',
       ylabel='2nd Differenced HSBC Equity')
ax.legend()
plt.show()


In [None]:
# ARIMA (AutoRegressive Integrated Moving Average) model

# Fit ARIMA with order (p, d, q) = (0, 1, 0): first-order differencing and no
# AR or MA terms. Replace with other values to try a different specification.
order = (0, 1, 0)
model = ARIMA(df_2['HSBC Equity'], order=order)
result = model.fit()

# In-sample predictions across the whole date range (dynamic=False).
predictions = result.predict(start=df_2.index.min(), end=df_2.index.max(), dynamic=False)
# NOTE(review): with d=1 the very first prediction has no earlier observation
# behind it - confirm whether it should be left out of the chart.

# Draw the observed series and the predictions on the SAME axes so they can be
# compared directly and share one title, one pair of axis labels and one legend.
fig, ax = plt.subplots(figsize=(24, 8))
ax.plot(df_2.index, df_2['HSBC Equity'], label='Original Data')
ax.plot(df_2.index, predictions, color='red', label='ARIMA Predictions')
ax.set_title('Stock Price Prediction')
ax.set_xlabel('Year')
ax.set_ylabel('Stock Price')
ax.legend()
plt.show()

In [None]:
# Grid search over ARIMA(p, d, q) orders, keeping the order with the lowest AIC.
# Candidate values for p, d and q:
p_values = [0, 1, 2]
d_values = [0, 1]
q_values = [0, 1, 2]

# Best result seen so far.
best_aic = float("inf")
best_order = None

for p, d, q in itertools.product(p_values, d_values, q_values):
    order = (p, d, q)
    try:
        model = ARIMA(df_2['HSBC Equity'], order=order)
        result = model.fit()
    except Exception as exc:
        # Keep the search best-effort, but say which order was skipped and why.
        # Catching Exception (not a bare except) also lets Ctrl-C / kernel
        # interrupt stop the search.
        print(f"Skipping order {order}: {type(exc).__name__}: {exc}")
        continue
    aic = result.aic
    if aic < best_aic:
        best_aic = aic
        best_order = order

print(f"Best AIC: {best_aic}")
print(f"Best Order (p, d, q): {best_order}")

In [None]:
# Feature/target arrays used by the plotting cell: days elapsed since the
# earliest date, and the observed prices.
reference_date = df_2.index.min()
X = (df_2.index - reference_date).days.values.reshape(-1, 1)
y = df_2['HSBC Equity'].values

# Ordered 80/20 split (first 80% of the rows for training).
train_size = int(len(X)*0.8)
tests_size = len(X)-train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Fit on the training rows ONLY. Fitting on the whole series would make every
# forecast start after the last observation, so the "test" predictions would
# not correspond to the held-out rows they are plotted against.
train_series = df_2['HSBC Equity'].iloc[:train_size]

# Use auto_arima to find the best ARIMA model
model = auto_arima(train_series, seasonal=True, m=12, suppress_warnings=True, stepwise=True)

# Get the selected order from auto_arima
order = model.get_params()['order']

# Train a statsmodels ARIMA with the selected (p, d, q) order
arima_model = ARIMA(train_series, order=order)
arima_fit = arima_model.fit()

# Forecast one step per held-out row
forecast = arima_fit.forecast(steps=len(X_test))

# Evaluate the model using AIC
aic = arima_fit.aic

# In-sample fit for the training rows, out-of-sample forecast for the test rows.
y_train_pred = np.asarray(model.predict_in_sample())
y_test_pred = np.asarray(model.predict(n_periods=len(X_test)))



In [None]:
# Report the selected order and the AIC stored by the previous cell.
print(f"Selected ARIMA Order: {order}")
print(f"AIC: {aic}")

# First figure: the price history, drawn on pyplot's current axes.
history_ax = plt.gca()
history_ax.plot(year, stock_price)
history_ax.set_title('stock_price prediction')
history_ax.set_xlabel('year')
history_ax.set_ylabel('stock_price')
plt.xticks(rotation=360, ha='right')

# Second figure: training points, test points and the predictions for the test inputs.
forecast_fig, forecast_ax = plt.subplots(figsize=(12, 6))
forecast_layers = (
    (X_train, y_train, 'Train', 'red'),
    (X_test, y_test, 'Test', 'green'),
    (X_test, y_test_pred, 'Predicted', 'orange'),
)
for x_vals, y_vals, layer_label, layer_color in forecast_layers:
    forecast_ax.scatter(x_vals, y_vals, label=layer_label, alpha=0.3, color=layer_color)
forecast_ax.set_title('Stock Price Prediction with Auto ARIMA')
forecast_ax.set_xlabel('Date')
forecast_ax.set_ylabel('Stock Price')
forecast_ax.legend()
plt.show()


#plt.show()

#### <u>Review of the Original Dataset for BARC</u>

In [None]:
# Load the raw price table from 'test_csv.csv' and preview its first ten rows.
df_1 = pd.read_csv('test_csv.csv')
df_1.head(10)

In [None]:

# Same load-and-preview step as the cell directly above: re-reads
# 'test_csv.csv' into df_1 and shows the first ten rows.
df_1 = pd.read_csv('test_csv.csv')
df_1.head(10)

In [None]:
# Build the date-indexed working frame as a new object: assign() returns a new
# DataFrame, so df_1 keeps its raw string dates and this cell can be re-run
# without depending on what an earlier run did to df_1.
df_2 = (
    df_1
    .assign(Dates=pd.to_datetime(df_1['Dates'], format='%d/%m/%Y'))
    .set_index('Dates')
)
# pd.to_datetime yields a datetime column (or raises), so set_index('Dates')
# already produces a DatetimeIndex; no further index conversion is needed and
# no frequency is forced onto the index.



In [None]:
# Check for nulls: evaluates to True when any column of df_2 contains at least
# one missing value.

df_2.isnull().sum().any()

# Visualisation 

In [None]:
# Line chart of the 'BARC LN Equity' series against its date index.
year = df_2.index
stock_price = df_2['BARC LN Equity']

fig, ax = plt.subplots(figsize=(25, 8))
ax.plot(year, stock_price)
ax.set(title='stock_price prediction', xlabel='year', ylabel='stock_price')
plt.xticks(rotation=360, ha='right')

plt.show()

# Augmented Dickey-Fuller Test

In [None]:
# Augmented Dickey-Fuller test on the price series.
result = adfuller(stock_price)

# Positions 0, 1 and 4 of the result are used as the test statistic, the
# p-value and the mapping of critical values respectively.
adf_statistic, p_value = result[0], result[1]
critical_values = result[4]

print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')
print('Critical Values:')
for level, threshold in critical_values.items():
    print(f'   {level}: {threshold}')

# Verdict at the 5% significance level.
verdict = (
    "Reject the null hypothesis. The data is likely stationary."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis. The data may not be stationary."
)
print(verdict)


# Autocorrelation Function

In [None]:

# Correlograms of the price series up to lag 30: the autocorrelation function
# first, then the partial autocorrelation function, each shown as its own figure.
correlograms = (
    (plot_acf, 'Autocorrelation Function (ACF)'),
    (plot_pacf, 'Partial Autocorrelation Function (PACF)'),
)
for draw_correlogram, heading in correlograms:
    draw_correlogram(stock_price, lags=30, title=heading)
    plt.show()


# Moving average

In [None]:
def moving_average(data, window_size):
    """Centered moving average of a 1-D series, same length as the input.

    Each output value is the mean of the samples that actually fall inside the
    window. Near the two ends the window is truncated, so the mean is taken
    over fewer samples instead of treating the missing neighbours as zeros
    (which would drag the first and last half-window toward zero).

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sequence (list, NumPy array, pandas Series).
    window_size : int
        Number of samples in the averaging window; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with the smoothed values.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    values = np.asarray(data, dtype=float)
    width = int(window_size)
    if width < 1:
        raise ValueError("window_size must be at least 1")

    kernel = np.ones(width)
    # Sum of the samples under each window position ('same' pads with zeros).
    window_sums = np.convolve(values, kernel, 'same')
    # Number of real samples under each window position: convolving a vector
    # of ones counts exactly the non-padded entries.
    window_counts = np.convolve(np.ones(len(values)), kernel, 'same')
    return window_sums / window_counts

# Smooth the price series with a 30-observation window, print the smoothed
# values and draw them together with the raw prices.
window_size = 30

ma = moving_average(stock_price, window_size)
print(ma)

fig, ax = plt.subplots(figsize=(24, 8))
for curve in (ma, stock_price):
    ax.plot(year, curve)
ax.set_title('Stock Price Moving Average')



# Decomposition

In [None]:
# Additive seasonal decomposition of the price series with a period of 365 observations.
# NOTE(review): period=365 presumes roughly one observation per calendar day - confirm
# against the actual sampling of the data.
decomposition = seasonal_decompose(df_2['BARC LN Equity'], model='additive', period=365)

# One stacked panel per component, top to bottom.
fig, ax = plt.subplots(4, 1, figsize=(10, 8))
panels = (
    ('Original Series', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonality', decomposition.seasonal),
    ('Residuals', decomposition.resid),
)
for panel, (panel_title, component) in zip(ax, panels):
    panel.plot(df_2.index, component)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Ordered 80/20 split of the frame; shuffle=False keeps the rows in their original order.
train_data, test_data = train_test_split(df_2, test_size=0.2, shuffle=False)

# Let auto_arima select a seasonal model (m=12) on the training rows only.
# NOTE(review): confirm that a seasonal period of 12 suits this series' sampling frequency.
model = auto_arima(train_data['BARC LN Equity'], seasonal=True, m=12, suppress_warnings=True)

# Forecast one step per held-out row and score the forecast against the observed prices.
horizon = len(test_data)
forecast = model.predict(n_periods=horizon)
mse = mean_squared_error(test_data['BARC LN Equity'], forecast)
print(f"Mean Squared Error (MSE): {mse}")


# Auto ARIMA

In [None]:
# Both names are already imported in the notebook's first cell; they are
# repeated here, which lets this cell run without it.
import pandas as pd
from pmdarima import auto_arima

# Run the auto_arima search on the complete 'BARC LN Equity' series (no hold-out),
# with the seasonal search enabled and a seasonal period of 12.
model = auto_arima(df_2['BARC LN Equity'], seasonal=True, m=12, suppress_warnings=True)

# Report the non-seasonal (p, d, q) order of the selected model.
selected_order = model.order
print("Selected ARIMA Order:", selected_order)

# AIC

In [None]:
# Raw price series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['BARC LN Equity'], label='Original Data', marker='o')
ax.set(title='Original BARC LN Equity', xlabel='Dates', ylabel='BARC LN Equity')
ax.legend()
plt.show()

# First-order differencing: each row minus the previous row, stored as a new
# column of df_2 (the first row has no predecessor and is therefore NaN).
df_2['DifferencedBARC LN Equity'] = df_2['BARC LN Equity'].diff()

# Differenced series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['DifferencedBARC LN Equity'],
        label='Differenced Data', marker='o', color='orange')
ax.set(title='Differenced BARC LN Equity', xlabel='Dates',
       ylabel='Differenced BARC LN Equity')
ax.legend()
plt.show()


In [None]:
df_2['DifferencedBARC LN Equity'] # display the first-differenced series stored in df_2

# Second-Order Differencing

In [None]:
# Raw price series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['BARC LN Equity'], label='Original Data', marker='o')
ax.set(title='Original BARC LN Equity', xlabel='Dates', ylabel='BARC LN Equity')
ax.legend()
plt.show()

# First difference, read from the 'DifferencedBARC LN Equity' column that must
# already exist in df_2.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['DifferencedBARC LN Equity'],
        label='1st Differenced Data', marker='o', color='orange')
ax.set(title='1st Differenced BARC LN Equity', xlabel='Dates',
       ylabel='1st Differenced BARC LN Equity')
ax.legend()
plt.show()

# Second difference: difference the already-differenced column and store it.
df_2['SecondDifferencedBARC LN Equity'] = df_2['DifferencedBARC LN Equity'].diff()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['SecondDifferencedBARC LN Equity'],
        label='2nd Differenced Data', marker='o', color='green')
ax.set(title='2nd DifferencedBARC LN Equity', xlabel='Dates',
       ylabel='2nd Differenced BARC LN Equity')
ax.legend()
plt.show()


In [None]:
# ARIMA (AutoRegressive Integrated Moving Average) model

# Fit ARIMA with order (p, d, q) = (0, 1, 0): first-order differencing and no
# AR or MA terms. Replace with other values to try a different specification.
order = (0, 1, 0)
model = ARIMA(df_2['BARC LN Equity'], order=order)
result = model.fit()

# In-sample predictions across the whole date range (dynamic=False).
predictions = result.predict(start=df_2.index.min(), end=df_2.index.max(), dynamic=False)
# NOTE(review): with d=1 the very first prediction has no earlier observation
# behind it - confirm whether it should be left out of the chart.

# Draw the observed series and the predictions on the SAME axes so they can be
# compared directly and share one title, one pair of axis labels and one legend.
fig, ax = plt.subplots(figsize=(24, 8))
ax.plot(df_2.index, df_2['BARC LN Equity'], label='Original Data')
ax.plot(df_2.index, predictions, color='red', label='ARIMA Predictions')
ax.set_title('Stock Price Prediction')
ax.set_xlabel('Year')
ax.set_ylabel('Stock Price')
ax.legend()
plt.show()

In [None]:
# Grid search over ARIMA(p, d, q) orders, keeping the order with the lowest AIC.
# Candidate values for p, d and q:
p_values = [0, 1, 2]
d_values = [0, 1]
q_values = [0, 1, 2]

# Best result seen so far.
best_aic = float("inf")
best_order = None

for p, d, q in itertools.product(p_values, d_values, q_values):
    order = (p, d, q)
    try:
        # This section analyses the BARC series, so the model must be fitted on
        # 'BARC LN Equity' - the column every other cell of the section uses.
        model = ARIMA(df_2['BARC LN Equity'], order=order)
        result = model.fit()
    except Exception as exc:
        # Keep the search best-effort, but say which order was skipped and why.
        # Catching Exception (not a bare except) also lets Ctrl-C / kernel
        # interrupt stop the search.
        print(f"Skipping order {order}: {type(exc).__name__}: {exc}")
        continue
    aic = result.aic
    if aic < best_aic:
        best_aic = aic
        best_order = order

print(f"Best AIC: {best_aic}")
print(f"Best Order (p, d, q): {best_order}")

In [None]:
# Feature/target arrays used by the plotting cell: days elapsed since the
# earliest date, and the observed prices.
reference_date = df_2.index.min()
X = (df_2.index - reference_date).days.values.reshape(-1, 1)
y = df_2['BARC LN Equity'].values

# Ordered 80/20 split (first 80% of the rows for training).
train_size = int(len(X)*0.8)
tests_size = len(X)-train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Fit on the training rows ONLY. Fitting on the whole series would make every
# forecast start after the last observation, so the "test" predictions would
# not correspond to the held-out rows they are plotted against.
train_series = df_2['BARC LN Equity'].iloc[:train_size]

# Use auto_arima to find the best ARIMA model
model = auto_arima(train_series, seasonal=True, m=12, suppress_warnings=True, stepwise=True)

# Get the selected order from auto_arima
order = model.get_params()['order']

# Train a statsmodels ARIMA with the selected (p, d, q) order
arima_model = ARIMA(train_series, order=order)
arima_fit = arima_model.fit()

# Forecast one step per held-out row
forecast = arima_fit.forecast(steps=len(X_test))

# Evaluate the model using AIC
aic = arima_fit.aic

# In-sample fit for the training rows, out-of-sample forecast for the test rows.
y_train_pred = np.asarray(model.predict_in_sample())
y_test_pred = np.asarray(model.predict(n_periods=len(X_test)))



In [None]:
# Report the selected order and the AIC stored by the previous cell.
print(f"Selected ARIMA Order: {order}")
print(f"AIC: {aic}")

# First figure: the price history, drawn on pyplot's current axes.
history_ax = plt.gca()
history_ax.plot(year, stock_price)
history_ax.set_title('stock_price prediction')
history_ax.set_xlabel('year')
history_ax.set_ylabel('stock_price')
plt.xticks(rotation=360, ha='right')

# Second figure: training points, test points and the predictions for the test inputs.
forecast_fig, forecast_ax = plt.subplots(figsize=(12, 6))
forecast_layers = (
    (X_train, y_train, 'Train', 'red'),
    (X_test, y_test, 'Test', 'green'),
    (X_test, y_test_pred, 'Predicted', 'orange'),
)
for x_vals, y_vals, layer_label, layer_color in forecast_layers:
    forecast_ax.scatter(x_vals, y_vals, label=layer_label, alpha=0.3, color=layer_color)
forecast_ax.set_title('Stock Price Prediction with Auto ARIMA')
forecast_ax.set_xlabel('Date')
forecast_ax.set_ylabel('Stock Price')
forecast_ax.legend()
plt.show()


#plt.show()

#### <u>Review of the Original Dataset for JPM</u>

In [None]:
# Load the raw price table from 'stockprice_three.csv' and preview its first ten rows.
df_1 = pd.read_csv('stockprice_three.csv')
df_1.head(10)

In [None]:
# Same load-and-preview step as the cell directly above: re-reads
# 'stockprice_three.csv' into df_1 and shows the first ten rows.
df_1 = pd.read_csv('stockprice_three.csv')
df_1.head(10)

In [None]:
# Build the date-indexed working frame as a new object: assign() returns a new
# DataFrame, so df_1 keeps its raw string dates and this cell can be re-run
# without depending on what an earlier run did to df_1.
df_2 = (
    df_1
    .assign(Dates=pd.to_datetime(df_1['Dates'], format='%d/%m/%Y'))
    .set_index('Dates')
)
# pd.to_datetime yields a datetime column (or raises), so set_index('Dates')
# already produces a DatetimeIndex; no further index conversion is needed and
# no frequency is forced onto the index.



In [None]:
# Check for nulls: evaluates to True when any column of df_2 contains at least
# one missing value.

df_2.isnull().sum().any()

# Visualisation 

In [None]:
# Line chart of the 'JPM UN Equity' series against its date index.
year = df_2.index
stock_price = df_2['JPM UN Equity']

fig, ax = plt.subplots(figsize=(25, 8))
ax.plot(year, stock_price)
ax.set(title='stock_price prediction', xlabel='year', ylabel='stock_price')
plt.xticks(rotation=360, ha='right')

plt.show()

# Augmented Dickey-Fuller Test

In [None]:
# Augmented Dickey-Fuller test on the price series.
result = adfuller(stock_price)

# Positions 0, 1 and 4 of the result are used as the test statistic, the
# p-value and the mapping of critical values respectively.
adf_statistic, p_value = result[0], result[1]
critical_values = result[4]

print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')
print('Critical Values:')
for level, threshold in critical_values.items():
    print(f'   {level}: {threshold}')

# Verdict at the 5% significance level.
verdict = (
    "Reject the null hypothesis. The data is likely stationary."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis. The data may not be stationary."
)
print(verdict)


# Autocorrelation Function

In [None]:

# Correlograms of the price series up to lag 30: the autocorrelation function
# first, then the partial autocorrelation function, each shown as its own figure.
correlograms = (
    (plot_acf, 'Autocorrelation Function (ACF)'),
    (plot_pacf, 'Partial Autocorrelation Function (PACF)'),
)
for draw_correlogram, heading in correlograms:
    draw_correlogram(stock_price, lags=30, title=heading)
    plt.show()


# Moving average

In [None]:
def moving_average(data, window_size):
    """Centered moving average of a 1-D series, same length as the input.

    Each output value is the mean of the samples that actually fall inside the
    window. Near the two ends the window is truncated, so the mean is taken
    over fewer samples instead of treating the missing neighbours as zeros
    (which would drag the first and last half-window toward zero).

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sequence (list, NumPy array, pandas Series).
    window_size : int
        Number of samples in the averaging window; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with the smoothed values.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    values = np.asarray(data, dtype=float)
    width = int(window_size)
    if width < 1:
        raise ValueError("window_size must be at least 1")

    kernel = np.ones(width)
    # Sum of the samples under each window position ('same' pads with zeros).
    window_sums = np.convolve(values, kernel, 'same')
    # Number of real samples under each window position: convolving a vector
    # of ones counts exactly the non-padded entries.
    window_counts = np.convolve(np.ones(len(values)), kernel, 'same')
    return window_sums / window_counts

# Smooth the price series with a 30-observation window, print the smoothed
# values and draw them together with the raw prices.
window_size = 30

ma = moving_average(stock_price, window_size)
print(ma)

fig, ax = plt.subplots(figsize=(24, 8))
for curve in (ma, stock_price):
    ax.plot(year, curve)
ax.set_title('Stock Price Moving Average')



# Decomposition

In [None]:
# Additive seasonal decomposition of the price series with a period of 365 observations.
# NOTE(review): period=365 presumes roughly one observation per calendar day - confirm
# against the actual sampling of the data.
decomposition = seasonal_decompose(df_2['JPM UN Equity'], model='additive', period=365)

# One stacked panel per component, top to bottom.
fig, ax = plt.subplots(4, 1, figsize=(10, 8))
panels = (
    ('Original Series', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonality', decomposition.seasonal),
    ('Residuals', decomposition.resid),
)
for panel, (panel_title, component) in zip(ax, panels):
    panel.plot(df_2.index, component)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Ordered 80/20 split of the frame; shuffle=False keeps the rows in their original order.
train_data, test_data = train_test_split(df_2, test_size=0.2, shuffle=False)

# Let auto_arima select a seasonal model (m=12) on the training rows only.
# NOTE(review): confirm that a seasonal period of 12 suits this series' sampling frequency.
model = auto_arima(train_data['JPM UN Equity'], seasonal=True, m=12, suppress_warnings=True)

# Forecast one step per held-out row and score the forecast against the observed prices.
horizon = len(test_data)
forecast = model.predict(n_periods=horizon)
mse = mean_squared_error(test_data['JPM UN Equity'], forecast)
print(f"Mean Squared Error (MSE): {mse}")


# Auto ARIMA

In [None]:
# Both names are already imported in the notebook's first cell; they are
# repeated here, which lets this cell run without it.
import pandas as pd
from pmdarima import auto_arima

# Run the auto_arima search on the complete 'JPM UN Equity' series (no hold-out),
# with the seasonal search enabled and a seasonal period of 12.
model = auto_arima(df_2['JPM UN Equity'], seasonal=True, m=12, suppress_warnings=True)

# Report the non-seasonal (p, d, q) order of the selected model.
selected_order = model.order
print("Selected ARIMA Order:", selected_order)

# AIC

In [None]:
# Raw price series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['JPM UN Equity'], label='Original Data', marker='o')
ax.set(title='Original JPM UN Equity', xlabel='Dates', ylabel='JPM UN Equity')
ax.legend()
plt.show()

# First-order differencing: each row minus the previous row, stored as a new
# column of df_2 (the first row has no predecessor and is therefore NaN).
df_2['DifferencedJPM UN Equity'] = df_2['JPM UN Equity'].diff()

# Differenced series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['DifferencedJPM UN Equity'],
        label='Differenced Data', marker='o', color='orange')
ax.set(title='Differenced JPM UN Equity', xlabel='Dates',
       ylabel='Differenced JPM UN Equity')
ax.legend()
plt.show()


In [None]:
df_2['DifferencedJPM UN Equity'] # display the first-differenced series stored in df_2

# Second-Order Differencing

In [None]:
# Raw price series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['JPM UN Equity'], label='Original Data', marker='o')
ax.set(title='Original JPM UN Equity', xlabel='Dates', ylabel='JPM UN Equity')
ax.legend()
plt.show()

# First difference, read from the 'DifferencedJPM UN Equity' column that must
# already exist in df_2.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['DifferencedJPM UN Equity'],
        label='1st Differenced Data', marker='o', color='orange')
ax.set(title='1st Differenced JPM UN Equity', xlabel='Dates',
       ylabel='1st Differenced JPM UN Equity')
ax.legend()
plt.show()

# Second difference: difference the already-differenced column and store it.
df_2['SecondDifferencedJPM UN Equity'] = df_2['DifferencedJPM UN Equity'].diff()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_2.index, df_2['SecondDifferencedJPM UN Equity'],
        label='2nd Differenced Data', marker='o', color='green')
ax.set(title='2nd DifferencedJPM UN Equity', xlabel='Dates',
       ylabel='2nd Differenced JPM UN Equity')
ax.legend()
plt.show()


In [None]:
# ARIMA (AutoRegressive Integrated Moving Average) model

# Fit ARIMA with order (p, d, q) = (0, 1, 0): first-order differencing and no
# AR or MA terms. Replace with other values to try a different specification.
order = (0, 1, 0)
model = ARIMA(df_2['JPM UN Equity'], order=order)
result = model.fit()

# In-sample predictions across the whole date range (dynamic=False).
predictions = result.predict(start=df_2.index.min(), end=df_2.index.max(), dynamic=False)
# NOTE(review): with d=1 the very first prediction has no earlier observation
# behind it - confirm whether it should be left out of the chart.

# Draw the observed series and the predictions on the SAME axes so they can be
# compared directly and share one title, one pair of axis labels and one legend.
fig, ax = plt.subplots(figsize=(24, 8))
ax.plot(df_2.index, df_2['JPM UN Equity'], label='Original Data')
ax.plot(df_2.index, predictions, color='red', label='ARIMA Predictions')
ax.set_title('Stock Price Prediction')
ax.set_xlabel('Year')
ax.set_ylabel('Stock Price')
ax.legend()
plt.show()

In [None]:
# Grid search over ARIMA (p, d, q) orders, keeping the order with the lowest AIC.
# NOTE(review): AIC values of models with different d are computed on
# differently differenced data; treat cross-d comparisons with care.

# Candidate values for p, d and q
p_values = [0, 1, 2]
d_values = [0, 1]
q_values = [0, 1, 2]

# Best result found so far
best_aic = float("inf")
best_order = None

# Orders that could not be fitted, with the reason, so failures stay visible.
failed_orders = []

for p, d, q in itertools.product(p_values, d_values, q_values):
    order = (p, d, q)
    try:
        model = ARIMA(df_2['JPM UN Equity'], order=order)
        result = model.fit()
        aic = result.aic
    except Exception as exc:
        # `Exception` (not a bare except) lets a kernel interrupt stop the search.
        failed_orders.append((order, str(exc)))
        continue

    if aic < best_aic:
        best_aic = aic
        best_order = order

print(f"Best AIC: {best_aic}")
print(f"Best Order (p, d, q): {best_order}")
if failed_orders:
    print(f"Skipped {len(failed_orders)} order(s) that failed to fit: {failed_orders}")

In [None]:
# Chronological 80/20 split, order selection with auto_arima on the training
# part only, and a forecast over the held-out test period.

reference_date = df_2.index.min()
# Days elapsed since the first observation, as a column vector (plot x-axis).
X = (df_2.index - reference_date).days.values.reshape(-1, 1)
y = df_2['JPM UN Equity'].values

train_size = int(len(X)*0.8)
tests_size = len(X)-train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Only the training observations may be seen by the model. Fitting on the full
# series would leak the test period into the fit, and forecasting from it
# would predict dates AFTER the data instead of the test window.
train_series = df_2['JPM UN Equity'].iloc[:train_size]

# Use auto_arima to find the best ARIMA model
model = auto_arima(train_series, seasonal=True, m=12, suppress_warnings=True, stepwise=True)

# Get the selected order from auto_arima
# NOTE(review): only the non-seasonal order is carried over to the statsmodels
# refit below; any seasonal order auto_arima selected is not used there.
order = model.get_params()['order']

# Train ARIMA model with the selected order
arima_model = ARIMA(train_series, order=order)
arima_fit = arima_model.fit()

# Forecast the test period with the trained ARIMA model
forecast = arima_fit.forecast(steps=len(X_test))

# Evaluate the model using AIC
aic = arima_fit.aic

# In-sample fit for the training period, out-of-sample forecast for the test period.
y_train_pred = model.predict_in_sample()
y_test_pred = model.predict(n_periods=len(X_test))



In [None]:
# Report the selected order and AIC, then draw the price trend and the
# train / test / predicted scatter as two separate figures.
print(f"Selected ARIMA Order: {order}", f"AIC: {aic}", sep="\n")

# Figure 1: price trend on the current (implicitly created) axes.
trend_ax = plt.gca()
trend_ax.plot(year, stock_price)
trend_ax.set_title('stock_price prediction')
trend_ax.set_xlabel('year')
trend_ax.set_ylabel('stock_price')
plt.xticks(rotation=360, ha='right')

# Figure 2: observed train/test points against the model's predictions.
plt.figure(figsize=(12, 6))
scatter_ax = plt.gca()
for xs, ys, series_label, series_color in (
    (X_train, y_train, 'Train', 'red'),
    (X_test, y_test, 'Test', 'green'),
    (X_test, y_test_pred, 'Predicted', 'orange'),
):
    scatter_ax.scatter(xs, ys, label=series_label, alpha=0.3, color=series_color)
scatter_ax.set_title('Stock Price Prediction with Auto ARIMA')
scatter_ax.set_xlabel('Date')
scatter_ax.set_ylabel('Stock Price')
scatter_ax.legend()
plt.show()

#### <u>Review of the Original Dataset for BAC</u>

In [None]:
# Load the raw price table for this section from the working directory
# and preview its first ten rows.
df_1 = pd.read_csv('stockprice_four.csv')
df_1.head(10)

In [None]:

# NOTE(review): this cell repeats the CSV load and preview of the cell directly
# above it; one of the two can be removed.
df_1 = pd.read_csv('stockprice_four.csv')
df_1.head(10)

In [None]:
# Build the date-indexed working frame for this section without modifying df_1.
# A plain `df_2 = df_1` only creates a second name for the same frame, so
# writing the parsed 'Dates' column through df_2 would change df_1 as well.
df_2 = (
    df_1
    .assign(Dates=pd.to_datetime(df_1['Dates'], format='%d/%m/%Y'))  # day/month/year strings -> timestamps
    .reset_index(drop=True)
    .set_index('Dates')  # 'Dates' becomes the index of the DataFrame
)

# pd.to_datetime followed by set_index always produces a DatetimeIndex, so the
# former "if the index is not a DatetimeIndex, force freq='D'" fallback could
# never run and has been removed.



In [None]:
# True when at least one cell anywhere in df_2 is missing, otherwise False.
df_2.isna().to_numpy().any()

# Visualisation 

In [None]:
# Line chart of the BAC price against its date index.
# `year` and `stock_price` are reused by later cells in this section.
year = df_2.index
stock_price = df_2['BAC UN EQUITY']

trend_fig, trend_ax = plt.subplots(figsize=(25, 8))
trend_ax.plot(year, stock_price)
trend_ax.set_title('stock_price prediction')
trend_ax.set_xlabel('year')
trend_ax.set_ylabel('stock_price')
plt.xticks(rotation=360, ha='right')

plt.show()

# Augmented Dickey-Fuller (ADF) Test

In [None]:
# Augmented Dickey-Fuller test on the price series.
# Null hypothesis: the series has a unit root (is non-stationary).
result = adfuller(stock_price)

# Positions 0, 1 and 4 of the result tuple hold the statistic, the p-value
# and the dictionary of critical values.
adf_statistic, p_value, critical_values = result[0], result[1], result[4]

print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')
print('Critical Values:')
for level in critical_values:
    print(f'   {level}: {critical_values[level]}')

# Decide at the 5% significance level.
verdict = (
    "Reject the null hypothesis. The data is likely stationary."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis. The data may not be stationary."
)
print(verdict)


# Autocorrelation Function

In [None]:

# Correlograms of the price series for the first 30 lags:
# the ACF first, then the PACF, each shown as its own figure.
for draw_correlogram, heading in (
    (plot_acf, 'Autocorrelation Function (ACF)'),
    (plot_pacf, 'Partial Autocorrelation Function (PACF)'),
):
    draw_correlogram(stock_price, lags=30, title=heading)
    plt.show()


# Moving average

In [None]:
def moving_average(data, window_size):
    """Smooth `data` with an equal-weight window of `window_size` points.

    The window is applied with ``np.convolve`` in ``'same'`` mode, so the
    result has as many points as the input; near both ends the window
    overlaps implicit zeros, which pulls those values toward zero.

    Args:
        data: one-dimensional sequence of numbers.
        window_size: number of points in the window (truncated to an int
            for the window length).

    Returns:
        numpy.ndarray holding the smoothed values.
    """
    n_taps = int(window_size)
    kernel = np.full(n_taps, 1.0) / float(window_size)
    return np.convolve(data, kernel, mode='same')

window_size = 30  # size of the moving window, in observations

ma = moving_average(stock_price, window_size)
print(ma)

# Plot the smoothed series together with the raw prices. Labels and a legend
# are required here, otherwise the two lines cannot be told apart.
# NOTE: moving_average uses np.convolve(..., 'same'), so roughly the first and
# last window_size/2 points of `ma` are pulled toward zero by the padding.
plt.figure(figsize=(24, 8))
plt.plot(year, ma, label=f'{window_size}-point Moving Average')
plt.plot(year, stock_price, label='Stock Price')
plt.title('Stock Price Moving Average')
plt.xlabel('Dates')
plt.ylabel('Stock Price')
plt.legend()
plt.show()



# Decomposition

In [None]:
# Additive decomposition of the BAC series with a period of 365 observations.
decomposition = seasonal_decompose(df_2['BAC UN EQUITY'], model='additive', period=365)

# One stacked panel per component, top to bottom.
fig, ax = plt.subplots(4, 1, figsize=(10, 8))
component_panels = (
    ('Original Series', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonality', decomposition.seasonal),
    ('Residuals', decomposition.resid),
)
for panel_ax, (panel_title, component) in zip(ax, component_panels):
    panel_ax.plot(df_2.index, component)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Hold-out evaluation: fit Auto ARIMA on the first 80% of the rows (order kept,
# no shuffling) and score its forecast on the remaining 20%.
target_column = 'BAC UN EQUITY'
train_data, test_data = train_test_split(df_2, shuffle=False, test_size=0.2)

# Order selection and fitting on the training rows only.
model = auto_arima(train_data[target_column], seasonal=True, m=12, suppress_warnings=True)

# Forecast exactly as many steps as there are test rows.
horizon = len(test_data)
forecast = model.predict(n_periods=horizon)

# Mean Squared Error between the held-out prices and the forecast.
mse = mean_squared_error(test_data[target_column], forecast)
print(f"Mean Squared Error (MSE): {mse}")


# Auto ARIMA

In [None]:
import pandas as pd
from pmdarima import auto_arima

# Let auto_arima search for the best model on the full BAC series
# (seasonal search enabled, season length m=12).
bac_series = df_2['BAC UN EQUITY']
model = auto_arima(bac_series, seasonal=True, m=12, suppress_warnings=True)

# Non-seasonal (p, d, q) order of the selected model.
selected_order = model.order
print(f"Selected ARIMA Order: {selected_order}")

# AIC

In [None]:
# First-order differencing of the BAC series, shown next to the original.

# Difference between each observation and the one before it.
df_2['DifferencedBAC UN EQUITY'] = df_2['BAC UN EQUITY'].diff()

# One figure per entry: (column, title, y-axis label, legend label, extra styling).
for panel_column, panel_title, panel_ylabel, panel_label, panel_style in (
    ('BAC UN EQUITY', 'Original BAC UN EQUITY', 'BAC UN EQUITY',
     'Original Data', {}),
    ('DifferencedBAC UN EQUITY', 'Differenced BAC UN EQUITY', 'Differenced BAC UN EQUITY',
     'Differenced Data', {'color': 'orange'}),
):
    plt.figure(figsize=(10, 5))
    plt.plot(df_2.index, df_2[panel_column], label=panel_label, marker='o', **panel_style)
    plt.title(panel_title)
    plt.xlabel('Dates')
    plt.ylabel(panel_ylabel)
    plt.legend()
    plt.show()


In [None]:
# Bare expression: the notebook renders the first-differenced BAC column.
df_2['DifferencedBAC UN EQUITY']

# Second-Order Differencing

In [None]:
# Compare the original BAC series with its first- and second-order differences.
# The first-differenced column must already exist in df_2 when this cell runs.

# Second-order difference: difference the already differenced series once more.
df_2['SecondDifferencedBAC UN EQUITY'] = df_2['DifferencedBAC UN EQUITY'].diff()

# One entry per figure:
# (column, figure title, y-axis label, legend label, extra line styling).
bac_diff_panels = [
    ('BAC UN EQUITY',
     'Original BAC UN EQUITY', 'BAC UN EQUITY',
     'Original Data', {}),
    ('DifferencedBAC UN EQUITY',
     '1st Differenced BAC UN EQUITY', '1st Differenced BAC UN EQUITY',
     '1st Differenced Data', {'color': 'orange'}),
    ('SecondDifferencedBAC UN EQUITY',
     '2nd Differenced BAC UN EQUITY', '2nd Differenced BAC UN EQUITY',
     '2nd Differenced Data', {'color': 'green'}),
]

for panel_column, panel_title, panel_ylabel, panel_label, panel_style in bac_diff_panels:
    plt.figure(figsize=(10, 5))
    plt.plot(df_2.index, df_2[panel_column], label=panel_label, marker='o', **panel_style)
    plt.title(panel_title)
    plt.xlabel('Dates')
    plt.ylabel(panel_ylabel)
    plt.legend()
    plt.show()


In [None]:
# ARIMA (AutoRegressive Integrated Moving Average) model:
# fit a fixed-order model and overlay its in-sample predictions on the data.

# Fit ARIMA model
order = (0, 1, 0)  # (p, d, q); replace with the order chosen for this series
model = ARIMA(df_2['BAC UN EQUITY'], order=order)
result = model.fit()

# In-sample predictions over the whole date range. dynamic=False means every
# prediction is made from the actual observations that precede it.
predictions = result.predict(start=df_2.index.min(), end=df_2.index.max(), dynamic=False)

# Draw the observations and the predictions on ONE figure so they can be
# compared directly and both appear in the legend.
plt.figure(figsize=(24, 8))
plt.plot(df_2.index, df_2['BAC UN EQUITY'], label='Original Data')

# NOTE(review): the first `d` in-sample predictions of a differenced model have
# no earlier level to build on (statsmodels reports them near 0), so they are
# left off the plot to keep the y-axis readable — confirm this is acceptable.
n_skip = order[1]
plt.plot(predictions.index[n_skip:], predictions.iloc[n_skip:], color='red', label='ARIMA Predictions')

plt.title('Stock Price Prediction')
plt.xlabel('Year')
plt.ylabel('Stock Price')
plt.legend()
plt.show()

In [None]:
# Grid search over ARIMA (p, d, q) orders, keeping the order with the lowest AIC.
# NOTE(review): AIC values of models with different d are computed on
# differently differenced data; treat cross-d comparisons with care.

# Candidate values for p, d and q
p_values = [0, 1, 2]
d_values = [0, 1]
q_values = [0, 1, 2]

# Best result found so far
best_aic = float("inf")
best_order = None

# Orders that could not be fitted, with the reason, so failures stay visible.
failed_orders = []

for p, d, q in itertools.product(p_values, d_values, q_values):
    order = (p, d, q)
    try:
        model = ARIMA(df_2['BAC UN EQUITY'], order=order)
        result = model.fit()
        aic = result.aic
    except Exception as exc:
        # `Exception` (not a bare except) lets a kernel interrupt stop the search.
        failed_orders.append((order, str(exc)))
        continue

    if aic < best_aic:
        best_aic = aic
        best_order = order

print(f"Best AIC: {best_aic}")
print(f"Best Order (p, d, q): {best_order}")
if failed_orders:
    print(f"Skipped {len(failed_orders)} order(s) that failed to fit: {failed_orders}")

In [None]:
# Chronological 80/20 split, order selection with auto_arima on the training
# part only, and a forecast over the held-out test period.

reference_date = df_2.index.min()
# Days elapsed since the first observation, as a column vector (plot x-axis).
X = (df_2.index - reference_date).days.values.reshape(-1, 1)
y = df_2['BAC UN EQUITY'].values

train_size = int(len(X)*0.8)
tests_size = len(X)-train_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Only the training observations may be seen by the model. Fitting on the full
# series would leak the test period into the fit, and forecasting from it
# would predict dates AFTER the data instead of the test window.
train_series = df_2['BAC UN EQUITY'].iloc[:train_size]

# Use auto_arima to find the best ARIMA model
model = auto_arima(train_series, seasonal=True, m=12, suppress_warnings=True, stepwise=True)

# Get the selected order from auto_arima
# NOTE(review): only the non-seasonal order is carried over to the statsmodels
# refit below; any seasonal order auto_arima selected is not used there.
order = model.get_params()['order']

# Train ARIMA model with the selected order
arima_model = ARIMA(train_series, order=order)
arima_fit = arima_model.fit()

# Forecast the test period with the trained ARIMA model
forecast = arima_fit.forecast(steps=len(X_test))

# Evaluate the model using AIC
aic = arima_fit.aic

# In-sample fit for the training period, out-of-sample forecast for the test period.
y_train_pred = model.predict_in_sample()
y_test_pred = model.predict(n_periods=len(X_test))



In [None]:
# Report the selected order and AIC, then draw the price trend and the
# train / test / predicted scatter as two separate figures.
print(f"Selected ARIMA Order: {order}", f"AIC: {aic}", sep="\n")

# Figure 1: price trend on the current (implicitly created) axes.
trend_ax = plt.gca()
trend_ax.plot(year, stock_price)
trend_ax.set_title('stock_price prediction')
trend_ax.set_xlabel('year')
trend_ax.set_ylabel('stock_price')
plt.xticks(rotation=360, ha='right')

# Figure 2: observed train/test points against the model's predictions.
plt.figure(figsize=(12, 6))
scatter_ax = plt.gca()
for xs, ys, series_label, series_color in (
    (X_train, y_train, 'Train', 'red'),
    (X_test, y_test, 'Test', 'green'),
    (X_test, y_test_pred, 'Predicted', 'orange'),
):
    scatter_ax.scatter(xs, ys, label=series_label, alpha=0.3, color=series_color)
scatter_ax.set_title('Stock Price Prediction with Auto ARIMA')
scatter_ax.set_xlabel('Date')
scatter_ax.set_ylabel('Stock Price')
scatter_ax.legend()
plt.show()

In [None]:
# Read the Excel workbook of bank stock prices, using 'Dates' as the index.
df = pd.read_excel('UK.US BANKS STOCK PRICES.xlsx', index_col='Dates')

# Select the stock price column to forecast: positional index 3 among the
# non-index columns, kept as an (n_rows, 1) array.
colIndex = 3
data = df.iloc[:, [colIndex]].to_numpy()


In [None]:
# Data normalization: rescale the prices to the range [0, 1].
# NOTE(review): the scaler is fitted on the whole series, including the rows
# that later form the test split — confirm that this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data)
scaled_data = scaler.transform(data)

In [None]:
# 创建时间序列数据集/Create a time series data set
def create_dataset(dataset, time_step=30):
    """Create a supervised time-series data set from a single-column array.

    Each sample is a window of `time_step` consecutive values from column 0
    of `dataset`; its target is the value that immediately follows the window.
    Every possible window is used, including the one whose target is the last
    row (the previous loop bound of ``len(dataset) - time_step - 1`` dropped it).

    Args:
        dataset: 2-D array indexed as ``dataset[row, 0]``.
        time_step: number of past values in each input window.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays. ``X`` has shape
        ``(n_samples, time_step)`` and ``Y`` has shape ``(n_samples,)``, with
        ``n_samples = len(dataset) - time_step``. When the data set is too
        short for a single window, both arrays are empty and ``X`` still has
        ``time_step`` columns.
    """
    n_samples = len(dataset) - time_step
    if n_samples <= 0:
        # Keep the 2-D shape so callers that reshape X do not fail on rank.
        return np.empty((0, time_step)), np.empty((0,))

    X = [dataset[start:start + time_step, 0] for start in range(n_samples)]
    Y = [dataset[start + time_step, 0] for start in range(n_samples)]
    return np.array(X), np.array(Y)


In [None]:
# Use the previous 260 observations as the input window for each prediction
# (the original note describes this as the last year's data).
time_step = 260
X, Y = create_dataset(scaled_data, time_step)

# Split the data set: the first 80% of the samples train, the remainder tests.
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, Y_train = X[:train_size], Y[:train_size]
X_test, Y_test = X[train_size:], Y[train_size:]

# Reshape to the format the LSTM requires: [samples, time steps, features].
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]


In [None]:
# Construct the BiLSTM model: two bidirectional LSTM layers of 50 units, each
# followed by 20% dropout, and a single-unit dense output layer.
model = Sequential([
    Bidirectional(LSTM(50, return_sequences=True), input_shape=(time_step, 1)),
    Dropout(0.2),
    Bidirectional(LSTM(50)),
    Dropout(0.2),
    Dense(1),
])

model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model. The test split is passed as validation data, so its loss is
# reported after every epoch.
history = model.fit(
    X_train,
    Y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, Y_test),
    verbose=1,
)



In [None]:
def predict_future_prices(model, last_sequence, n_future_days=260 * 2):
    """Forecast `n_future_days` steps by feeding each prediction back as input.

    Args:
        model: fitted Keras model whose ``predict`` maps an array of shape
            ``(1, window, 1)`` to a single scaled value.
        last_sequence: the most recent scaled observations, used as the first
            input window. It is not modified.
        n_future_days: number of steps to forecast.

    Returns:
        Array of shape ``(n_future_days, 1)`` with the forecasts converted
        back to price units by the notebook-level ``scaler``.
    """
    future_prices = []
    # The window length comes from the seed sequence itself, so the function
    # no longer depends on the notebook-level `time_step` variable.
    current_sequence = np.asarray(last_sequence).reshape(1, -1, 1)

    for _ in range(n_future_days):
        # verbose=0: otherwise Keras prints one progress bar per forecast step.
        future_price = model.predict(current_sequence, verbose=0)
        future_prices.append(future_price[0][0])

        # Update the sequence: drop the oldest value, append the new prediction.
        next_value = np.reshape(future_price, (1, 1, 1))
        current_sequence = np.append(current_sequence[:, 1:, :], next_value, axis=1)

    return scaler.inverse_transform(np.array(future_prices).reshape(-1, 1))


# Use the last `time_step` scaled observations as the input window
# and forecast forward from there.
last_sequence = scaled_data[-time_step:]
future_prices = predict_future_prices(model, last_sequence)


In [None]:
# Evaluate the model on the test set (loss is measured on the scaled values).
test_loss = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test Loss: {test_loss}")

# Convert predictions and targets back to price units before computing metrics.
# NOTE(review): Y_test is overwritten with unscaled prices here and the
# plotting cell relies on that. Re-running this cell without re-running the
# split applies the inverse transform twice.
test_predictions = model.predict(X_test)
test_predictions = scaler.inverse_transform(test_predictions)
Y_test = scaler.inverse_transform(Y_test.reshape(-1, 1))

# Calculate MSE and RMSE.
# np.sqrt is used because a bare `sqrt` is not among the notebook's imports.
mse = mean_squared_error(Y_test, test_predictions)
rmse = float(np.sqrt(mse))
print(f"Mean Squared Error (MSE,均方误差) on Test Set: {mse}")
print(f"Root Mean Squared Error (RMSE,均方根误差) on Test Set: {rmse}")

# Mean absolute error
mae = mean_absolute_error(Y_test, test_predictions)
print(f"Mean Absolute Error (MAE,平均绝对误差) on Test Set: {mae}")

# Mean absolute percentage error, expressed in percent
mape = np.mean(np.abs((Y_test - test_predictions) / Y_test)) * 100
print(f"Mean Absolute Percentage Error (MAPE,平均绝对百分比误差) on Test Set:{mape}%")

# Explained variance score
ev = explained_variance_score(Y_test, test_predictions)
print(f"Explained Variance Score (EV,解释方差分数) on Test Set: {ev}")

# Coefficient of determination
r2 = r2_score(Y_test, test_predictions)
print(f"R-squared (R²,决定系数) on Test Set: {r2}")


In [None]:
# Plot the historical prices together with the forecast future prices.

# The forecast starts on the first business day AFTER the last observation
# (starting on the last observed date would shift every forecast back a day).
# The number of dates follows the forecast length instead of a hard-coded 520.
future_dates = pd.date_range(
    start=pd.Timestamp(df.index[-1]) + pd.offsets.BDay(1),
    periods=len(future_prices),
    freq='B',
)

plt.figure(figsize=(12, 6))
plt.title('Stock Prices and Future Prediction')
plt.xlabel('Dates')
plt.ylabel('Prices')
plt.plot(df.index, df.iloc[:, colIndex], label='Actual Price')
plt.plot(future_dates, future_prices, label='Predicted Future Price', color='red')
plt.legend()
plt.savefig('stock_prices_future_prediction.png', format='png', dpi=200)
plt.show()

In [None]:
# Visualize the actual and the predicted prices of the test set on one chart
# and save the figure as a PNG file.
test_fig, test_ax = plt.subplots(figsize=(12, 6))
test_ax.set_title('Test Set Actual and Predicted Prices')
test_ax.set_xlabel('Time')
test_ax.set_ylabel('Price')
test_ax.plot(Y_test, label='Actual Price')
test_ax.plot(test_predictions, label='Predicted Price', color='red')
test_ax.legend()
test_fig.savefig('Test_Set_Actual_and_Predicted Prices.png', format='png', dpi=200)
plt.show()
