#Importing dataset

#EDA


In [None]:
# ----------------------------------------------------------
# 📈 APPLE STOCK PRICE PREDICTION – FINAL FIXED VERSION (NO ERRORS)
# ----------------------------------------------------------

# ✅ Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

# Deep Learning (LSTM)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# ----------------------------------------------------------
# STEP 1: Load and Prepare Dataset
# ----------------------------------------------------------
# Read the raw price table, convert the 'Date' column to datetimes, sort
# chronologically and use the dates as the index so every later slice is
# in time order.
df = pd.read_csv("P587 DATASET.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')
df.set_index('Date', inplace=True)
# Forward-fill gaps: each missing cell takes the latest earlier value.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in recent pandas releases.
df = df.ffill()

# Visual sanity check of the closing-price series.
plt.figure(figsize=(12,5))
plt.plot(df['Close'], label='Apple Close Price', color='blue')
plt.title('Apple Stock Closing Price (2012–2019)')
plt.xlabel('Date')
plt.ylabel('Close Price ($)')
plt.legend()
plt.show()

# ----------------------------------------------------------
# STEP 2: Time Series Split (90% train, 10% test)
# ----------------------------------------------------------
# Chronological hold-out: the first 90% of rows form the training series,
# the remaining rows form the test series (no shuffling).
train_size = int(0.9 * len(df))
train = df['Close'].iloc[:train_size]
test = df['Close'].iloc[train_size:]

# Draw both segments on one axis so the cut point is visible.
plt.figure(figsize=(10, 5))
plt.plot(train, label='Train Data')
plt.plot(test, color='orange', label='Test Data')
plt.title('Time Series Split: Train vs Test')
plt.legend()
plt.show()

# ----------------------------------------------------------
# STEP 3: ARIMA MODEL
# ----------------------------------------------------------
# Fit ARIMA(5,1,0) on the training closes and predict one value for every
# row of the test period.
arima_model = ARIMA(train, order=(5,1,0))
arima_fit = arima_model.fit()
# No `typ='levels'` here: that keyword belonged to the legacy ARIMA class.
# The statsmodels.tsa.arima.model results already predict on the original
# price scale and treat `typ` as an unknown keyword argument.
arima_pred = arima_fit.predict(start=len(train), end=len(train)+len(test)-1)
# Re-label the predictions with the test dates so metrics and plots line up
# with `test` row by row.
arima_pred.index = test.index

# Error metrics on the hold-out period.
mse_arima = mean_squared_error(test, arima_pred)
mae_arima = mean_absolute_error(test, arima_pred)
rmse_arima = sqrt(mse_arima)
r2_arima = r2_score(test, arima_pred)

plt.figure(figsize=(12,5))
plt.plot(train, label='Train')
plt.plot(test, label='Actual')
plt.plot(arima_pred, label='ARIMA Predicted', color='red')
plt.title('ARIMA Model Forecast')
plt.legend()
plt.show()

# ----------------------------------------------------------
# STEP 4: SARIMA MODEL
# ----------------------------------------------------------
# Fit SARIMAX with order (1,1,1) and seasonal order (1,1,1,12) on the
# training closes, then predict across the test period.
# NOTE(review): a seasonal period of 12 is applied to what looks like
# daily rows — confirm this is the intended seasonality.
sarima_model = SARIMAX(train, order=(1,1,1), seasonal_order=(1,1,1,12))
sarima_fit = sarima_model.fit(disp=False)
# No `typ='levels'` here: SARIMAX results do not define that keyword and
# already predict on the original price scale.
sarima_pred = sarima_fit.predict(start=len(train), end=len(train)+len(test)-1)
# Re-label the predictions with the test dates so metrics and plots line up
# with `test` row by row.
sarima_pred.index = test.index

# Error metrics on the hold-out period.
mse_sarima = mean_squared_error(test, sarima_pred)
mae_sarima = mean_absolute_error(test, sarima_pred)
rmse_sarima = sqrt(mse_sarima)
r2_sarima = r2_score(test, sarima_pred)

plt.figure(figsize=(12,5))
plt.plot(train, label='Train')
plt.plot(test, label='Actual')
plt.plot(sarima_pred, label='SARIMA Predicted', color='green')
plt.title('SARIMA Model Forecast')
plt.legend()
plt.show()

# ----------------------------------------------------------
# STEP 5: XGBOOST MODEL
# ----------------------------------------------------------
# Build a tabular feature set (price columns, calendar parts, moving
# averages) and regress the same row's Close on it.
df_ml = df.copy()
df_ml['Day'] = df_ml.index.day
df_ml['Month'] = df_ml.index.month
df_ml['Year'] = df_ml.index.year
# Shift the rolling means by one row so each feature is built only from
# closes strictly before the row being predicted. Without the shift the
# window contains the target Close itself (target leakage).
df_ml['MA_5'] = df_ml['Close'].rolling(5).mean().shift(1)
df_ml['MA_10'] = df_ml['Close'].rolling(10).mean().shift(1)
# Drop the warm-up rows where the moving averages are still undefined.
df_ml = df_ml.dropna()

# NOTE(review): 'High', 'Low' and 'Volume' are same-row values; if the goal
# is to predict Close before the session ends, these need lagging too.
X = df_ml[['Open','High','Low','Volume','Day','Month','Year','MA_5','MA_10']]
y = df_ml['Close']

# Chronological 90/10 split (no shuffling).
split = int(len(X)*0.9)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Scaler statistics come from the training rows only and are then applied
# to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train_scaled, y_train)
xgb_pred = xgb_model.predict(X_test_scaled)

# Error metrics on the hold-out rows.
mse_xgb = mean_squared_error(y_test, xgb_pred)
mae_xgb = mean_absolute_error(y_test, xgb_pred)
rmse_xgb = sqrt(mse_xgb)
r2_xgb = r2_score(y_test, xgb_pred)

plt.figure(figsize=(12,5))
plt.plot(y_test.index, y_test.values, label='Actual')
plt.plot(y_test.index, xgb_pred, label='XGBoost Predicted', color='blue')
plt.title('XGBoost Model Forecast')
plt.legend()
plt.show()

# ----------------------------------------------------------
# STEP 6: LSTM MODEL
# ----------------------------------------------------------
# Scale the closing prices and split them chronologically 90/10.
data = df[['Close']]
train_size = int(len(data) * 0.9)

# Fit the scaler on the training rows only, then apply it to the whole
# series. Fitting on all rows would let the test period's min/max shape the
# training inputs (look-ahead leakage). Test values may therefore fall
# outside [0, 1].
scaler_lstm = MinMaxScaler(feature_range=(0,1))
scaler_lstm.fit(data.iloc[:train_size])
scaled_data = scaler_lstm.transform(data)

# NOTE: test_data starts exactly at the split, so the windowing step that
# follows uses its first `time_step` rows as look-back context only; the
# LSTM is scored on a shorter span than the other models.
train_data, test_data = scaled_data[:train_size], scaled_data[train_size:]

def create_dataset(dataset, time_step=60):
    """Slice a series into supervised (look-back window, next value) pairs.

    Args:
        dataset: 2-D array-like indexed as ``dataset[row, 0]``; only
            column 0 is read.
        time_step: Number of consecutive rows in each input window.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays. ``X`` has shape
        ``(n_samples, time_step)`` and ``Y`` has shape ``(n_samples,)``,
        where ``n_samples = max(len(dataset) - time_step, 0)``. Row ``i``
        of ``X`` holds ``dataset[i:i + time_step, 0]`` and ``Y[i]`` is the
        value immediately after that window.
    """
    # Every start index whose window AND following target both fit in the
    # data is used. The previous bound (`- time_step - 1`) silently dropped
    # the final valid sample.
    n_samples = len(dataset) - time_step
    if n_samples <= 0:
        # Keep X two-dimensional so callers can still reshape it for Keras.
        return np.empty((0, time_step)), np.empty((0,))

    X = np.array([dataset[i:i + time_step, 0] for i in range(n_samples)])
    Y = np.array([dataset[i + time_step, 0] for i in range(n_samples)])
    return X, Y

# Window length: each sample is 60 consecutive scaled closes, and the
# target is the value right after the window.
time_step = 60
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

# Add a trailing axis of size 1 so the arrays are 3-D, matching the
# input_shape declared on the first LSTM layer below.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

# Two stacked 50-unit LSTM layers followed by a single-output dense layer.
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop once the training loss has not improved for 5 epochs and restore
# the best weights seen.
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    verbose=0,
    callbacks=[early_stop],
)

# Predict on the test windows, then map predictions and targets back to
# the original price scale before scoring.
lstm_pred = model.predict(X_test)
lstm_pred = scaler_lstm.inverse_transform(lstm_pred)
y_test_actual = scaler_lstm.inverse_transform(y_test.reshape(-1, 1))

mse_lstm = mean_squared_error(y_test_actual, lstm_pred)
rmse_lstm = sqrt(mse_lstm)
mae_lstm = mean_absolute_error(y_test_actual, lstm_pred)
r2_lstm = r2_score(y_test_actual, lstm_pred)

plt.figure(figsize=(12, 5))
plt.plot(y_test_actual, label='Actual')
plt.plot(lstm_pred, color='purple', label='LSTM Predicted')
plt.title('LSTM Model Forecast')
plt.legend()
plt.show()

# ----------------------------------------------------------
# STEP 7: MODEL COMPARISON TABLE
# ----------------------------------------------------------
# One row per model, one column per metric, rounded to 4 decimals.
results_df = pd.DataFrame(
    [
        ('ARIMA', mse_arima, mae_arima, rmse_arima, r2_arima),
        ('SARIMA', mse_sarima, mae_sarima, rmse_sarima, r2_sarima),
        ('XGBoost', mse_xgb, mae_xgb, rmse_xgb, r2_xgb),
        ('LSTM', mse_lstm, mae_lstm, rmse_lstm, r2_lstm),
    ],
    columns=['Model', 'MSE', 'MAE', 'RMSE', 'R²'],
).round(4)

print("\n📊 Model Performance Comparison:\n")
display(results_df)

# The winner is the model whose (rounded) RMSE is smallest.
best_model = results_df.at[results_df['RMSE'].idxmin(), 'Model']
print(f"\n🏆 Best Model: {best_model}")

# Bar chart of RMSE per model.
plt.figure(figsize=(10, 5))
plt.bar(
    results_df['Model'],
    results_df['RMSE'],
    color=['red', 'green', 'blue', 'purple'],
)
plt.title('Model RMSE Comparison (Lower = Better)')
plt.ylabel('RMSE')
plt.show()

# ----------------------------------------------------------
# STEP 8: FUTURE FORECAST (BEST MODEL)
# ----------------------------------------------------------
# Produce a forecast from whichever model had the lowest RMSE and plot it
# after the historical closing prices.
forecast_horizon = 30  # number of future steps to predict

if best_model == 'SARIMA':
    final_model = sarima_fit
    future_forecast = final_model.forecast(steps=forecast_horizon)
elif best_model == 'ARIMA':
    final_model = arima_fit
    future_forecast = final_model.forecast(steps=forecast_horizon)
elif best_model == 'XGBoost':
    # NOTE(review): this re-predicts the last rows of the test set; it is
    # not a true out-of-sample forecast, because no future feature rows
    # (Open/High/Low/Volume/...) exist to feed the model.
    last_data = X_test_scaled[-forecast_horizon:]
    future_forecast = xgb_model.predict(last_data)
else:
    # Recursive LSTM forecast: start from the last `time_step` scaled
    # closes and feed every prediction back in as the newest input.
    temp_input = list(scaled_data[-time_step:, 0])
    lst_output = []
    for _ in range(forecast_horizon):
        X_input = np.array(temp_input[-time_step:]).reshape(1, time_step, 1)
        yhat = model.predict(X_input, verbose=0)
        temp_input.append(yhat[0][0])
        lst_output.append(yhat[0][0])
    # Map the scaled predictions back to prices.
    future_forecast = scaler_lstm.inverse_transform(np.array(lst_output).reshape(-1,1)).flatten()

# Drop any index the forecast carries (statsmodels returns a Series) so the
# values pair positionally with the dates generated below.
future_forecast = np.asarray(future_forecast).ravel()

# Use business days so forecast dates do not fall on weekends (market
# holidays are not excluded). Sizing by the forecast length keeps the two
# columns aligned even if fewer than `forecast_horizon` values exist.
future_dates = pd.date_range(
    df.index[-1] + pd.Timedelta(days=1),
    periods=len(future_forecast),
    freq='B',
)
forecast_df = pd.DataFrame({'Date': future_dates, 'Predicted_Close': future_forecast}).set_index('Date')

plt.figure(figsize=(12,5))
plt.plot(df['Close'], label='Historical')
plt.plot(forecast_df['Predicted_Close'], label=f'30-Day Forecast ({best_model})', color='red')
plt.title(f'Apple Stock 30-Day Forecast using {best_model}')
plt.legend()
plt.show()

print("\n📅 Next 30-Day Forecast:")
display(forecast_df.head(10))


In [None]:
# ----------------------------------------------------------
# 🧾 STEP 7: MODEL COMPARISON TABLE (FINAL TABULAR COLUMN)
# ----------------------------------------------------------
# Rebuilds the comparison table from the metric variables computed by the
# model cells and prints it as a text grid via `tabulate`.
from tabulate import tabulate

# Start from the model names, then attach one metric column at a time.
results_df = pd.DataFrame({'Model': ['ARIMA', 'SARIMA', 'XGBoost', 'LSTM']})
results_df['MSE'] = [mse_arima, mse_sarima, mse_xgb, mse_lstm]
results_df['MAE'] = [mae_arima, mae_sarima, mae_xgb, mae_lstm]
results_df['RMSE'] = [rmse_arima, rmse_sarima, rmse_xgb, rmse_lstm]
results_df['R²'] = [r2_arima, r2_sarima, r2_xgb, r2_lstm]
results_df = results_df.round(4)

# Text-grid rendering of the table, without the integer row index.
print("\n📊 MODEL PERFORMANCE COMPARISON\n")
print(tabulate(results_df, headers='keys', tablefmt='fancy_grid', showindex=False))

# The winner is the model whose (rounded) RMSE is smallest.
best_model = results_df.at[results_df['RMSE'].idxmin(), 'Model']
print(f"\n🏆 BEST MODEL BASED ON LOWEST RMSE: {best_model}")

# Bar chart of RMSE per model.
plt.figure(figsize=(10, 5))
plt.bar(
    results_df['Model'],
    results_df['RMSE'],
    color=['red', 'green', 'blue', 'purple'],
)
plt.title('Model RMSE Comparison (Lower = Better)')
plt.ylabel('RMSE')
plt.show()
