In [None]:
import pandas as pd
import yfinance as yf

# Download historical daily price data for AAPL and cache it as a CSV.

stock_data = yf.download('AAPL', start='2010-01-01', end='2023-01-01')

# yfinance typically reports a failed download (bad ticker, network error) by
# returning an empty frame rather than raising. Stop here instead of writing
# an empty CSV that would only fail later, when the file is read back.
if stock_data.empty:
    raise RuntimeError('No data returned for AAPL; check the ticker, date range and network connection.')

# Newer yfinance releases return two-level (field, ticker) columns even for a
# single ticker. Written as-is, the CSV gets extra header rows and no plain
# 'Date' header. Keep only the field level so the file has one header row.
# NOTE(review): assumes the default column grouping, where level 0 holds the
# field names (Open/High/Low/Close/...) - confirm for the installed version.
if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = stock_data.columns.get_level_values(0)

stock_data.to_csv('AAPL_stock_data.csv')


## Data Preprocessing
* Load the data and handle missing values if any.
* Convert the date column to datetime format and set it as the index.

In [None]:
# Read the cached CSV, parse the 'Date' strings into timestamps and promote
# that column to the index - all in one chain, so no frame is mutated in place.

data = (
    pd.read_csv('AAPL_stock_data.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Number of missing values per column (shown as the cell output)

data.isna().sum()


## Exploratory Data Analysis (EDA)
* Visualize the stock price data to understand its trends and patterns.
* Check for stationarity using statistical tests like Augmented Dickey-Fuller (ADF) test.

In [None]:
import matplotlib.pyplot as plt

# Line chart of the closing price over the whole date range

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(data['Close'])
ax.set_title('Stock Closing Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
plt.show()

# ADF test for stationarity.
# adfuller returns a tuple whose first two entries are the test statistic and
# the p-value. The null hypothesis is that the series has a unit root (is
# non-stationary), so a small p-value is evidence of stationarity.

from statsmodels.tsa.stattools import adfuller

# Drop missing closes before testing, as the test on the differenced series
# does: the preprocessing step only counts missing values, it does not remove
# them, and the test regression cannot be fitted on NaN input.
result = adfuller(data['Close'].dropna())
print('ADF Statistic:', result[0])
print('p-value:', result[1])


## Data Transformation
* If the data is not stationary, apply transformations such as differencing or log transformation to make it stationary.

In [None]:
# First-order differencing to make the data stationary.
# The first row has no predecessor, so its difference is NaN. Calling dropna()
# before the assignment would have no effect: column assignment aligns on the
# frame's index and puts the NaN straight back. The NaN is therefore removed
# at the point where the series is consumed.

data['Close_diff'] = data['Close'].diff()


# ADF test on differenced data (NaN-free view of the new column)

close_diff = data['Close_diff'].dropna()
result_diff = adfuller(close_diff)
print('ADF Statistic:', result_diff[0])
print('p-value:', result_diff[1])


## Model Building
* Split the data into training and testing sets.
* Use models like ARIMA (AutoRegressive Integrated Moving Average) or LSTM (Long Short-Term Memory) for time series forecasting.

## Using ARIMA:

In [None]:
from statsmodels.tsa.arima.model import ARIMA


# Split the differenced series chronologically: first 80% for training, the
# rest for testing. The split point is computed on the NaN-free series that is
# actually sliced, so the two parts and their dates stay consistent.
close_diff = data['Close_diff'].dropna()
train_size = int(len(close_diff) * 0.8)
train, test = close_diff.iloc[:train_size], close_diff.iloc[train_size:]


# Build and train the ARIMA model.
# d=0 because the input is already first-differenced; d=1 here would
# difference the series a second time.
# fit() of statsmodels.tsa.arima.model.ARIMA has no `disp` argument (that
# belonged to the legacy ARIMA class), so it is called without it.
model = ARIMA(train, order=(5, 0, 0))
model_fit = model.fit()


# Forecast one value per test observation.
# forecast() returns only the predictions (no (forecast, stderr, conf_int)
# tuple as in the legacy API), so the full result is kept rather than element
# [0]. The values are re-indexed onto the test dates so they line up with
# `test` for plotting and evaluation.
forecast = pd.Series(list(model_fit.forecast(steps=len(test))), index=test.index, name='Forecast')


# Plot forecast vs actual.
# Both curves are daily changes in the close (the differenced series), not
# price levels, and the labels say so.
plt.figure(figsize=(10, 5))
plt.plot(test.index, test, label='Actual')
plt.plot(test.index, forecast, label='Forecast')
plt.title('Stock Price Change Forecast')
plt.xlabel('Date')
plt.ylabel('Daily Price Change')
plt.legend()
plt.show()


## Using LSTM:

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM


# Scale the data.
# The scaler's min/max are learned from the first 80% of rows only and then
# applied to the whole series. Fitting on the full series would leak the price
# range of the held-out period into the training inputs. With the window
# length and 80/20 window split used below, every test target lies after these
# rows. As a consequence, later values may fall outside [0, 1].
close_values = data['Close'].values.reshape(-1, 1)
scaler_fit_rows = int(len(close_values) * 0.8)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(close_values[:scaler_fit_rows])
data_scaled = scaler.transform(close_values)


# Prepare the dataset for LSTM
def create_dataset(dataset, time_step=1):
    """Slice a 2-D array into overlapping input windows and next-step targets.

    Parameters
    ----------
    dataset : numpy.ndarray of shape (n_rows, n_columns)
        Source values; only column 0 is read.
    time_step : int, default 1
        Number of consecutive rows in each input window.

    Returns
    -------
    X : numpy.ndarray of shape (n_windows, time_step)
        X[i] holds rows i .. i + time_step - 1 of column 0.
    Y : numpy.ndarray of shape (n_windows,)
        Y[i] is the value at row i + time_step, i.e. the one right after X[i].

    n_windows is len(dataset) - time_step, so the final row is used as a
    target. It is 0 when the input is too short for a single window.

    Raises
    ------
    ValueError
        If time_step is smaller than 1.
    """
    if time_step < 1:
        raise ValueError('time_step must be a positive integer')

    # Every start row i with i + time_step <= len(dataset) - 1 has a target.
    n_windows = max(len(dataset) - time_step, 0)
    X = [dataset[i:i + time_step, 0] for i in range(n_windows)]
    Y = [dataset[i + time_step, 0] for i in range(n_windows)]

    # The reshape is a no-op for non-empty results. It keeps an empty result
    # two-dimensional so callers can still read X.shape[1].
    return np.array(X).reshape(-1, time_step), np.array(Y)

# Window length: each sample holds this many consecutive scaled closes
time_step = 100
X, Y = create_dataset(dataset=data_scaled, time_step=time_step)


# Chronological split of the windows: leading 80% for training, the rest for testing
train_size = int(0.8 * len(X))
train_rows = slice(0, train_size)
test_rows = slice(train_size, None)
X_train, Y_train = X[train_rows], Y[train_rows]
X_test, Y_test = X[test_rows], Y[test_rows]


# Add a trailing feature axis so the inputs are [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))


# LSTM network: two stacked 50-unit LSTM layers (the first one returns the
# full sequence so the second can consume it), then a 25-unit dense layer and
# a single-unit output
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(time_step, 1)),
    LSTM(50, return_sequences=False),
    Dense(25),
    Dense(1),
])


# Adam optimizer, mean-squared-error loss
model.compile(loss='mean_squared_error', optimizer='adam')


# One pass over the training windows, one sample per gradient step
model.fit(X_train, Y_train, epochs=1, batch_size=1)


# Predict and scale the predictions back to price units
train_predict = scaler.inverse_transform(model.predict(X_train))
test_predict = scaler.inverse_transform(model.predict(X_test))


# Plot the results.
# Window i predicts the value at row i + time_step, so the training
# predictions start at row `time_step` and the test predictions follow right
# after them. Deriving the ranges from the prediction lengths keeps x and y
# the same length for both curves.
train_start = time_step
test_start = train_start + len(train_predict)
test_end = test_start + len(test_predict)

plt.figure(figsize=(10, 5))
plt.plot(data['Close'], label='Actual')
plt.plot(data.index[train_start:test_start], train_predict, label='Train Predict')
plt.plot(data.index[test_start:test_end], test_predict, label='Test Predict')
plt.title('Stock Price Prediction')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()


## Model Evaluation
* Evaluate the model performance using metrics like Mean Absolute Error (MAE), Mean Squared Error (MSE), or Root Mean Squared Error (RMSE).

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Error metrics for `forecast` against `test`:
# MAE is the mean absolute deviation, RMSE the square root of the mean squared error
squared_error = mean_squared_error(test, forecast)
mae = mean_absolute_error(test, forecast)
rmse = np.sqrt(squared_error)

for label, value in (('Mean Absolute Error:', mae), ('Root Mean Squared Error:', rmse)):
    print(label, value)
