In [None]:
Stock Price Prediction using Machine Learning

Problem Statement:
The goal is to predict the next day's closing price of a given stock (AAPL) from its historical price and volume data using machine learning models.

Objectives:
- Use `yfinance` to collect stock data.
- Preprocess the data and generate features.
- Train and evaluate Linear Regression and Random Forest models.
- Visualize predictions.
- Predict the next closing price based on the latest market data.


In [None]:
import yfinance as yf
import pandas as pd
import numpy as np

# Configuration: ticker symbol and the historical window used for training.
stock_symbol = "AAPL"
start_date = "2020-01-01"
end_date = "2023-12-31"

# Download daily price/volume history for the configured symbol (needs network).
print("Downloading stock data...")
data = yf.download(stock_symbol, start=start_date, end=end_date)

# yf.download may return two-level (field, ticker) columns even for a single
# symbol (the default in newer yfinance releases). Collapse them to the field
# level so data['Close'] is a Series and the models are fitted on plain string
# feature names. This is a no-op when the columns are already flat.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# Fail early with a clear message instead of an opaque error in the split below.
if data.empty:
    raise ValueError(
        f"No data returned for {stock_symbol!r} between {start_date} and {end_date}."
    )

# Create next-day target variable: each row is paired with the following
# session's close. shift(-1) leaves the final row without a target, so dropna()
# removes it (along with any other row containing missing values).
data['Next Close'] = data['Close'].shift(-1)
data = data.dropna()

# Select features and target
features = ['Open', 'High', 'Low', 'Volume']
X = data[features]
y = data['Next Close']

# Split into train and test sets. shuffle=False keeps chronological order, so
# the test set is the last 20% of rows and no later sessions leak into training.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [None]:
import matplotlib.pyplot as plt

# Visualize the Closing Price
# Build the title from the configured symbol and the years actually present in
# the data, so it stays accurate when stock_symbol or the date range changes.
first_year = data.index.min().year
last_year = data.index.max().year
year_span = str(first_year) if first_year == last_year else f'{first_year}-{last_year}'

plt.figure(figsize=(12, 6))
plt.plot(data.index, data['Close'], label='Closing Price')
plt.title(f'{stock_symbol} Closing Price ({year_span})')
plt.xlabel('Date')
plt.ylabel('Price ($)')
plt.grid(True)
plt.legend()
plt.show()

# Show first few rows
data.head()


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def _fit_and_score(estimator, progress_message, X_fit, y_fit, X_eval, y_eval):
    """Fit `estimator` on the training split and evaluate it on the held-out split.

    Prints `progress_message` before fitting. Returns a tuple
    (predictions, rmse, r2) where rmse is the root of the mean squared error on
    the evaluation split and r2 is the estimator's own `.score()` on that split.
    """
    print(progress_message)
    estimator.fit(X_fit, y_fit)
    predictions = estimator.predict(X_eval)
    rmse = np.sqrt(mean_squared_error(y_eval, predictions))
    r2 = estimator.score(X_eval, y_eval)
    return predictions, rmse, r2


# Linear Regression baseline
lr_model = LinearRegression()
lr_pred, lr_rmse, lr_r2 = _fit_and_score(
    lr_model, "Training Linear Regression model...",
    X_train, y_train, X_test, y_test,
)

# Random Forest (fixed seed so repeated runs give the same forest)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pred, rf_rmse, rf_r2 = _fit_and_score(
    rf_model, "Training Random Forest model...",
    X_train, y_train, X_test, y_test,
)

# Side-by-side comparison of both models on the held-out split
print("\nModel Performance Comparison:")
print(f"{'Metric':<15} {'Linear Regression':<20} {'Random Forest':<20}")
print(f"{'RMSE':<15} {lr_rmse:<20.2f} {rf_rmse:<20.2f}")
print(f"{'R-squared':<15} {lr_r2:<20.4f} {rf_r2:<20.4f}")


In [None]:
# One figure per model: actual next-day close versus the model's prediction
# over the test period. Linear Regression is drawn first, then Random Forest.
for chart_title, predicted_values, predicted_color in (
    ('Linear Regression Predictions', lr_pred, 'orange'),
    ('Random Forest Predictions', rf_pred, 'green'),
):
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set_title(chart_title)
    ax.plot(y_test.index, y_test, label='Actual Price', color='blue')
    ax.plot(y_test.index, predicted_values, label='Predicted', color=predicted_color)
    ax.set_xlabel('Date')
    ax.set_ylabel('Price ($)')
    ax.legend()
    ax.grid(True)
    plt.show()


In [None]:
# Rank the input features by the importance scores of the fitted Random Forest.
importance_rows = list(zip(features, rf_model.feature_importances_))
rf_importance = pd.DataFrame(importance_rows, columns=['Feature', 'Importance'])
rf_importance = rf_importance.sort_values(by='Importance', ascending=False)

print("\nRandom Forest Feature Importance:")
rf_importance


In [None]:
# Predict the next session's close from the most recent complete trading day,
# using the Random Forest fitted earlier in the notebook.
print("\nFetching latest market data for prediction...")
latest_data = yf.download(stock_symbol, period='5d')

# yf.download may return two-level (field, ticker) columns even for a single
# symbol (the default in newer yfinance releases). Collapse them to the field
# level so each lookup below yields a scalar rather than a one-element Series.
if isinstance(latest_data.columns, pd.MultiIndex):
    latest_data.columns = latest_data.columns.get_level_values(0)

# Drop sessions with missing inputs so the model never receives NaN values.
if not latest_data.empty:
    latest_data = latest_data.dropna(subset=features + ['Close'])

if not latest_data.empty:
    latest_day = latest_data.iloc[-1]

    # Single-row frame with the same columns, in the same order, as the
    # training features.
    prediction_input = latest_data[features].iloc[[-1]]

    next_day_pred = rf_model.predict(prediction_input)
    current_close = float(latest_day['Close'])

    print("\nNext Trading Day Prediction:")
    print(f"Current Close: ${current_close:.2f}")
    print(f"Predicted Next Close: ${next_day_pred[0]:.2f}")

    # Bar Plot
    plt.figure(figsize=(8, 5))
    plt.bar(['Current Close', 'Predicted Next Close'],
            [current_close, next_day_pred[0]],
            color=['blue', 'green'])
    plt.title('Current vs Predicted Closing Price')
    plt.ylabel('Price ($)')
    plt.grid(axis='y')
    plt.show()
else:
    print("Could not fetch latest market data.")


In [None]:
# Export dataset as CSV
data.to_csv('stock_data.csv')

# Only for Google Colab users: offer the file as a browser download.
# Only a failed import means "not in Colab"; a failure inside files.download()
# is a real error and is allowed to surface instead of being mislabelled.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab. File saved locally.")
else:
    print("\nDownloading stock_data.csv...")
    files.download('stock_data.csv')
