My first Machine Learning project

In [1]:
%pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.55-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py313-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.9.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     --- ------------------------------------ 0.3/3.0 MB ? eta -:--:--
     ---------- ----------------------------- 0.8/3.0 MB 1.8 MB/s eta 0:00:02
     ----------------- ---------------------- 1.3/3.0 MB 2.0 MB/s eta 0:00:01
     ---------------------------------------- 3.0/3.0 MB 2.2 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Pre

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load daily price history for a single ticker over a fixed historical window
ticker = 'AAPL'
stock = yf.download(ticker, start="2020-01-01", end="2024-01-01")

# Prepare the data: the only feature is the number of calendar days elapsed
# since the first row of the download.
stock['Date'] = stock.index
stock['Days'] = (stock['Date'] - stock['Date'].min()).dt.days
X = stock[['Days']]
y = stock['Close']

# Sequential split (first 80% for training, last 20% for testing);
# rows are not shuffled, so the test set is the most recent data.
split_idx = int(0.8 * len(stock))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out tail
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

# Predict the next 30 days. A single horizon drives both the feature rows and
# the dates, so the two always have the same length (the previous arange
# stopped one day short and produced 29 rows against 30 dates).
forecast_horizon = 30
last_day = stock['Days'].max()
future_days = pd.DataFrame({'Days': np.arange(last_day + 1, last_day + forecast_horizon + 1)})
future_prices = model.predict(future_days)

# Calendar dates matching each forecast row
future_dates = [stock['Date'].max() + pd.Timedelta(days=i) for i in range(1, forecast_horizon + 1)]

plt.figure(figsize=(12, 6))
plt.plot(stock['Date'], stock['Close'], label='Actual Prices', color='blue')
plt.plot(stock['Date'].iloc[split_idx:], y_pred, label='Predicted Prices (Test)', color='red')
plt.plot(future_dates, future_prices, label='Future Prediction', color='green', linestyle='dashed')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.title(f'{ticker} Stock Price Prediction')
plt.legend()
plt.show()

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime, timedelta

def load_stock_data(ticker, start_date, end_date):
    """Download price history for `ticker` between `start_date` and `end_date`.

    Returns the frame produced by `yf.download`. If the download raises, or
    the returned frame is empty, the error is printed and None is returned
    instead, so callers must check for None.
    """
    result = None
    try:
        downloaded = yf.download(ticker, start=start_date, end=end_date)
        if downloaded.empty:
            # Routed through the same handler as download failures below.
            raise ValueError("No data returned for the given ticker and date range")
        result = downloaded
    except Exception as e:
        print(f"Error loading data: {e}")
    return result

def prepare_features(data):
    """Build the model inputs from a date-indexed price frame.

    Returns a `(features, target)` pair. `features` has two columns: 'Days',
    the whole days elapsed since the earliest index value, and 'Date', a copy
    of the index. `target` is the 'Close' column. `data` itself is not
    modified.
    """
    elapsed_days = (data.index - data.index.min()).days
    # assign() works on a copy, so the caller's frame keeps its columns.
    enriched = data.assign(Days=elapsed_days, Date=data.index)
    features = enriched[['Days', 'Date']]
    target = enriched['Close']
    return features, target

def time_series_split(X, y, test_size=0.2):
    """Split `X` and `y` by position, keeping row order (no shuffling).

    The first `int(len(X) * (1 - test_size))` rows become the training part
    and the remaining rows the test part. Returns the tuple
    `(X_train, X_test, y_train, y_test)`.
    """
    cutoff = int(len(X) * (1 - test_size))
    head = slice(None, cutoff)
    tail = slice(cutoff, None)
    return X.iloc[head], X.iloc[tail], y.iloc[head], y.iloc[tail]

def predict_future(model, last_day, days_to_predict=30):
    """Predict values for the days that follow `last_day`.

    Args:
        model: Fitted estimator exposing `predict`; it is given one column
            holding the day numbers `last_day + 1 .. last_day + days_to_predict`.
        last_day: Day number of the last known observation.
        days_to_predict: How many consecutive days to forecast.

    Returns:
        Whatever `model.predict` returns for those day numbers.
    """
    future_days = np.arange(last_day + 1, last_day + days_to_predict + 1).reshape(-1, 1)

    # A model fitted on a DataFrame records its column names in
    # `feature_names_in_`; scikit-learn warns when such a model is then given
    # a bare array. Reuse the recorded name so the input matches the fit.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None and len(feature_names) == 1:
        future_days = pd.DataFrame(future_days, columns=list(feature_names))

    return model.predict(future_days)

# Configuration
TICKER = 'AAPL'
START_DATE = "2020-01-01"
END_DATE = datetime.now().strftime("%Y-%m-%d")  # "today": results change from run to run
FORECAST_DAYS = 30  # single horizon shared by the forecast values and their dates

# Load and prepare data
stock_data = load_stock_data(TICKER, START_DATE, END_DATE)
if stock_data is None:
    # load_stock_data has already printed the cause. Raise rather than call
    # exit(), so the cell stops with a visible error and the kernel stays up.
    raise RuntimeError(f"Could not load price data for {TICKER}")

X, y = prepare_features(stock_data)
# Only 'Days' is a model feature; 'Date' stays in X for plotting.
X_train, X_test, y_train, y_test = time_series_split(X[['Days']], y)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate model
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print("Model Performance:")
print(f"Train R²: {r2_score(y_train, train_pred):.2f}")
print(f"Test R²: {r2_score(y_test, test_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, test_pred):.2f}")

# Generate future predictions
future_prices = predict_future(model, X['Days'].max(), FORECAST_DAYS)
future_dates = pd.date_range(
    start=X['Date'].max() + timedelta(days=1),
    periods=FORECAST_DAYS
)

# Regression line over the whole history, for visual reference
regression_line = model.predict(X[['Days']])

# X_test was split from X[['Days']] and has no 'Date' column, so take the
# matching dates from X by position: the test rows are the tail after the
# training rows.
test_dates = X['Date'].iloc[len(X_train):]

# Visualization
plt.figure(figsize=(14, 7))
plt.plot(X['Date'], y, label='Historical Prices', color='blue', alpha=0.7)
plt.plot(test_dates, test_pred, label='Test Predictions', color='red', linewidth=2)
plt.plot(future_dates, future_prices, label='30-Day Forecast',
         color='green', linestyle='--', marker='o', markersize=4)
plt.plot(X['Date'], regression_line, color='orange',
         linestyle=':', label='Regression Trend')

plt.title(f"{TICKER} Stock Price Prediction\nLinear Regression Model", fontsize=14)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Price (USD)", fontsize=12)
plt.grid(alpha=0.3)
# Legend and layout come last so they account for every plotted line.
plt.legend()
plt.tight_layout()
plt.show()