<a href="https://colab.research.google.com/github/Coperr/amd-stock-ml-project/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import platform
from pathlib import Path
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import cross_val_predict
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection

In [1]:
# Colab-only upload step. Outside Google Colab the `google.colab` package is
# not importable, so skip the upload and expect the CSV to already sit in the
# working directory instead of failing the whole notebook on this cell.
try:
    from google.colab import files
except ImportError:
    uploaded = None  # not running on Colab; nothing was uploaded
else:
    uploaded = files.upload()

Saving AMD-Stock-Price-History.csv to AMD-Stock-Price-History.csv


In [3]:
# Raw price history exactly as exported; text-encoded columns are converted
# in the feature-engineering cell further down.
DATA_FILE = Path("AMD-Stock-Price-History.csv")
data = pd.read_csv(DATA_FILE, low_memory=False)

In [4]:
# Show the raw frame (rendered truncated) to inspect the columns: at this
# point 'Vol.' and 'Change %' are still text such as "25.94M" and "1.42%".
data

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,12/04/2024,143.99,142.96,144.12,141.18,25.94M,1.42%
1,12/03/2024,141.98,142.58,143.45,141.08,22.67M,-0.06%
2,12/02/2024,142.06,137.91,142.82,137.80,33.10M,3.56%
3,11/29/2024,137.18,136.24,138.59,135.78,16.09M,0.69%
4,11/27/2024,136.24,137.20,137.94,132.96,30.18M,-1.07%
...,...,...,...,...,...,...,...
9027,02/06/1989,4.69,4.69,4.69,4.56,589.80K,-1.26%
9028,02/03/1989,4.75,4.75,4.81,4.62,1.57M,1.28%
9029,02/02/1989,4.69,4.69,4.69,4.50,838.20K,4.22%
9030,02/01/1989,4.50,4.50,4.56,4.50,682.40K,-1.32%


***DATA PROCESSING***

Functions used in feature engineering/selection:

In [5]:
def convert_volume(volume_str):
    """Convert an abbreviated volume string such as ``"25.94M"`` to a float.

    Parameters
    ----------
    volume_str : str or any
        Volume text with an optional ``K`` / ``M`` / ``B`` suffix (case
        insensitive) and optional thousands separators. Anything that is not
        a string is returned unchanged, so the function is safe to re-apply
        to an already converted column.

    Returns
    -------
    float or the original object
        The numeric volume. Blank strings and a lone ``"-"`` placeholder are
        mapped to ``NaN`` instead of raising.

    Raises
    ------
    ValueError
        If the text is neither a placeholder nor a parsable number.
    """
    if not isinstance(volume_str, str):
        return volume_str

    text = volume_str.replace(',', '').strip().upper()
    # Treat empty cells / dash placeholders as "no volume reported" rather
    # than letting float() blow up on them.
    if text in ('', '-'):
        return float('nan')

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    suffix = text[-1]
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)

def convert_change_percentage(data):
    """Rewrite the ``'Change %'`` column as fractional floats, in place.

    Text values such as ``"1.42%"`` become ``0.0142``. A numeric column whose
    values fall outside ``[-1, 1]`` is assumed to hold percentage points and
    is divided by 100; a numeric column already inside that range is left
    untouched, which makes repeated calls harmless.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Change %'`` column. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``data`` has no ``'Change %'`` column.
    """
    if 'Change %' not in data.columns:
        raise ValueError("DataFrame must contain a 'Change %' column.")

    change = data['Change %']
    # Decide on "is it numeric?" instead of "is it object?": pandas string
    # dtypes are not `object`, and comparing their max()/min() to 1 raises.
    if not pd.api.types.is_numeric_dtype(change):
        data['Change %'] = change.str.replace('%', '', regex=False).astype(float) / 100
    elif change.max() > 1 or change.min() < -1:
        data['Change %'] = change / 100

def calculate_rsi(data, column='Price', window=14):
    """Relative Strength Index of ``data[column]``.

    Gains and losses are averaged with a plain rolling mean over ``window``
    rows (``min_periods=1``, so early rows use whatever history exists).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the price series, ordered oldest to newest.
    column : str
        Name of the price column.
    window : int
        Number of rows in the averaging window.

    Returns
    -------
    pandas.Series
        RSI on a 0-100 scale, aligned with ``data``'s index.
    """
    def _rolling_average(series):
        return series.rolling(window=window, min_periods=1).mean()

    price_change = data[column].diff()
    # Rows that are not a gain (including the undefined first diff) count as
    # 0 in the gain series; likewise for losses.
    up_moves = price_change.where(price_change > 0, 0)
    down_moves = -price_change.where(price_change < 0, 0)

    relative_strength = _rolling_average(up_moves) / _rolling_average(down_moves)
    return 100 - (100 / (1 + relative_strength))

def calculate_adx(data, high_col='High', low_col='Low', close_col='Price', window=14):
    """Average Directional Index computed with simple rolling means.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with high, low and close columns, ordered oldest to newest.
        It is not modified.
    high_col, low_col, close_col : str
        Names of the high, low and closing-price columns.
    window : int
        Rolling window used both for smoothing TR / DM and for averaging DX.

    Returns
    -------
    pandas.Series
        Series named ``'ADX'`` aligned with ``data``'s index; leading rows
        without a full window of history are NaN.
    """
    high = data[high_col]
    low = data[low_col]
    prev_close = data[close_col].shift(1)

    # True range: the widest of today's range and the two gaps to yesterday's close.
    gap_range = np.maximum(abs(high - prev_close), abs(low - prev_close))
    true_range = np.maximum(high - low, gap_range)

    # Directional movement: only the larger of the up-move / down-move counts,
    # and only when it is positive.
    up_move = high - high.shift(1)
    down_move = low.shift(1) - low
    plus_dm = pd.Series(np.where(up_move > down_move, np.maximum(up_move, 0), 0),
                        index=data.index)
    minus_dm = pd.Series(np.where(down_move > up_move, np.maximum(down_move, 0), 0),
                         index=data.index)

    smoothed_tr = true_range.rolling(window=window).mean()
    plus_di = (plus_dm.rolling(window=window).mean() / smoothed_tr) * 100
    minus_di = (minus_dm.rolling(window=window).mean() / smoothed_tr) * 100

    dx = (abs(plus_di - minus_di) / (plus_di + minus_di)) * 100
    return dx.rolling(window=window).mean().rename('ADX')

def calculate_bollinger_bands(data, column='Price', window=20, num_std_dev=2):
    """Add Bollinger-band columns to ``data`` in place and return it.

    Columns written: ``'Middle Band'`` (rolling mean), ``'Standard Deviation'``
    (rolling standard deviation), ``'Upper Band'`` and ``'Lower Band'``
    (middle band plus / minus ``num_std_dev`` standard deviations).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the price series; mutated by this call.
    column : str
        Name of the price column.
    window : int
        Rolling window length in rows.
    num_std_dev : float
        Band half-width expressed in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    data['Middle Band'] = data[column].rolling(window=window).mean()
    data['Standard Deviation'] = data[column].rolling(window=window).std()

    band_offset = num_std_dev * data['Standard Deviation']
    data['Upper Band'] = data['Middle Band'] + band_offset
    data['Lower Band'] = data['Middle Band'] - band_offset
    return data

def calculate_ema(data, column='Price', span=12):
    """Exponential moving average of ``data[column]``.

    Uses the recursive form (``adjust=False``): the first value seeds the
    average and each later value is blended in with weight ``2 / (span + 1)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the series, ordered oldest to newest.
    column : str
        Name of the column to smooth.
    span : int
        EMA span.

    Returns
    -------
    pandas.Series
        The smoothed series, aligned with ``data``'s index.
    """
    exponential_window = data[column].ewm(span=span, adjust=False)
    return exponential_window.mean()

def calculate_macd(data, column='Price', short_span=12, long_span=26, signal_span=9):
    """Add MACD columns to ``data`` in place and return it.

    Columns written: ``'Short EMA'``, ``'Long EMA'``, ``'MACD Line'``
    (short EMA minus long EMA) and ``'Signal Line'`` (EMA of the MACD line).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the price series; mutated by this call.
    column : str
        Name of the price column.
    short_span, long_span : int
        Spans of the fast and slow EMAs.
    signal_span : int
        Span of the EMA applied to the MACD line.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    fast_ema = calculate_ema(data, column=column, span=short_span)
    data['Short EMA'] = fast_ema
    slow_ema = calculate_ema(data, column=column, span=long_span)
    data['Long EMA'] = slow_ema

    macd_line = fast_ema - slow_ema
    data['MACD Line'] = macd_line
    data['Signal Line'] = macd_line.ewm(span=signal_span, adjust=False).mean()
    return data

           Date   Price    Open    High     Low        Vol.  Change %  \
9031 1989-01-31    4.56    4.56    4.56    4.44    994800.0    0.0133   
9030 1989-02-01    4.50    4.50    4.56    4.50    682400.0   -0.0132   
9029 1989-02-02    4.69    4.69    4.69    4.50    838200.0    0.0422   
9028 1989-02-03    4.75    4.75    4.81    4.62   1570000.0    0.0128   
9027 1989-02-06    4.69    4.69    4.69    4.56    589800.0   -0.0126   
...         ...     ...     ...     ...     ...         ...       ...   
4    2024-11-27  136.24  137.20  137.94  132.96  30180000.0   -0.0107   
3    2024-11-29  137.18  136.24  138.59  135.78  16090000.0    0.0069   
2    2024-12-02  142.06  137.91  142.82  137.80  33100000.0    0.0356   
1    2024-12-03  141.98  142.58  143.45  141.08  22670000.0   -0.0006   
0    2024-12-04  143.99  142.96  144.12  141.18  25940000.0    0.0142   

         RSI_14        ADX  Middle Band  ...  Lower Band      EMA_12  \
9031        NaN        NaN          NaN  ...       

Adding features

In [None]:
# Parse the MM/DD/YYYY date strings with an explicit format so day and month
# can never be swapped by format inference (and malformed rows fail loudly).
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')

# Abbreviated volume text ("25.94M", "589.80K") -> plain floats.
data['Vol.'] = data['Vol.'].apply(convert_volume)

# Rewrites 'Change %' in place as a fraction (e.g. "1.42%" -> 0.0142).
convert_change_percentage(data)

# The export is newest-first; every rolling / exponentially weighted indicator
# below needs oldest-first order. The original index labels are kept.
data = data.sort_values('Date')

data['RSI_14'] = calculate_rsi(data, column='Price', window=14)

data['ADX'] = calculate_adx(data, high_col='High', low_col='Low', close_col='Price', window=14)

# Adds 'Middle Band', 'Standard Deviation', 'Upper Band', 'Lower Band'.
data = calculate_bollinger_bands(data, column='Price', window=20, num_std_dev=2)

data['EMA_12'] = calculate_ema(data, column='Price', span=12)

data['EMA_26'] = calculate_ema(data, column='Price', span=26)

# Adds 'Short EMA', 'Long EMA', 'MACD Line', 'Signal Line'. With these spans
# 'Short EMA' / 'Long EMA' repeat EMA_12 / EMA_26.
data = calculate_macd(data, column='Price', short_span=12, long_span=26, signal_span=9)

window_sizes = [5, 10, 20]
for window in window_sizes:
    data[f'SMA_{window}'] = data['Price'].rolling(window=window).mean()

# The warm-up rows of the rolling indicators are NaN; they are deliberately
# kept here because the modelling pipelines impute missing values.
data

MODEL SELECTION

In [6]:
def CrossValidation(pipeline, X, y, latest_row):
    """Score ``pipeline`` with shuffled 5-fold cross-validation, then predict.

    Both metrics come from a single ``cross_validate`` pass, so every fold is
    fitted once and MSE and R2 always describe the same fitted model (this
    matters for estimators without a fixed ``random_state``).

    NOTE: the folds are shuffled, so each training fold contains rows that
    come after its test rows in time.

    Parameters
    ----------
    pipeline : estimator
        scikit-learn estimator or pipeline; it is refitted on all of ``X``.
    X, y : array-like
        Feature matrix and target.
    latest_row : array-like
        Single-row feature matrix to predict after the final fit.

    Returns
    -------
    dict
        Keys ``'MSE (mean)'``, ``'MSE (std)'``, ``'R2 (mean)'``,
        ``'R2 (std)'`` and ``'Predicted Price'``.
    """
    cv = KFold(n_splits=5, shuffle=True, random_state=22)
    fold_scores = model_selection.cross_validate(
        pipeline, X, y, cv=cv, scoring=('neg_mean_squared_error', 'r2')
    )
    # The scorer reports negated MSE (higher is better); flip it back.
    mse_scores = -fold_scores['test_neg_mean_squared_error']
    r2_scores = fold_scores['test_r2']

    # Final model uses every row before predicting the requested one.
    pipeline.fit(X, y)
    predicted_price = pipeline.predict(latest_row)[0]

    return {
        'MSE (mean)': np.mean(mse_scores),
        'MSE (std)': np.std(mse_scores),
        'R2 (mean)': np.mean(r2_scores),
        'R2 (std)': np.std(r2_scores),
        'Predicted Price': predicted_price
    }

def BlockedCrossValidation(pipeline, X, y, latest_row):
    """Score ``pipeline`` with unshuffled (blocked) 5-fold CV, then predict.

    The rows are split into five contiguous blocks in their existing order;
    each block is held out once. Both metrics come from a single
    ``cross_validate`` pass, so every fold is fitted once and MSE and R2
    always describe the same fitted model.

    Parameters
    ----------
    pipeline : estimator
        scikit-learn estimator or pipeline; it is refitted on all of ``X``.
    X, y : array-like
        Feature matrix and target.
    latest_row : array-like
        Single-row feature matrix to predict after the final fit.

    Returns
    -------
    dict
        Keys ``'MSE (mean)'``, ``'MSE (std)'``, ``'R2 (mean)'``,
        ``'R2 (std)'`` and ``'Predicted Price'``.
    """
    cv = KFold(n_splits=5, shuffle=False)
    fold_scores = model_selection.cross_validate(
        pipeline, X, y, cv=cv, scoring=('neg_mean_squared_error', 'r2')
    )
    # The scorer reports negated MSE (higher is better); flip it back.
    mse_scores = -fold_scores['test_neg_mean_squared_error']
    r2_scores = fold_scores['test_r2']

    # Final model uses every row before predicting the requested one.
    pipeline.fit(X, y)
    predicted_price = pipeline.predict(latest_row)[0]

    return {
        'MSE (mean)': np.mean(mse_scores),
        'MSE (std)': np.std(mse_scores),
        'R2 (mean)': np.mean(r2_scores),
        'R2 (std)': np.std(r2_scores),
        'Predicted Price': predicted_price
    }

def Rep_Holdout(pipeline, X, y, latest_row, n_repeats, start_ratio, end_ratio, random_state=None):
    """Repeated holdout: train on the head of the data, test on the tail.

    For each repetition a test fraction is drawn uniformly from
    ``[start_ratio, end_ratio]``; the leading rows are used for training and
    the remaining trailing rows for testing. Rows are taken in their existing
    order, so ``X`` and ``y`` are assumed to be sorted chronologically.

    Parameters
    ----------
    pipeline : estimator
        Object with ``fit`` / ``predict``; it is refitted on all of ``X`` at the end.
    X, y : sliceable
        Feature matrix and target of equal length.
    latest_row : array-like
        Single-row feature matrix to predict after the final fit.
    n_repeats : int
        Number of random holdout splits.
    start_ratio, end_ratio : float
        Bounds of the test-set fraction (e.g. 0.05 and 0.20).
    random_state : int, optional
        Seed for the split fractions. When omitted the module-level
        ``random`` generator is used, exactly as before.

    Returns
    -------
    dict
        Keys ``'MSE (mean)'``, ``'MSE (std)'``, ``'R2 (mean)'``,
        ``'R2 (std)'`` and ``'Predicted Price'``.

    Raises
    ------
    ValueError
        If a drawn fraction leaves the training or the test part empty.
    """
    # A private generator makes seeded runs reproducible without touching the
    # global random state; unseeded calls keep the original behaviour.
    rng = random if random_state is None else random.Random(random_state)

    n_rows = len(X)
    mse_scores = []
    r2_scores = []
    for _ in range(n_repeats):
        test_fraction = rng.uniform(start_ratio, end_ratio)
        split_point = int(n_rows * (1 - test_fraction))
        if not 0 < split_point < n_rows:
            raise ValueError(
                f"test fraction {test_fraction:.3f} leaves an empty train or test set "
                f"for {n_rows} rows"
            )

        X_train, X_test = X[:split_point], X[split_point:]
        y_train, y_test = y[:split_point], y[split_point:]

        # Train on the head, validate on the held-out tail.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mse_scores.append(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    # Final model uses every row before predicting the requested one.
    pipeline.fit(X, y)
    predicted_price = pipeline.predict(latest_row)[0]
    return {
        'MSE (mean)': np.mean(mse_scores),
        'MSE (std)': np.std(mse_scores),
        'R2 (mean)': np.mean(r2_scores),
        'R2 (std)': np.std(r2_scores),
        'Predicted Price': predicted_price
    }


def Preq_slide_window(pipeline, X, y, latest_row, n_splits):
    """Prequential evaluation over ``n_splits`` contiguous blocks.

    The rows are cut into ``n_splits`` consecutive blocks in their existing
    order. Every block except the first is used once as the test set, with
    all rows that precede it as the training set.

    NOTE(review): despite the name, the training set grows with each fold
    (it is every row before the test block, not a fixed-length window);
    confirm whether a true sliding window was intended.

    Parameters
    ----------
    pipeline : estimator
        Object with ``fit`` / ``predict``; it is refitted on all of ``X`` at the end.
    X, y : sliceable
        Feature matrix and target of equal length, assumed chronological.
    latest_row : array-like
        Single-row feature matrix to predict after the final fit.
    n_splits : int
        Number of blocks; ``n_splits - 1`` folds are scored.

    Returns
    -------
    dict
        Keys ``'MSE (mean)'``, ``'MSE (std)'``, ``'R2 (mean)'``,
        ``'R2 (std)'`` and ``'Predicted Price'``.
    """
    kf = KFold(n_splits, shuffle=False)
    mse_scores = []
    r2_scores = []

    for fold, (_, test_idx) in enumerate(kf.split(X)):
        # The first block has no earlier rows to train on.
        if fold == 0:
            continue

        test_start = test_idx[0]
        # Slice ends are exclusive, so go one past the last test position;
        # otherwise the final row of every block would never be scored.
        test_stop = test_idx[-1] + 1
        print("start:", test_start, " end", test_idx[-1])

        # Train on everything strictly before the test block (no dropped row).
        X_train, X_test = X[:test_start], X[test_start:test_stop]
        y_train, y_test = y[:test_start], y[test_start:test_stop]

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mse_scores.append(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    # Final model uses every row before predicting the requested one.
    pipeline.fit(X, y)
    predicted_price = pipeline.predict(latest_row)[0]
    return {
        'MSE (mean)': np.mean(mse_scores),
        'MSE (std)': np.std(mse_scores),
        'R2 (mean)': np.mean(r2_scores),
        'R2 (std)': np.std(r2_scores),
        'Predicted Price': predicted_price
    }





In [7]:
features = data[['Open', 'High', 'Low', 'Vol.', 'Change %', 'RSI_14', 'ADX',
                 'Middle Band', 'Upper Band', 'Lower Band',
                 'EMA_12', 'EMA_26', 'MACD Line', 'Signal Line'] + [f'SMA_{w}' for w in window_sizes]]
target = data['Price']

# Snapshot of the feature matrix for inspection outside the notebook.
# NOTE(review): 'path_to_file.csv' looks like a placeholder name - confirm.
features.to_csv('path_to_file.csv')
models = {
    'Linear Regression': LinearRegression(),
    # Seeded so repeated runs of the boosting model give the same scores.
    'AdaBoost Regressor': AdaBoostRegressor(random_state=7),
    #'Random Forest': RandomForestRegressor(random_state=99),
    'Gradient Boosting': GradientBoostingRegressor(random_state=300),
    #'Support Vector Regression': SVR(kernel='rbf'),
    'Lasso Regression': Lasso(alpha=0.1, random_state=11),
    'Ridge Regression': Ridge(alpha=0.4, random_state=12)
}

# The validation schemes defined above, addressable by name instead of by
# commenting lines in and out inside the loop.
validation_strategies = {
    'rep_holdout': lambda pipe: Rep_Holdout(pipe, features, target, latest_row, 10, 0.05, 0.20),
    'shuffled_kfold': lambda pipe: CrossValidation(pipe, features, target, latest_row),
    'blocked_kfold': lambda pipe: BlockedCrossValidation(pipe, features, target, latest_row),
    'prequential': lambda pipe: Preq_slide_window(pipe, features, target, latest_row, 8),
}
# Set to one of the keys above to score every model with that scheme.
# None keeps the cell's previous behaviour: pipelines are built, nothing is scored.
selected_strategy = None

model_results = {}
latest_row = features.iloc[-1:].copy()

for name, model in models.items():
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=12)),
        ('regressor', model)
    ])
    if selected_strategy is not None:
        model_results[name] = validation_strategies[selected_strategy](pipeline)


for name, metrics in model_results.items():
    print(f"{name}:")
    print(f"  MSE (mean): {metrics['MSE (mean)']:.4f} ± {metrics['MSE (std)']:.4f}")
    print(f"  R2 (mean): {metrics['R2 (mean)']:.4f} ± {metrics['R2 (std)']:.4f}")
    print(f"  Predicted Price: {metrics['Predicted Price']:.2f}\n")




In [8]:
features = data[['Open', 'High', 'Low', 'Vol.', 'Change %', 'RSI_14', 'ADX',
                 'Middle Band', 'Upper Band', 'Lower Band',
                 'EMA_12', 'EMA_26', 'MACD Line', 'Signal Line'] + [f'SMA_{w}' for w in window_sizes]]
target = data['Price']

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=99),
    'Gradient Boosting': GradientBoostingRegressor(random_state=300),
    'Support Vector Regression': SVR(kernel='rbf'),
    'Lasso Regression': Lasso(alpha=0.1, random_state=11),
    'Ridge Regression': Ridge(alpha=0.1, random_state=12)
}
# non shuffled cross validation to preserve temporal dependency
cv = KFold(n_splits=5, shuffle=False)

model_results = {}

X = features
y = target

# NOTE: this row is also part of X, so the "predicted price" below is an
# in-sample prediction; its features ('Open', 'Change %', the moving
# averages of 'Price', ...) come from the same day as the target.
latest_row = features.iloc[-1:].copy()

for name, model in models.items():
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=15)),
        ('regressor', model)
    ])

    # One cross-validation pass yields both metrics, so each fold is fitted
    # once instead of twice (noticeable for the forest and the SVR).
    fold_scores = model_selection.cross_validate(
        pipeline, X, y, cv=cv, scoring=('neg_mean_squared_error', 'r2')
    )
    # The scorer reports negated MSE (higher is better); flip it back.
    mse_scores = -fold_scores['test_neg_mean_squared_error']
    r2_scores = fold_scores['test_r2']

    pipeline.fit(X, y)

    predicted_price = pipeline.predict(latest_row)[0]

    model_results[name] = {
        'MSE (mean)': np.mean(mse_scores),
        'MSE (std)': np.std(mse_scores),
        'R2 (mean)': np.mean(r2_scores),
        'R2 (std)': np.std(r2_scores),
        'Predicted Price': predicted_price
    }

for name, metrics in model_results.items():
    print(f"{name}:")
    print(f"  MSE (mean): {metrics['MSE (mean)']:.4f} ± {metrics['MSE (std)']:.4f}")
    print(f"  R2 (mean): {metrics['R2 (mean)']:.4f} ± {metrics['R2 (std)']:.4f}")
    print(f"  Predicted Price: {metrics['Predicted Price']:.2f}\n")

Linear Regression:
  MSE (mean): 0.3521 ± 0.3920
  R2 (mean): 0.9922 ± 0.0137
  Predicted Price: 143.42

Random Forest:
  MSE (mean): 680.9069 ± 1360.6891
  R2 (mean): 0.6882 ± 0.5729
  Predicted Price: 142.61

Gradient Boosting:
  MSE (mean): 675.5461 ± 1350.1105
  R2 (mean): 0.6918 ± 0.5690
  Predicted Price: 143.61

Support Vector Regression:
  MSE (mean): 1195.6463 ± 2377.6655
  R2 (mean): 0.1983 ± 0.9609
  Predicted Price: 137.19

Lasso Regression:
  MSE (mean): 2.6550 ± 4.9431
  R2 (mean): 0.9925 ± 0.0060
  Predicted Price: 140.95

Ridge Regression:
  MSE (mean): 0.2376 ± 0.3618
  R2 (mean): 0.9981 ± 0.0019
  Predicted Price: 143.31

