In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [59]:
# Load the AAPL daily price CSV and index it by its 'Date' column.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike; a backslash-separated path is only understood on Windows.
df = pd.read_csv("../data/AAPL Stock Price (15-06-204 and last 4 years).csv",
                 parse_dates=['Date'], dayfirst=True, index_col='Date')
# Normalise the column labels to lower case.
df.columns = df.columns.str.lower()
# Re-label the index as 'YYYY-MM-DD' strings; after this line the index holds
# plain strings rather than timestamps.
df.index = df.index.strftime('%Y-%m-%d')
df.shape

(1008, 5)

In [60]:
# Preview the first rows, the last rows and a random sample of five rows.
# random_state pins the sample so the preview is identical on every re-run.
display(df.head(), df.tail(), df.sample(5, random_state=42))

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-06-15,83.31,86.42,83.15,85.75,34702230
2020-06-16,87.87,88.3,86.18,88.02,41357182
2020-06-17,88.79,88.85,87.77,87.9,28601626
2020-06-18,87.85,88.36,87.31,87.93,24205096
2020-06-19,88.66,89.14,86.29,87.43,66118952


Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-06-10,196.9,197.3,192.15,193.12,97262077
2024-06-11,193.65,207.16,193.63,207.15,172373296
2024-06-12,207.37,220.2,206.9,213.07,198134293
2024-06-13,214.74,216.75,211.6,214.24,97862729
2024-06-14,213.85,215.17,211.3,212.49,70122748


Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-02-09,176.05,176.65,174.9,176.28,71285038
2023-03-23,158.83,161.55,157.68,158.93,67622060
2022-06-30,137.25,138.37,133.77,136.72,98964467
2022-12-13,149.5,149.97,144.24,145.47,93886161
2021-09-16,148.44,148.97,147.22,148.79,68034149


In [61]:
# Print the index type, per-column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 2020-06-15 to 2024-06-14
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    1008 non-null   float64
 1   high    1008 non-null   float64
 2   low     1008 non-null   float64
 3   close   1008 non-null   float64
 4   volume  1008 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 47.2+ KB


# Feature Engineering

In [62]:
# Add one rolling-mean column of the closing price per (column name, window) pair.
# Each window ends on the current row, so the first (window - 1) rows are NaN.
for column, span in (('moving_avg_10', 10), ('moving_avg_50', 50)):
    df[column] = df['close'].rolling(window=span).mean()

In [63]:
# Show the first and last rows so the new moving-average columns can be inspected.
display(df.head(),df.tail())

Unnamed: 0_level_0,open,high,low,close,volume,moving_avg_10,moving_avg_50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-06-15,83.31,86.42,83.15,85.75,34702230,,
2020-06-16,87.87,88.3,86.18,88.02,41357182,,
2020-06-17,88.79,88.85,87.77,87.9,28601626,,
2020-06-18,87.85,88.36,87.31,87.93,24205096,,
2020-06-19,88.66,89.14,86.29,87.43,66118952,,


Unnamed: 0_level_0,open,high,low,close,volume,moving_avg_10,moving_avg_50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-06-10,196.9,197.3,192.15,193.12,97262077,193.256,179.9666
2024-06-11,193.65,207.16,193.63,207.15,172373296,194.972,180.709
2024-06-12,207.37,220.2,206.9,213.07,198134293,197.25,181.5936
2024-06-13,214.74,216.75,211.6,214.24,97862729,199.545,182.4854
2024-06-14,213.85,215.17,211.3,212.49,70122748,201.569,183.3588


In [64]:
# Report the frame's shape on either side of discarding every row that has at
# least one missing value (the rolling means are NaN in their warm-up rows).
display(f"Shape of data before dropping null values: {df.shape}")
df = df.dropna(axis=0, how='any')
display(f"Shape of data after dropping null values: {df.shape}")

'Shape of data before dropping null values: (1008, 7)'

'Shape of data after dropping null values: (959, 7)'

# Model Selection

In [69]:
# Treat the two moving averages as the independent variables and the closing
# price as the target variable.
x = df[['moving_avg_10', 'moving_avg_50']]
y = df['close']

# Hold out the final 20% of rows as the test set. The rows are assumed to be in
# ascending date order, so shuffle=False gives a chronological split: the model
# is fitted on the earlier rows and scored on the later ones. A shuffled split
# would score it on days interleaved with (and highly correlated with) its
# training days, which overstates how well it generalises.
# NOTE(review): both moving averages include the same day's close, i.e. the
# target itself - confirm this is intended before trusting the scores.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)

In [None]:
# NOTE(review): this cell only constructs an estimator and discards it (nothing
# is assigned or fitted) - it looks like leftover scratch work; consider removing.
LinearRegression()

In [74]:
def best_model_gridsearch(x: pd.DataFrame, y: pd.DataFrame) -> pd.DataFrame:
    """
    Function to find the best model using GridSearchCV
    """
    algorithms = {
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False],
                'normalize': [True, False],
                'n_jobs': [None, 1, 2, 3],
                'positive': [True, False]
            }
        },
        
        'Ridge': {
            'model': Ridge(),
            'params': {
                'alpha': [0.1,1,10,100],
                'fit_intercept': [True, False],
                'normalize': [True, False],
                'solver': ['auto','svd','cholesky','lsqr', 'sparse_cg', 'sag','saga'],
                'positive': [True, False]
            }
        },
        
        'Lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [0.1,1,10,100],
                'fit_intercept': [True, False],
                'normalize': [True, False],
                'selection': ['cyclic', 'random'],
                'positive': [True, False]
            }
        },
        
        'DecisionTree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['squared_error','friedman_mse','absolute_error', 'poisson'],
                'splitter': ['best','random'],
                'max_depth': [None, 10, 20, 30]
            }
        },
        
        'RandomForest': {
            'model': RandomForestRegressor(),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'bootstrap': [True, False],
                'criterion': ['mse', 'mae', 'poisson']
            }
        },
        
        'SVR': {
            'model': SVR(),
            'params': {
                'C': [0.1, 1, 10],
                'gamma': ['scale', 'auto'],
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
            }
        }
    }