In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Read the AAPL daily price history; the day-first 'Date' column is parsed and used as the index.
# Forward slashes keep this relative path valid on Windows, macOS and Linux alike
# (the previous backslash-separated path only resolved on Windows).
df = pd.read_csv("../data/AAPL Stock Price (15-06-204 and last 4 years).csv",
                 parse_dates=['Date'], dayfirst=True, index_col='Date')
# Normalise the column names to lower case.
df.columns = df.columns.str.lower()
# Store the dates as 'YYYY-MM-DD' strings; after this the index holds plain strings, not datetimes.
df.index = df.index.strftime('%Y-%m-%d')
df.shape

(1008, 5)

In [16]:
# Show the first rows, the last rows and a random sample; the sample is seeded so that
# re-running the notebook reproduces the same preview.
display(df.head(), df.tail(), df.sample(5, random_state=42))

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-06-15,83.31,86.42,83.15,85.75,34702230
2020-06-16,87.87,88.3,86.18,88.02,41357182
2020-06-17,88.79,88.85,87.77,87.9,28601626
2020-06-18,87.85,88.36,87.31,87.93,24205096
2020-06-19,88.66,89.14,86.29,87.43,66118952


Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-06-10,196.9,197.3,192.15,193.12,97262077
2024-06-11,193.65,207.16,193.63,207.15,172373296
2024-06-12,207.37,220.2,206.9,213.07,198134293
2024-06-13,214.74,216.75,211.6,214.24,97862729
2024-06-14,213.85,215.17,211.3,212.49,70122748


Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-05-24,126.01,127.94,125.94,127.1,63092945
2022-12-05,147.77,150.92,145.77,146.63,68826442
2021-05-12,123.4,124.64,122.25,122.77,112172282
2024-02-05,188.15,189.25,185.84,187.68,69668820
2022-12-16,136.69,137.65,133.73,134.51,160156900


In [17]:
# Summarise the index, column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 2020-06-15 to 2024-06-14
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    1008 non-null   float64
 1   high    1008 non-null   float64
 2   low     1008 non-null   float64
 3   close   1008 non-null   float64
 4   volume  1008 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 47.2+ KB


# Feature Engineering

In [18]:
# Append 10-row and 50-row rolling means of the closing price as new columns.
# Each window ends at (and includes) the current row, and the first (window - 1)
# rows of each column are NaN because the window is not yet full.
df = df.assign(
    moving_avg_10=lambda frame: frame['close'].rolling(window=10).mean(),
    moving_avg_50=lambda frame: frame['close'].rolling(window=50).mean(),
)

In [19]:
# Inspect both ends of the frame now that the moving-average columns exist.
display(df.head(),df.tail())

Unnamed: 0_level_0,open,high,low,close,volume,moving_avg_10,moving_avg_50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-06-15,83.31,86.42,83.15,85.75,34702230,,
2020-06-16,87.87,88.3,86.18,88.02,41357182,,
2020-06-17,88.79,88.85,87.77,87.9,28601626,,
2020-06-18,87.85,88.36,87.31,87.93,24205096,,
2020-06-19,88.66,89.14,86.29,87.43,66118952,,


Unnamed: 0_level_0,open,high,low,close,volume,moving_avg_10,moving_avg_50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-06-10,196.9,197.3,192.15,193.12,97262077,193.256,179.9666
2024-06-11,193.65,207.16,193.63,207.15,172373296,194.972,180.709
2024-06-12,207.37,220.2,206.9,213.07,198134293,197.25,181.5936
2024-06-13,214.74,216.75,211.6,214.24,97862729,199.545,182.4854
2024-06-14,213.85,215.17,211.3,212.49,70122748,201.569,183.3588


In [20]:
# Drop every row that contains at least one missing value and report the frame's
# shape on either side of the operation.
display(f"Shape of data before dropping null values: {df.shape}")
df = df.dropna(axis=0, how='any')
display(f"Shape of data after dropping null values: {df.shape}")

'Shape of data before dropping null values: (1008, 7)'

'Shape of data after dropping null values: (959, 7)'

# Model Selection

In [21]:
# Features (independent variables): the two moving averages.
# Target (dependent variable): the closing price.
x = df.loc[:, ['moving_avg_10', 'moving_avg_50']]
y = df.loc[:, 'close']

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Instantiate a default LinearRegression; the result is not assigned to a name,
# so this cell only echoes the estimator as its output.
LinearRegression()

In [82]:
def _regression_search_space(random_state: int) -> dict:
    """Build the candidate estimators and the hyper-parameter grids searched for each.

    Parameters:
    - random_state (int): Seed passed to every estimator that has a stochastic component,
      so repeated searches give the same result.

    Returns:
    - dict: Maps a display name to ``{'model': estimator, 'params': grid}``, where ``grid`` is
      either a single dict or a list of dicts accepted by ``GridSearchCV``.
    """
    ridge_alphas = [0.1, 1, 10, 100]

    # Options that only affect memory handling or parallelism (copy_X, n_jobs, warm_start)
    # are deliberately not searched: they do not change the fitted model and would only
    # multiply the number of fits.
    return {
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False],
                'positive': [True, False]
            }
        },

        'Ridge': {
            'model': Ridge(random_state=random_state),
            # Two sub-grids: positive=True is only accepted together with the 'lbfgs' solver
            # (which solver='auto' selects), so it must not be crossed with the other solvers.
            'params': [
                {
                    'alpha': ridge_alphas,
                    'fit_intercept': [True, False],
                    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                    'positive': [False]
                },
                {
                    'alpha': ridge_alphas,
                    'fit_intercept': [True, False],
                    'solver': ['auto'],
                    'positive': [True]
                }
            ]
        },

        'Lasso': {
            'model': Lasso(random_state=random_state),
            'params': {
                'alpha': [0.1, 1, 10, 100],
                'fit_intercept': [True, False],
                'precompute': [True, False],
                'selection': ['cyclic', 'random'],
                'positive': [True, False]
            }
        },

        'DecisionTree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                'splitter': ['best', 'random'],
                'max_depth': [None, 10, 20, 30]
            }
        },

        'RandomForest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'bootstrap': [True, False],
                # 'mse' / 'mae' are the old spellings and are rejected by current scikit-learn;
                # use the same names as the DecisionTree grid above.
                'criterion': ['squared_error', 'absolute_error', 'poisson']
            }
        },

        'SVR': {
            'model': SVR(),
            # NOTE(review): SVR is fitted on the features as given (no scaling step here);
            # consider standardising the inputs if this search is slow - to be confirmed.
            'params': {
                'C': [0.1, 1, 10],
                'gamma': ['scale', 'auto'],
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
            }
        }
    }


def best_model_gridsearch(independent_variables: pd.DataFrame,
                          dependent_variable: pd.Series,
                          random_state: int = 42) -> pd.DataFrame:
    """Grid-search several regression algorithms and report the best configuration of each.

    Every candidate is tuned with ``GridSearchCV`` on five shuffled 80/20 splits and scored
    with negative mean squared error.

    Parameters:
    - independent_variables (pd.DataFrame): Feature columns used for prediction.
    - dependent_variable (pd.Series): Target values to be predicted, one per feature row.
    - random_state (int): Seed for the cross-validation splitter and for the estimators that
      use randomness. Defaults to 42.

    Returns:
    - pd.DataFrame: One row per algorithm with the columns ``model`` (algorithm name),
      ``best_score`` (mean negative MSE over the splits; closer to zero is better) and
      ``best_params`` (the parameter combination that achieved it).
    """
    cross_validation = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)

    scores = []
    for algorithm, config in _regression_search_space(random_state).items():
        grid_search = GridSearchCV(config['model'], config['params'], cv=cross_validation,
                                   scoring='neg_mean_squared_error',
                                   return_train_score=False)
        grid_search.fit(independent_variables, dependent_variable)
        scores.append({
            'model': algorithm,
            'best_score': grid_search.best_score_,
            'best_params': grid_search.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

In [None]:
# Tune and compare the models on the training split only, so the held-out
# x_test / y_test rows stay unseen during model selection.
best_model_gridsearch(independent_variables=x_train, dependent_variable=y_train)