In [15]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load data
df = pd.read_csv('all_stocks_5yr.csv')

# Check date column name 
print(df.columns)

# Set date as index and convert to datetime
df['date'] = pd.to_datetime(df['date'])   
df = df.set_index('date')

# Check dtypes
df.dtypes

# View sample data
df.head()  

# Check for missing values  
df.isnull().sum()

# Identify target column
y = df['close']  

# Feature engineering
df['Return'] = df['close'].pct_change()
df['MA50'] = df['close'].rolling(50).mean()

# Split data
train, test = train_test_split(df, test_size=0.2)  

# Save processed data
train.to_csv('train.csv')
test.to_csv('test.csv')


Index(['date', 'open', 'high', 'low', 'close', 'volume', 'Name'], dtype='object')


In [16]:
df.dtypes


open      float64
high      float64
low       float64
close     float64
volume      int64
Name       object
Return    float64
MA50      float64
dtype: object

In [17]:
df.isnull().sum()

open      11
high       8
low        8
close      0
volume     0
Name       0
Return     1
MA50      49
dtype: int64

In [18]:
df.head(20)

Unnamed: 0_level_0,open,high,low,close,volume,Name,Return,MA50
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL,,
2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL,-0.019661,
2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL,-0.01314,
2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL,0.02733,
2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL,-0.045703,
2013-02-15,13.93,14.61,13.93,14.5,15628000,AAL,0.036455,
2013-02-19,14.33,14.56,14.08,14.26,11354400,AAL,-0.016552,
2013-02-20,14.17,14.26,13.15,13.33,14725200,AAL,-0.065217,
2013-02-21,13.62,13.95,12.9,13.37,11922100,AAL,0.003001,
2013-02-22,13.57,13.6,13.21,13.57,6071400,AAL,0.014959,


In [19]:
# Load data
df = pd.read_csv('all_stocks_5yr.csv')

# Set date as index and drop nulls
df = df.set_index('date').dropna()  

# Select features and target
X = df[['open', 'high', 'low', 'volume']]
y = df['close']

# Train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit linear regression 
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test) 

# Evaluate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

# Save model
import pickle
with open('lr_model.pkl', 'wb') as f:
   pickle.dump(lr_model, f)


RMSE: 0.6914272690576048


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
