# There are Dow Jones records of 5.5 months of changes in 30 different stocks.

## TASK: Our goal in this project is to build a model for each stocks for predict the next week percent price changing.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import joblib

In [2]:
df = pd.read_csv("./DATA/Dow Jones Index/dow_jones_index.data")
df.head()

Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend
0,1,AA,1/7/2011,$15.82,$16.72,$15.78,$16.42,239655616,3.79267,,,$16.71,$15.97,-4.42849,26,0.182704
1,1,AA,1/14/2011,$16.71,$16.71,$15.64,$15.97,242963398,-4.42849,1.380223,239655616.0,$16.19,$15.79,-2.47066,19,0.187852
2,1,AA,1/21/2011,$16.19,$16.38,$15.60,$15.79,138428495,-2.47066,-43.024959,242963398.0,$15.87,$16.13,1.63831,12,0.189994
3,1,AA,1/28/2011,$15.87,$16.63,$15.82,$16.13,151379173,1.63831,9.3555,138428495.0,$16.18,$17.14,5.93325,5,0.185989
4,1,AA,2/4/2011,$16.18,$17.39,$16.18,$17.14,154387761,5.93325,1.987452,151379173.0,$17.33,$17.37,0.230814,97,0.175029


In [3]:
df.shape

(750, 16)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   quarter                             750 non-null    int64  
 1   stock                               750 non-null    object 
 2   date                                750 non-null    object 
 3   open                                750 non-null    object 
 4   high                                750 non-null    object 
 5   low                                 750 non-null    object 
 6   close                               750 non-null    object 
 7   volume                              750 non-null    int64  
 8   percent_change_price                750 non-null    float64
 9   percent_change_volume_over_last_wk  720 non-null    float64
 10  previous_weeks_volume               720 non-null    float64
 11  next_weeks_open                     750 non-n

In [5]:
df.columns

Index(['quarter', 'stock', 'date', 'open', 'high', 'low', 'close', 'volume',
       'percent_change_price', 'percent_change_volume_over_last_wk',
       'previous_weeks_volume', 'next_weeks_open', 'next_weeks_close',
       'percent_change_next_weeks_price', 'days_to_next_dividend',
       'percent_return_next_dividend'],
      dtype='object')

# Utils

In [7]:
def apply_feature_engineering(df_stock):
    df_stock_copy = df_stock.copy()
    df_stock_copy = df_stock_copy.dropna()
    
    # Convert 'date' column to "datetime" data type
    df_stock_copy['date'] = pd.to_datetime(df_stock_copy['date'])

    # Add Year Feature
    df_stock_copy.loc[:, 'year'] = df_stock_copy['date'].dt.year

    # Add Month Feature
    df_stock_copy.loc[:, 'month'] = df_stock_copy['date'].dt.month

    # Add Day Feature
    df_stock_copy.loc[:, 'day'] = df_stock_copy['date'].dt.day

    # Add Weekday/Weekend and Month of year features.
    df_stock_copy.loc[:, 'weekday'] = df_stock_copy['date'].dt.weekday  # Monday: 0, Sunday: 6

    df_stock_copy.loc[:, 'month_of_year'] = df_stock_copy['date'].dt.month

    # Add Week of the Year feature
    df_stock_copy.loc[:, 'week_of_year'] = df_stock_copy['date'].dt.isocalendar().week  # As the number of the week

    df_stock_copy.drop('date', axis=1, inplace=True)
    
    df_stock_copy['week_of_year'] = df_stock_copy['week_of_year'].astype(int)  # For XGBoost
    
    for col in ['open' , 'high' , 'low' , 'close' , 'next_weeks_open' , 'next_weeks_close']:
        df_stock_copy[col] = df_stock_copy[col].str.replace('$' , '')
        df_stock_copy[col] = df_stock_copy[col].astype(float)
    
    return df_stock_copy

In [8]:
df = apply_feature_engineering(df)
df.head()

Unnamed: 0,quarter,stock,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,...,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend,year,month,day,weekday,month_of_year,week_of_year
1,1,AA,16.71,16.71,15.64,15.97,242963398,-4.42849,1.380223,239655616.0,...,15.79,-2.47066,19,0.187852,2011,1,14,4,1,2
2,1,AA,16.19,16.38,15.6,15.79,138428495,-2.47066,-43.024959,242963398.0,...,16.13,1.63831,12,0.189994,2011,1,21,4,1,3
3,1,AA,15.87,16.63,15.82,16.13,151379173,1.63831,9.3555,138428495.0,...,17.14,5.93325,5,0.185989,2011,1,28,4,1,4
4,1,AA,16.18,17.39,16.18,17.14,154387761,5.93325,1.987452,151379173.0,...,17.37,0.230814,97,0.175029,2011,2,4,4,2,5
5,1,AA,17.33,17.48,16.97,17.37,114691279,0.230814,-25.712195,154387761.0,...,17.28,-0.632547,90,0.172712,2011,2,11,4,2,6


# Train | Test Split

In [18]:
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Build and train separate models for each stock
unique_stocks = df['stock'].unique()

# Create a dictionary to store the model
stock_models = {}

for stock in unique_stocks:
    # Filter your stock.
    stock_data = df[df['stock'] == stock]
    
    # Split into X and y
    X = stock_data.drop(['percent_change_next_weeks_price', 'stock'] , axis = 1)
    y = stock_data['percent_change_next_weeks_price']
    
    # Lets split our data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Build a model and fit.
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Save the Model
    model_filename = f"{stock}_model.pkl"
    joblib.dump(model, f"stock_models/{model_filename}")
    
    # Modeli ve sonuçları saklamak için bir sözlüğe ekleyin
    stock_models[stock] = {
        'model': model_filename,
        'mae': mean_absolute_error(y_test, model.predict(X_test)),
        'rmse': (mean_squared_error(y_test, model.predict(X_test)))** 0.5,
        'mse': mean_squared_error(y_test, model.predict(X_test))
    }

# You can retrieve the model and results for any stock using the dictionary that stores the results.
# For example, for an 'AA' stock:
selected_stock = 'AA'
selected_model = stock_models[selected_stock]['model']
selected_mae = stock_models[selected_stock]['mae']
selected_rmse = stock_models[selected_stock]['rmse']
selected_mse = stock_models[selected_stock]['mse']

print(f"Selected stock: {selected_stock}")
print(f"Model filename: {selected_model}")
print(f"MAE: {selected_mae}")
print(f"RMSE: {selected_rmse}")
print(f"MSE: {selected_mse}")

Selected stock: AA
Model filename: AA_model.pkl
MAE: 0.6936763565478955
RMSE: 0.9472013168543562
MSE: 0.8971903346506265


In [19]:
stock_models

{'AA': {'model': 'AA_model.pkl',
  'mae': 0.6936763565478955,
  'rmse': 0.9472013168543562,
  'mse': 0.8971903346506265},
 'AXP': {'model': 'AXP_model.pkl',
  'mae': 0.22434090070000096,
  'rmse': 0.2413227647600568,
  'mse': 0.0582366767914377},
 'BA': {'model': 'BA_model.pkl',
  'mae': 0.1269598158898832,
  'rmse': 0.14267853921165113,
  'mse': 0.02035716555157067},
 'BAC': {'model': 'BAC_model.pkl',
  'mae': 0.6665360170822662,
  'rmse': 0.8000264360265882,
  'mse': 0.6400422983414046},
 'CAT': {'model': 'CAT_model.pkl',
  'mae': 0.6012551816054177,
  'rmse': 0.8002163776219159,
  'mse': 0.6403462510143407},
 'CSCO': {'model': 'CSCO_model.pkl',
  'mae': 0.6578678835863948,
  'rmse': 0.7227588511996293,
  'mse': 0.5223803569874079},
 'CVX': {'model': 'CVX_model.pkl',
  'mae': 1.069314191304969,
  'rmse': 1.233741291161403,
  'mse': 1.522117573516606},
 'DD': {'model': 'DD_model.pkl',
  'mae': 0.3779044612023291,
  'rmse': 0.43119035995055344,
  'mse': 0.18592512651428786},
 'DIS': {'

Every model has good results. Thats it !

In [20]:
mae_list = []

for model in stock_models:
    mae_list.append(stock_models[model]['mae'])
    
avg_mae = sum(mae_list) / len(mae_list)

print(f"Models average MAE: {avg_mae}")

Models average MAE: 0.556910058633806


In [21]:
mae_list # Just 2 stock have bad results.

[0.6936763565478955,
 0.22434090070000096,
 0.1269598158898832,
 0.6665360170822662,
 0.6012551816054177,
 0.6578678835863948,
 1.069314191304969,
 0.3779044612023291,
 0.22796912919677137,
 0.2647139182340358,
 0.16106462479113276,
 3.7640071452636077,
 0.08925923300186259,
 0.6479631829986958,
 0.09253098547424013,
 0.18948707565289663,
 0.12997036054410455,
 0.16346352839330472,
 0.10473226234851096,
 0.10640432300525933,
 0.21710434865346287,
 0.13799863912601282,
 0.3386397081630321,
 3.915829575796818,
 0.2655747239465469,
 0.08226240019829856,
 0.5020063791817369,
 0.17301902213635129,
 0.309682733638274,
 0.40576365135007164]

In [None]:
# Done.