In [1]:
import pandas as pd
from pre_processing import processed_data_dict
from enum import Enum

In [2]:
class Action(Enum):
    BUY = 0
    SELL = 1
    HOLD = 2

In [3]:
input_data_dict = processed_data_dict()

In [5]:
# comment others to test one stock at a time
input_df = input_data_dict['AAPL']
# input_df = input_data_dict['TSLA']
# input_df = input_data_dict['GOOG']
# input_df = input_data_dict['MSFT']
# input_df = input_data_dict['AMZN']

In [6]:
# data count in the df
data_count = len(input_df)
data_count

9914

In [7]:
input_df.iloc[:40]

Unnamed: 0,Date,Open,High,Low,Close,Adj. Close,Volume
0,1984-08-09,0.13,0.13,0.12,0.13,0.1,257622400.0
1,1984-08-10,0.13,0.14,0.13,0.13,0.1,397376000.0
2,1984-08-13,0.13,0.14,0.13,0.13,0.1,241449600.0
3,1984-08-14,0.13,0.14,0.13,0.13,0.1,174070400.0
4,1984-08-15,0.13,0.13,0.12,0.12,0.1,178886400.0
5,1984-08-16,0.12,0.13,0.12,0.13,0.1,144816000.0
6,1984-08-17,0.13,0.13,0.12,0.12,0.1,153932800.0
7,1984-08-20,0.12,0.12,0.12,0.12,0.09,138454400.0
8,1984-08-21,0.12,0.13,0.12,0.13,0.1,179536000.0
9,1984-08-22,0.13,0.13,0.12,0.13,0.1,220416000.0


In [8]:
# adding a column with values {-1, 0, 1} (Sell, Hold, Buy) for the stock price movement
# based on the difference btw the close and open prices of previous day and current day respectively
# if the difference is positive and greater than 1.5 then action = -1 (Sell)
# if the difference is negative then action = 1 (Buy)
# else action = 0 (Hold)
def add_label(df):
    df['Action'] = 0
    threshold = 0.01
    for i in range(3, len(df)):
        three_days_pred = 0.4 * (df['Close'][i-2] - df['Close'][i-3])/df['Close'][i-3] + 0.32 * (df['Close'][i-1] - df['Close'][i-2])/df['Close'][i-2] + 0.28 * (df['Close'][i] - df['Close'][i-1])/df['Close'][i-1]
        # sell, the stock price will rise in the next three days
        if three_days_pred > threshold:
            df.iloc[i, df.columns.get_loc('Action')] = Action.SELL.value
        # buy, the stock price will fall in the next three days
        elif three_days_pred < -threshold:
            df.iloc[i, df.columns.get_loc('Action')] = Action.BUY.value
        # hold, the stock price will remain the same in the next three days
        else:
            df.iloc[i, df.columns.get_loc('Action')] = Action.HOLD.value
    return df

In [9]:
new_df = add_label(input_df)
new_df

Unnamed: 0,Date,Open,High,Low,Close,Adj. Close,Volume,Action
0,1984-08-09,0.13,0.13,0.12,0.13,0.10,257622400.0,0
1,1984-08-10,0.13,0.14,0.13,0.13,0.10,397376000.0,0
2,1984-08-13,0.13,0.14,0.13,0.13,0.10,241449600.0,0
3,1984-08-14,0.13,0.14,0.13,0.13,0.10,174070400.0,2
4,1984-08-15,0.13,0.13,0.12,0.12,0.10,178886400.0,0
...,...,...,...,...,...,...,...,...
9909,2023-12-04,189.98,190.05,187.45,189.43,189.43,43389500.0,2
9910,2023-12-05,190.21,194.40,190.18,193.42,193.42,66628400.0,2
9911,2023-12-06,194.45,194.76,192.11,192.32,192.32,41089700.0,2
9912,2023-12-07,193.63,195.00,193.59,194.27,194.27,47477700.0,2


In [10]:
# get count of 0 (hold), 1 (buy), -1 (sell) labels
new_df['Action'].value_counts()

Action
2    5742
1    2296
0    1876
Name: count, dtype: int64

In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()

In [12]:
features = ['Open', 'Close']
label = ['Action']

# split the data into train and test sets
train, test = train_test_split(new_df, test_size=0.3, shuffle=False)

In [13]:
# split the train and test sets into features and labels
X_train = train[features]
y_train = train[label]
# y_train = le.fit_transform(train[label])
X_test = test[features]
y_test = test[label]
# y_test = le.fit_transform(test[label])

In [14]:
# fit the model
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, max_depth=1, n_estimators=200, 
                          random_state=42, learning_rate=0.01)
model.fit(X_train, y_train)
# predict the labels
y_pred = model.predict(X_test)

In [15]:
# calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.10f%%" % (accuracy * 100.0))

Accuracy: 73.4453781513%


In [16]:
# confusion matrix
print(confusion_matrix(y_test, y_pred))

[[   0    0  341]
 [   0    0  449]
 [   0    0 2185]]


In [17]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# specify parameters via map
param = {'max_depth': 1, 'eta': 1, 'objective': 'multi:softmax', 'num_class': 3, 'learning_rate': 0.000001}
num_round = 200
bst = xgb.train(param, dtrain, num_round)

# make prediction
preds = bst.predict(dtest)

In [18]:
accuracy_score(y_test, preds) * 100

73.44537815126051