# XGBoost Classification
This programm is an attempt to predict valuable stocks by XGBoost Classification.

### 1. Imports

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, make_scorer
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform

## First attempt (fewer data)

### 2. Load the data

In [85]:
data = pd.read_csv('stocks_data.csv')
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ticker,Year,Month,MA Ratio,Buy,Result,ROE,Insider Ownership Growth,Institutional Ownership Growth,Forecast EPS Growth,Avg 2Q EPS Growth,Avg 2Q EPS Surprise,YoY EPS Growth,Sector Performance,Market Performance,Benchmark SP500 Performance
count,14854.0,14854,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0
unique,,393,,,,,,,,,,,,,,,
top,,AWK,,,,,,,,,,,,,,,
freq,,55,,,,,,,,,,,,,,,
mean,7426.5,,2020.669113,6.22324,1.004148,0.415848,1.032817,39.494365,0.015486,0.026708,0.057775,0.181477,13.755183,0.369529,1.488003,1.438443,1.029219
std,4288.124784,,1.428016,3.520757,0.046473,0.492884,0.14926,181.839873,0.269863,0.230675,2.136724,2.111809,46.751483,3.637998,8.164589,7.038394,0.077491
min,0.0,,2018.0,1.0,0.580721,0.0,0.259712,-613.743387,-0.633527,-0.714136,-0.992366,-45.05,-65.625,-0.961538,-44.900728,-22.795349,0.769903
25%,3713.25,,2019.0,3.0,0.977766,0.0,0.944153,10.160854,-0.00135,-0.023114,-0.184264,-0.040838,2.015,0.017606,-3.453784,-3.160007,0.983314
50%,7426.5,,2021.0,6.0,1.00536,0.0,1.028547,19.251991,0.0,-0.000648,-0.039062,0.045662,6.055,0.130688,1.496227,2.069271,1.043101
75%,11139.75,,2022.0,9.0,1.031953,1.0,1.113949,31.949569,0.008,0.033653,0.086957,0.154182,13.135,0.275148,6.429508,5.50743,1.080728


### 3. Split the data for train and test, standarise the data

In [86]:
data = data.reset_index(drop=True)
train_data = data[data['Year'] <= 2022]
test_data = data[data['Year'] > 2022]
x_train = train_data.drop(['Year', 'Buy', 'Month', 'Ticker', 'Result', 'Benchmark SP500 Performance', data.columns[0]], axis=1)
y_train = train_data['Buy']
x_test = test_data.drop(['Year', 'Buy', 'Month', 'Ticker', 'Result', 'Benchmark SP500 Performance', data.columns[0]], axis=1)
y_test = test_data['Buy']

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### 4. Train the model with randomized search

In [89]:
xg_clf = xgb.XGBClassifier(objective='binary:logistic')
param_dist = {
    'max_depth': randint(3, 8),
    'subsample': uniform(0.5, 0.5),
    'eta': uniform(0.01, 0.2),
    'colsample_bytree': uniform(0.6, 0.4),
    'min_child_weight': randint(1, 5),
    'gamma': uniform(0.0, 0.4),
    'lambda': uniform(0.0, 10.0),
    'alpha': uniform(0.0, 10.0),
    'scale_pos_weight': uniform(0.8, 5.0)
}

random_search = RandomizedSearchCV(estimator=xg_clf,
                                   param_distributions=param_dist,
                                   n_iter=50, 
                                   cv=10,
                                   scoring='precision',
                                   n_jobs=-1,
                                   verbose=2,
                                   random_state=42)
random_search.fit(x_train, y_train)
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

y_pred = best_model.predict(x_test)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Best Parameters: {'alpha': 3.418796667164016, 'colsample_bytree': 0.6367196263253767, 'eta': 0.028831397653712024, 'gamma': 0.12456532375651769, 'lambda': 9.795105286215085, 'max_depth': 7, 'min_child_weight': 4, 'scale_pos_weight': 0.8858055091587512, 'subsample': 0.8816822115019555}


### 5. Evaluation

In [90]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.6574923547400612
Precision: 0.43434343434343436
Confusion Matrix:
[[1032   56]
 [ 504   43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.95      0.79      1088
           1       0.43      0.08      0.13       547

    accuracy                           0.66      1635
   macro avg       0.55      0.51      0.46      1635
weighted avg       0.59      0.66      0.57      1635



In [92]:
test_results = test_data.copy()
test_results['Predicted_Buy'] = y_pred

predicted_stocks_to_buy = test_results[test_results['Predicted_Buy'] == 1]
predicted_stocks_avg_return = predicted_stocks_to_buy['Result'].mean()

avg_stock_return = test_results['Result'].mean()

best_stocks = test_results[test_results['Buy'] == 1]
avg_best_stocks_return = best_stocks['Result'].mean()

sp500_return = predicted_stocks_to_buy['Benchmark SP500 Performance'].mean()

print("Benchmarks: ")
print(f"Average stock return (whole test sample): {avg_stock_return:.5f}")
print(f"'Buy' stocks average return: {avg_best_stocks_return:.5f}")
print(f"SP500 return: {sp500_return:.5f}")

print(f"\nModel's predicted stock average return: {predicted_stocks_avg_return:.5f}")

Benchmarks: 
Average stock return (whole test sample): 1.03164
'Buy' stocks average return: 1.17566
SP500 return: 1.05484

Model's predicted stock average return: 1.03850


### 6. Conclusion
Despite numerous attempts, I was unable to find a model that surpasses the SP500 benchmark. The best XGBoost model achieved only 43% precision and a three-month ROI of 3.9% (with an equal distribution of capital). While this exceeds the average ROI for the entire test sample, it does not outperform the SP500 benchmark (the key difference is that the SP500 has an unequal capital distribution, meaning that some large companies may contribute significantly more to the overall ROI)

## Second attempt (more data)
In this attempt, data with a broader date range (from around 2008 for most companies) was used. This range includes several recession periods, providing greater diversity for the model to learn from. Additionally, columns containing information on the companies' Return on Assets and Return on Invested Capital were added.

### 1. Read & preprocess the data

In [2]:
data = pd.read_csv('stocks_data4.csv')
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ticker,Year,Month,Price,MA Ratio,Buy,Result,ROE,ROA,ROI,Insider Ownership Growth,Institutional Ownership Growth,Forecast EPS Growth,Avg 2Q EPS Growth,Avg 2Q EPS Surprise,YoY EPS Growth,Sector Performance,Market Performance,Benchmark SP500 Performance
count,61517.0,61517,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0
unique,,417,,,,,,,,,,,,,,,,,,
top,,CPB,,,,,,,,,,,,,,,,,,
freq,,225,,,,,,,,,,,,,,,,,,
mean,30758.0,,2015.561064,6.475576,86.090852,1.005282,0.454102,1.035127,0.262087,0.08961,0.15072,0.031192,0.025757,0.090401,0.149241,11.124411,3969275000000.0,1.641091,1.603703,1.023901
std,17758.572592,,5.130421,3.448068,145.204509,0.043844,0.497893,0.139926,8.540118,1.104599,1.285496,0.891108,0.250149,1.732801,1.515826,52.925722,430114400000000.0,7.192112,6.071008,0.072566
min,0.0,,2005.0,1.0,0.17,0.580721,0.0,0.110349,-347.69357,-1.36977,-15.3364,-0.994779,-0.930676,-0.992366,-58.668103,-93.235,-1.0,-49.501466,-24.778692,0.690014
25%,15379.0,,2012.0,3.0,26.35,0.982233,0.0,0.955759,0.09591,0.03754,0.06528,-0.003494,-0.020228,-0.156716,-0.036162,1.27,0.01214575,-2.055089,-1.243019,0.989798
50%,30758.0,,2016.0,6.0,49.24,1.006824,0.0,1.035871,0.1664,0.07018,0.11467,0.0,0.00079,-0.016129,0.047591,4.74,0.1157895,2.177343,2.256661,1.034544
75%,46137.0,,2020.0,9.0,95.34,1.029836,1.0,1.113424,0.26634,0.11323,0.18696,0.008696,0.027434,0.11,0.157784,10.68,0.2425068,5.800866,5.36785,1.066909


In [3]:
cut_off_year = 2019

data = data.reset_index(drop=True)
train_data = data[(data['Year'] < cut_off_year) & ((data['Year'] != cut_off_year - 1) | (data['Month'] < 9))]
test_data = data[data['Year'] >= cut_off_year]
x_train = train_data.drop(['Year', 'Buy', 'Month', 'Ticker', 'Result', 'Benchmark SP500 Performance', 'Price', data.columns[0]], axis=1)
y_train = train_data['Buy']
x_test = test_data.drop(['Year', 'Buy', 'Month', 'Ticker', 'Result', 'Benchmark SP500 Performance', 'Price', data.columns[0]], axis=1)
y_test = test_data['Buy']

print(f"Amount of train data: {len(train_data)}")
print(f"Amount of test data: {len(test_data)}")

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

Amount of train data: 40199
Amount of test data: 19829


### 2. Split the data and train the model

In [4]:
xg_clf = xgb.XGBClassifier(objective='binary:logistic')
param_dist = {
    'max_depth': randint(3, 8),
    'subsample': uniform(0.5, 0.5),
    'eta': uniform(0.01, 0.2),
    'colsample_bytree': uniform(0.6, 0.4),
    'min_child_weight': randint(1, 5),
    'gamma': uniform(0.0, 0.4),
    'lambda': uniform(0.0, 10.0),
    'alpha': uniform(0.0, 10.0),
    'scale_pos_weight': uniform(0.7, 1.2)
}

random_search = RandomizedSearchCV(estimator=xg_clf,
                                   param_distributions=param_dist,
                                   n_iter=100, 
                                   cv=10,
                                   scoring='precision',
                                   n_jobs=-1,
                                   verbose=2,
                                   random_state=42)
random_search.fit(x_train, y_train)
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

y_pred = best_model.predict(x_test)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best Parameters: {'alpha': 3.418796667164016, 'colsample_bytree': 0.6367196263253767, 'eta': 0.028831397653712024, 'gamma': 0.12456532375651769, 'lambda': 9.795105286215085, 'max_depth': 7, 'min_child_weight': 4, 'scale_pos_weight': 0.7205933221981002, 'subsample': 0.8816822115019555}


### 3. Evaluation

In [5]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.5960461949669675
Precision: 0.5072886297376094
Confusion Matrix:
[[11471   338]
 [ 7672   348]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.97      0.74     11809
           1       0.51      0.04      0.08      8020

    accuracy                           0.60     19829
   macro avg       0.55      0.51      0.41     19829
weighted avg       0.56      0.60      0.47     19829



In [6]:
test_results = test_data.copy()
test_results['Predicted_Buy'] = y_pred

predicted_stocks_to_buy = test_results[test_results['Predicted_Buy'] == 1]
predicted_stocks_avg_return = predicted_stocks_to_buy['Result'].mean()

avg_stock_return = test_results['Result'].mean()

best_stocks = test_results[test_results['Buy'] == 1]
avg_best_stocks_return = best_stocks['Result'].mean()

sp500_return = predicted_stocks_to_buy['Benchmark SP500 Performance'].mean()

print("Benchmarks: ")
print(f"Average stock return (whole test sample): {avg_stock_return:.5f}")
print(f"'Buy' stocks average return: {avg_best_stocks_return:.5f}")
print(f"SP500 return: {sp500_return:.5f}")

print(f"\nModel's predicted stock average return: {predicted_stocks_avg_return:.5f}")

Benchmarks: 
Average stock return (whole test sample): 1.03403
'Buy' stocks average return: 1.16015
SP500 return: 1.02983

Model's predicted stock average return: 1.05702


### 4. Conclusions
Again, with greater dataset to train on, the model achieved much higher results. Not only the average 3-Month investment return was almost 6%, but also it's much higher than SP500 result in the same investment periods. I suspect one can build profitable investment strategy with this prediction model.

In [7]:
available_cash = 1000000
portfolio_worth = 1000000
current_buys = {}

backtest_data = test_results.copy()
backtest_data = backtest_data.sort_values(by=['Year', 'Month'])

def sell_stock(ticker, price):
    prev_price = current_buys[ticker]['price']
    amount = current_buys[ticker]['shares']

    global portfolio_worth, available_cash
    portfolio_worth -= prev_price * amount
    portfolio_worth += price * amount
    available_cash += price * amount

for index, row in backtest_data.iterrows():
    ticker = row['Ticker']
    prediction = row['Predicted_Buy']
    price = row['Price']

    if prediction == True:
        if ticker not in current_buys:
            allowed_spend = int(portfolio_worth / 5)
            
            if allowed_spend > available_cash:
                allowed_spend = available_cash

            if allowed_spend < portfolio_worth / 50:
                continue
                
            amount = int(allowed_spend / price)
            available_cash -= amount * price
            
            current_buys[ticker] = {'price': price, 'shares': amount, 'last_price': price}
            print(f"Added {ticker} to current_buys for {row['Year']}-{row['Month']} with price {price}")
        else:
            if price < current_buys[ticker]['price'] * 0.95: # Stop loss
                prev_price = current_buys[ticker]['price']
                
                sell_stock(ticker, price)
                del current_buys[ticker]
                
                print(f"Removed {ticker} from current_buys for {row['Year']}-{row['Month']} with price {price}; prev price: {prev_price}")
                print(f"New net worth: {portfolio_worth}")
            else:
                current_buys[ticker]['last_price'] = price
            
    else:
        if ticker in current_buys:
            prev_price = current_buys[ticker]['price']

            sell_stock(ticker, price)            
            del current_buys[ticker]
            
            print(f"Removed {ticker} from current_buys for {row['Year']}-{row['Month']} with price {price}; prev price: {prev_price}")
            print(f"New net worth: {portfolio_worth}")

for ticker in current_buys:
    sell_stock(ticker, current_buys[ticker]['last_price'])
    
    print(f"Removed {ticker} from current_buys")
    
print(portfolio_worth)

Added AMZN to current_buys for 2019-3 with price 83.59
Added ANSS to current_buys for 2019-3 with price 182.23
Added BXP to current_buys for 2019-3 with price 102.66
Added DIS to current_buys for 2019-3 with price 111.75
Added EXR to current_buys for 2019-3 with price 78.72
Removed AMZN from current_buys for 2019-4 with price 90.71; prev price: 83.59
New net worth: 1017031.0399999999
Removed ANSS from current_buys for 2019-4 with price 187.45; prev price: 182.23
New net worth: 1022757.38
Removed BXP from current_buys for 2019-4 with price 106.23; prev price: 102.66
New net worth: 1029711.74
Removed DIS from current_buys for 2019-4 with price 110.28; prev price: 111.75
New net worth: 1027081.91
Removed EXR from current_buys for 2019-4 with price 84.05; prev price: 78.72
New net worth: 1040620.1100000001
Added CTRA to current_buys for 2019-5 with price 20.02
Added NI to current_buys for 2019-5 with price 22.88
Added UDR to current_buys for 2019-5 with price 36.89
Removed CTRA from curren