# XGBoost Classification
This program is an attempt to predict valuable stocks using XGBoost classification.

### 1. Imports

In [84]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, make_scorer
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform

### 2. Load the data

In [85]:
# Read the stock dataset from CSV and display summary statistics for every
# column (include='all' also covers the non-numeric ones such as 'Ticker').
DATA_PATH = 'stocks_data.csv'
data = pd.read_csv(DATA_PATH)
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ticker,Year,Month,MA Ratio,Buy,Result,ROE,Insider Ownership Growth,Institutional Ownership Growth,Forecast EPS Growth,Avg 2Q EPS Growth,Avg 2Q EPS Surprise,YoY EPS Growth,Sector Performance,Market Performance,Benchmark SP500 Performance
count,14854.0,14854,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0
unique,,393,,,,,,,,,,,,,,,
top,,AWK,,,,,,,,,,,,,,,
freq,,55,,,,,,,,,,,,,,,
mean,7426.5,,2020.669113,6.22324,1.004148,0.415848,1.032817,39.494365,0.015486,0.026708,0.057775,0.181477,13.755183,0.369529,1.488003,1.438443,1.029219
std,4288.124784,,1.428016,3.520757,0.046473,0.492884,0.14926,181.839873,0.269863,0.230675,2.136724,2.111809,46.751483,3.637998,8.164589,7.038394,0.077491
min,0.0,,2018.0,1.0,0.580721,0.0,0.259712,-613.743387,-0.633527,-0.714136,-0.992366,-45.05,-65.625,-0.961538,-44.900728,-22.795349,0.769903
25%,3713.25,,2019.0,3.0,0.977766,0.0,0.944153,10.160854,-0.00135,-0.023114,-0.184264,-0.040838,2.015,0.017606,-3.453784,-3.160007,0.983314
50%,7426.5,,2021.0,6.0,1.00536,0.0,1.028547,19.251991,0.0,-0.000648,-0.039062,0.045662,6.055,0.130688,1.496227,2.069271,1.043101
75%,11139.75,,2022.0,9.0,1.031953,1.0,1.113949,31.949569,0.008,0.033653,0.086957,0.154182,13.135,0.275148,6.429508,5.50743,1.080728


### 3. Split the data into train and test sets, standardise the data

In [86]:
# Chronological hold-out: rows with Year up to and including 2022 are used for
# fitting, rows from later years are kept aside for evaluation.
data = data.reset_index(drop=True)
train_data = data.loc[data['Year'] <= 2022]
test_data = data.loc[data['Year'] > 2022]

# Columns excluded from the feature matrix: the target ('Buy'), the date parts
# and ticker, the two columns later used as realised returns, and the first
# column of the frame (presumably a leftover index column - TODO confirm).
non_feature_cols = [
    'Year', 'Buy', 'Month', 'Ticker', 'Result',
    'Benchmark SP500 Performance', data.columns[0],
]

x_train = train_data.drop(columns=non_feature_cols)
x_test = test_data.drop(columns=non_feature_cols)
y_train = train_data['Buy']
y_test = test_data['Buy']

# The scaling statistics are learned from the training rows only and then
# reused unchanged for the test rows.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

### 4. Train the model with randomized search

In [89]:
# Search space for the randomized hyper-parameter search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
param_dist = {
    'max_depth': randint(low=3, high=8),              # 3 .. 7
    'subsample': uniform(loc=0.5, scale=0.5),         # 0.5 .. 1.0
    'eta': uniform(loc=0.01, scale=0.2),              # 0.01 .. 0.21
    'colsample_bytree': uniform(loc=0.6, scale=0.4),  # 0.6 .. 1.0
    'min_child_weight': randint(low=1, high=5),       # 1 .. 4
    'gamma': uniform(loc=0.0, scale=0.4),             # 0.0 .. 0.4
    'lambda': uniform(loc=0.0, scale=10.0),           # 0.0 .. 10.0
    'alpha': uniform(loc=0.0, scale=10.0),            # 0.0 .. 10.0
    'scale_pos_weight': uniform(loc=0.8, scale=5.0),  # 0.8 .. 5.8
}

xg_clf = xgb.XGBClassifier(objective='binary:logistic')

# 50 sampled candidates x 10 folds, ranked by precision on the positive class.
# NOTE(review): cv=10 builds folds without regard to the Year/Month ordering of
# the rows - consider whether a time-aware splitter is more appropriate here.
search_settings = dict(
    n_iter=50,
    cv=10,
    scoring='precision',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search = RandomizedSearchCV(xg_clf, param_dist, **search_settings)
random_search.fit(x_train, y_train)

# Keep the best candidate found by the search.
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Predictions on the held-out test rows, used by the evaluation cells below.
y_pred = best_model.predict(x_test)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Best Parameters: {'alpha': 3.418796667164016, 'colsample_bytree': 0.6367196263253767, 'eta': 0.028831397653712024, 'gamma': 0.12456532375651769, 'lambda': 9.795105286215085, 'max_depth': 7, 'min_child_weight': 4, 'scale_pos_weight': 0.8858055091587512, 'subsample': 0.8816822115019555}


### 5. Evaluation

In [90]:
# Classification metrics on the held-out test rows (true labels vs. predictions).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# One print call, one item per line; the emitted text matches six separate prints.
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    'Confusion Matrix:',
    conf_matrix,
    'Classification Report:',
    class_report,
    sep='\n',
)

Accuracy: 0.6574923547400612
Precision: 0.43434343434343436
Confusion Matrix:
[[1032   56]
 [ 504   43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.95      0.79      1088
           1       0.43      0.08      0.13       547

    accuracy                           0.66      1635
   macro avg       0.55      0.51      0.46      1635
weighted avg       0.59      0.66      0.57      1635



In [92]:
# Attach the predictions to a copy of the test rows (test_data is not modified).
test_results = test_data.assign(Predicted_Buy=y_pred)

# Row subsets: what the model selected vs. the rows actually labelled as 'Buy'.
predicted_stocks_to_buy = test_results.loc[test_results['Predicted_Buy'] == 1]
best_stocks = test_results.loc[test_results['Buy'] == 1]

# Mean of the 'Result' column for each group.
predicted_stocks_avg_return = predicted_stocks_to_buy['Result'].mean()
avg_stock_return = test_results['Result'].mean()
avg_best_stocks_return = best_stocks['Result'].mean()

# The SP500 figure is averaged over the model-selected rows only, not over the
# whole test sample.
sp500_return = predicted_stocks_to_buy['Benchmark SP500 Performance'].mean()

print(
    "Benchmarks: ",
    f"Average stock return (whole test sample): {avg_stock_return:.5f}",
    f"'Buy' stocks average return: {avg_best_stocks_return:.5f}",
    f"SP500 return: {sp500_return:.5f}",
    f"\nModel's predicted stock average return: {predicted_stocks_avg_return:.5f}",
    sep='\n',
)

Benchmarks: 
Average stock return (whole test sample): 1.03164
'Buy' stocks average return: 1.17566
SP500 return: 1.05484

Model's predicted stock average return: 1.03850


### 6. Conclusion
Despite numerous attempts, I was unable to find a model that surpasses the SP500 benchmark. The best XGBoost model achieved only 43% precision and a three-month ROI of 3.9% (with capital distributed equally across the selected stocks). While this exceeds the average ROI for the entire test sample (3.2%), it does not outperform the SP500 benchmark (5.5%). One key difference is that the SP500 is not equally weighted, so a few large companies may contribute significantly more to its overall ROI.