# Logistic Regression
This program runs logistic regression to predict whether a stock is a good option to buy. A stock is classified as 'Buy' if it beats the S&P 500 and its ROI is above 2%.

### 1. Imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

## First attempt (fewer data)

### 2. Load the data

In [11]:
# Load the smaller dataset used for the first attempt and show summary
# statistics for every column (numeric and non-numeric alike).
data = pd.read_csv(filepath_or_buffer='stocks_data.csv')
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ticker,Year,Month,MA Ratio,Buy,Result,ROE,Insider Ownership Growth,Institutional Ownership Growth,Forecast EPS Growth,Avg 2Q EPS Growth,Avg 2Q EPS Surprise,YoY EPS Growth,Sector Performance,Market Performance,Benchmark SP500 Performance
count,14854.0,14854,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0,14854.0
unique,,393,,,,,,,,,,,,,,,
top,,AWK,,,,,,,,,,,,,,,
freq,,55,,,,,,,,,,,,,,,
mean,7426.5,,2020.669113,6.22324,1.004148,0.415848,1.032817,39.494365,0.015486,0.026708,0.057775,0.181477,13.755183,0.369529,1.488003,1.438443,1.029219
std,4288.124784,,1.428016,3.520757,0.046473,0.492884,0.14926,181.839873,0.269863,0.230675,2.136724,2.111809,46.751483,3.637998,8.164589,7.038394,0.077491
min,0.0,,2018.0,1.0,0.580721,0.0,0.259712,-613.743387,-0.633527,-0.714136,-0.992366,-45.05,-65.625,-0.961538,-44.900728,-22.795349,0.769903
25%,3713.25,,2019.0,3.0,0.977766,0.0,0.944153,10.160854,-0.00135,-0.023114,-0.184264,-0.040838,2.015,0.017606,-3.453784,-3.160007,0.983314
50%,7426.5,,2021.0,6.0,1.00536,0.0,1.028547,19.251991,0.0,-0.000648,-0.039062,0.045662,6.055,0.130688,1.496227,2.069271,1.043101
75%,11139.75,,2022.0,9.0,1.031953,1.0,1.113949,31.949569,0.008,0.033653,0.086957,0.154182,13.135,0.275148,6.429508,5.50743,1.080728


### 3. Clean the data
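Let's remove outliers: rows above the 99.9th percentile of selected features are dropped, and for 'Avg 2Q EPS Growth' rows below the 0.1th percentile are dropped as well.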

In [12]:
# Features whose extreme high values are trimmed at the 99.9th percentile.
upper_trimmed_columns = [
    'ROE',
    'Insider Ownership Growth',
    'Institutional Ownership Growth',
    'Forecast EPS Growth',
    'Avg 2Q EPS Growth',
    'YoY EPS Growth',
]

for column in upper_trimmed_columns:
    # The quantile is taken on the frame as already filtered by the previous
    # columns, so the cuts are applied one after another, not simultaneously.
    data = data[data[column] <= data[column].quantile(0.999)]

# 'Avg 2Q EPS Growth' is the only feature that is also trimmed on the low side.
lower_bound = data['Avg 2Q EPS Growth'].quantile(0.001)
data = data[data['Avg 2Q EPS Growth'] >= lower_bound]

data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ticker,Year,Month,MA Ratio,Buy,Result,ROE,Insider Ownership Growth,Institutional Ownership Growth,Forecast EPS Growth,Avg 2Q EPS Growth,Avg 2Q EPS Surprise,YoY EPS Growth,Sector Performance,Market Performance,Benchmark SP500 Performance
count,14757.0,14757,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0,14757.0
unique,,391,,,,,,,,,,,,,,,
top,,IQV,,,,,,,,,,,,,,,
freq,,55,,,,,,,,,,,,,,,
mean,7431.564546,,2020.665447,6.222877,1.004135,0.415735,1.032886,35.930619,0.009596,0.021883,0.011317,0.145736,13.522146,0.272787,1.486683,1.441679,1.029255
std,4285.343004,,1.428264,3.521282,0.046414,0.492865,0.149125,99.720733,0.072866,0.133643,0.519399,0.806091,45.365794,1.069267,8.159292,7.043774,0.077495
min,0.0,,2018.0,1.0,0.580721,0.0,0.259712,-613.743387,-0.633527,-0.714136,-0.992366,-3.489011,-65.625,-0.961538,-44.900728,-22.795349,0.769903
25%,3726.0,,2019.0,3.0,0.977791,0.0,0.944257,10.176162,-0.001361,-0.023167,-0.182927,-0.040646,1.995,0.018152,-3.453784,-3.160007,0.983314
50%,7430.0,,2021.0,6.0,1.005388,0.0,1.028536,19.293997,0.0,-0.000699,-0.038835,0.045532,6.01,0.130396,1.496227,2.069271,1.043101
75%,11144.0,,2022.0,9.0,1.031914,1.0,1.113877,31.949569,0.007968,0.033287,0.086705,0.152364,13.08,0.274611,6.429508,5.50743,1.080728


### 4. Split the data into train and test sets, standardise the data

In [13]:
data = data.reset_index(drop=True)

# Time-based split: everything up to and including 2022 is used for training,
# later years are held out for testing.
train_data = data[data['Year'] <= 2022]
test_data = data[data['Year'] > 2022]

# Columns that must not be fed to the model: the target, identifiers/dates,
# the realised outcome columns, and the first column of the frame
# (presumably a leftover index column from the CSV - verify against the file).
non_feature_columns = [
    'Year', 'Buy', 'Month', 'Ticker', 'Result',
    'Benchmark SP500 Performance', data.columns[0],
]

x_train = train_data.drop(columns=non_feature_columns)
x_test = test_data.drop(columns=non_feature_columns)
y_train = train_data['Buy']
y_test = test_data['Buy']

# The scaler is fitted on the training rows only; the test rows are
# transformed with the training mean/std so they do not influence scaling.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### 5. Train the model

In [79]:
# Fit the classifier on the scaled training features and predict the test set.
# NOTE(review): both classes carry the same weight (100), so this does not
# rebalance the classes; it presumably only scales the data term relative to
# the regularisation penalty - confirm that this is intended.
model = LogisticRegression(class_weight={0: 100, 1: 100}, max_iter=10000).fit(x_train, y_train)
y_pred = model.predict(x_test)

### 6. Evaluation

In [80]:
# Compute every metric first, then print the report in one place.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

# Label and value are printed on separate lines.
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')

Accuracy: 0.6596664607782582
Precision: 0.28
Confusion Matrix:
[[1061   18]
 [ 533    7]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.98      0.79      1079
           1       0.28      0.01      0.02       540

    accuracy                           0.66      1619
   macro avg       0.47      0.50      0.41      1619
weighted avg       0.54      0.66      0.54      1619



In [81]:
# Attach the predictions to a copy of the test rows.
test_results = test_data.assign(Predicted_Buy=y_pred)

# Subsets: rows the model flagged as 'Buy' and rows that truly were 'Buy'.
predicted_stocks_to_buy = test_results.loc[test_results['Predicted_Buy'] == 1]
best_stocks = test_results.loc[test_results['Buy'] == 1]

# 'Result' looks like a gross return multiple (values around 1.0) - TODO confirm.
avg_stock_return = test_results['Result'].mean()
avg_best_stocks_return = best_stocks['Result'].mean()
predicted_stocks_avg_return = predicted_stocks_to_buy['Result'].mean()

# The S&P 500 figure is averaged over the rows the model picked only,
# i.e. the benchmark for the same rows as the model's 'Buy' signals.
sp500_return = predicted_stocks_to_buy['Benchmark SP500 Performance'].mean()

print("Benchmarks: ")
print(f"Average stock return (whole test sample): {avg_stock_return:.3f}")
print(f"'Buy' stocks average return: {avg_best_stocks_return:.3f}")
print(f"SP500 return: {sp500_return:.3f}")

print(f"\nModel's predicted stock average return: {predicted_stocks_avg_return:.3f}")

Benchmarks: 
Average stock return (whole test sample): 1.032
'Buy' stocks average return: 1.176
SP500 return: 1.057

Model's predicted stock average return: 1.043


### 7. Conclusion
The results achieved by the model leave much to be desired. Both the low precision and poorer performance compared to the S&P 500 suggest that logistic regression may not be suitable for building an investment strategy in this case. To investigate this further, more data is needed as well as the application of feature engineering. For example, by modifying the class weights and giving 20% more weight to class 0, the model achieves an ROI of 13%, which is a very good result compared to the S&P 500 (5%). However, the recall is extremely low, below 1%, so there is a high probability that this is just a fluke.

## Second attempt (more data)
In this attempt, data with a broader date range (from around 2008 for most companies) was used. This range includes several recession periods, providing greater diversity for the model to learn from. Additionally, columns containing information on the companies' Return on Assets and Return on Invested Capital were added.

### 1. Read & preprocess the data

In [3]:
# Load the larger dataset used for the second attempt (it replaces the frame
# from the first attempt) and show summary statistics for every column.
data = pd.read_csv(filepath_or_buffer='stocks_data4.csv')
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ticker,Year,Month,Price,MA Ratio,Buy,Result,ROE,ROA,ROI,Insider Ownership Growth,Institutional Ownership Growth,Forecast EPS Growth,Avg 2Q EPS Growth,Avg 2Q EPS Surprise,YoY EPS Growth,Sector Performance,Market Performance,Benchmark SP500 Performance
count,61517.0,61517,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0,61517.0
unique,,417,,,,,,,,,,,,,,,,,,
top,,CPB,,,,,,,,,,,,,,,,,,
freq,,225,,,,,,,,,,,,,,,,,,
mean,30758.0,,2015.561064,6.475576,86.090852,1.005282,0.454102,1.035127,0.262087,0.08961,0.15072,0.031192,0.025757,0.090401,0.149241,11.124411,3969275000000.0,1.641091,1.603703,1.023901
std,17758.572592,,5.130421,3.448068,145.204509,0.043844,0.497893,0.139926,8.540118,1.104599,1.285496,0.891108,0.250149,1.732801,1.515826,52.925722,430114400000000.0,7.192112,6.071008,0.072566
min,0.0,,2005.0,1.0,0.17,0.580721,0.0,0.110349,-347.69357,-1.36977,-15.3364,-0.994779,-0.930676,-0.992366,-58.668103,-93.235,-1.0,-49.501466,-24.778692,0.690014
25%,15379.0,,2012.0,3.0,26.35,0.982233,0.0,0.955759,0.09591,0.03754,0.06528,-0.003494,-0.020228,-0.156716,-0.036162,1.27,0.01214575,-2.055089,-1.243019,0.989798
50%,30758.0,,2016.0,6.0,49.24,1.006824,0.0,1.035871,0.1664,0.07018,0.11467,0.0,0.00079,-0.016129,0.047591,4.74,0.1157895,2.177343,2.256661,1.034544
75%,46137.0,,2020.0,9.0,95.34,1.029836,1.0,1.113424,0.26634,0.11323,0.18696,0.008696,0.027434,0.11,0.157784,10.68,0.2425068,5.800866,5.36785,1.066909


In [4]:
# Same trimming as in the first attempt: cut the top 0.1% of each listed
# feature. ROA and ROI are not in this list, so they are left untrimmed.
upper_trimmed_columns = [
    'ROE',
    'Insider Ownership Growth',
    'Institutional Ownership Growth',
    'Forecast EPS Growth',
    'Avg 2Q EPS Growth',
    'YoY EPS Growth',
]

for column in upper_trimmed_columns:
    # Each quantile is computed on the frame left over from the previous cut.
    data = data[data[column] <= data[column].quantile(0.999)]

# Low-side trimming is applied to 'Avg 2Q EPS Growth' only.
lower_bound = data['Avg 2Q EPS Growth'].quantile(0.001)
data = data[data['Avg 2Q EPS Growth'] >= lower_bound]

data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Ticker,Year,Month,Price,MA Ratio,Buy,Result,ROE,ROA,ROI,Insider Ownership Growth,Institutional Ownership Growth,Forecast EPS Growth,Avg 2Q EPS Growth,Avg 2Q EPS Surprise,YoY EPS Growth,Sector Performance,Market Performance,Benchmark SP500 Performance
count,61087.0,61087,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0,61087.0
unique,,417,,,,,,,,,,,,,,,,,,
top,,SHW,,,,,,,,,,,,,,,,,,
freq,,225,,,,,,,,,,,,,,,,,,
mean,30760.834187,,2015.569221,6.478662,86.040855,1.005279,0.453894,1.035114,0.111004,0.082006,0.141937,0.011203,0.020213,0.048165,0.135944,10.844773,0.20432,1.642579,1.606065,1.023948
std,17750.016433,,5.121289,3.449648,144.358576,0.043759,0.497874,0.139492,4.354249,0.065388,0.227534,0.160279,0.158886,0.661588,0.638213,45.682348,0.70825,7.179243,6.067306,0.072557
min,0.0,,2005.0,1.0,0.17,0.580721,0.0,0.110349,-347.69357,-1.36977,-15.3364,-0.994779,-0.930676,-0.992366,-7.247826,-93.235,-1.0,-49.501466,-24.778692,0.690014
25%,15398.5,,2012.0,3.0,26.39,0.982252,0.0,0.955893,0.09613,0.03773,0.065425,-0.003509,-0.020261,-0.15625,-0.035667,1.275,0.012584,-2.055089,-1.243019,0.989798
50%,30760.0,,2016.0,6.0,49.29,1.006816,0.0,1.035912,0.16667,0.07025,0.11481,0.0,0.000723,-0.016129,0.047619,4.73,0.115741,2.177343,2.256661,1.034544
75%,46143.5,,2020.0,9.0,95.41,1.029818,1.0,1.113338,0.266075,0.11329,0.18691,0.008621,0.027216,0.108752,0.157197,10.6425,0.241803,5.800866,5.36785,1.066909


### 2. Split the data and train the model

In [5]:
cut_off_year = 2019

data = data.reset_index(drop=True)

# Training rows: strictly before the cut-off year, excluding September-December
# of the year right before it. Presumably this gap keeps the forward-looking
# label of the last training rows out of the test period - TODO confirm.
before_cut_off = data['Year'] < cut_off_year
outside_gap = (data['Year'] != cut_off_year - 1) | (data['Month'] < 9)
train_data = data[before_cut_off & outside_gap]

# Test rows: the cut-off year and everything after it.
test_data = data[data['Year'] >= cut_off_year]

# Target, identifiers/dates, realised outcomes, raw price and the first column
# of the frame (presumably a leftover CSV index - verify) are not features.
non_feature_columns = [
    'Year', 'Buy', 'Month', 'Ticker',
    'Result', 'Benchmark SP500 Performance',
    'Price', data.columns[0],
]

x_train = train_data.drop(columns=non_feature_columns)
x_test = test_data.drop(columns=non_feature_columns)
y_train = train_data['Buy']
y_test = test_data['Buy']

print(f"Amount of train data: {len(train_data)}")
print(f"Amount of test data: {len(test_data)}")

# Fit the scaler on training rows only and reuse its statistics for the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

Amount of train data: 39907
Amount of test data: 19700


In [16]:
# Class 0 is weighted 1.3x class 1, which presumably makes the model more
# reluctant to emit a 'Buy' signal (fewer, more selective positives).
model = LogisticRegression(class_weight={0: 130, 1: 100}, max_iter=10000).fit(x_train, y_train)
y_pred = model.predict(x_test)

### 3. Evaluate the model

In [17]:
# Compute the metrics up front; the printed report below keeps the same order.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

# Each label is followed by its value on the next line.
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')

Accuracy: 0.596243654822335
Precision: 0.5211864406779662
Confusion Matrix:
[[11623   113]
 [ 7841   123]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.99      0.75     11736
           1       0.52      0.02      0.03      7964

    accuracy                           0.60     19700
   macro avg       0.56      0.50      0.39     19700
weighted avg       0.57      0.60      0.46     19700



In [18]:
# Test rows plus the model's prediction; this frame also feeds the backtest.
test_results = test_data.assign(Predicted_Buy=y_pred)

# Rows flagged by the model vs. rows that were actually labelled 'Buy'.
predicted_stocks_to_buy = test_results.loc[test_results['Predicted_Buy'] == 1]
best_stocks = test_results.loc[test_results['Buy'] == 1]

# Mean 'Result' for the whole test sample, the true 'Buy' rows and the picks.
avg_stock_return = test_results['Result'].mean()
avg_best_stocks_return = best_stocks['Result'].mean()
predicted_stocks_avg_return = predicted_stocks_to_buy['Result'].mean()

# Benchmark averaged over the model's picks only, so it covers the same rows
# as the model's average return.
sp500_return = predicted_stocks_to_buy['Benchmark SP500 Performance'].mean()

print("Benchmarks: ")
print(f"Average stock return (whole test sample): {avg_stock_return:.3f}")
print(f"'Buy' stocks average return: {avg_best_stocks_return:.3f}")
print(f"SP500 return: {sp500_return:.3f}")

print(f"\nModel's predicted stock average return: {predicted_stocks_avg_return:.3f}")

Benchmarks: 
Average stock return (whole test sample): 1.034
'Buy' stocks average return: 1.160
SP500 return: 1.051

Model's predicted stock average return: 1.097


### 4. Conclusions
The model performs significantly better after being retrained on a larger dataset. Despite a modest precision of 52% (overall accuracy is 60%, and recall for the 'Buy' class is only 2%), the model achieved an average ROI of 9.7% per 'Buy' signal. This improvement underscores the positive impact of increasing the dataset size on the model's precision and predictive capabilities. Such a result suggests that the model can be used to build a profitable investment strategy, but it also requires other elements, such as risk management and a precise definition of entry and exit strategies.

### 5. Example strategy - backtest
In this simple example strategy, a stock is purchased for up to 20% of the portfolio value if, in the new month, it is classified as 'Buy' and is not currently in the portfolio. A stock is sold in three cases:
- if in the new month the stock is no longer classified as 'Buy',
- if a stop loss occurs, meaning that in the new month the stock's price is more than 5% below its purchase price,
- at the end of the backtest, i.e., in May 2024, all stocks in the portfolio are sold at the last available price.

In [21]:
# Backtest state: cash on hand, portfolio value, and the open positions keyed
# by ticker.
available_cash = 1_000_000
portfolio_worth = 1_000_000
current_buys = dict()

# Replay the test rows in chronological order, working on a copy.
backtest_data = test_results.copy().sort_values(by=['Year', 'Month'])

def sell_stock(ticker, price):
    """Book the sale of the whole position held in ``ticker`` at ``price``.

    Reads the purchase price and share count from ``current_buys[ticker]`` and
    updates two module-level values: ``portfolio_worth`` has the purchase value
    replaced by the sale value, and ``available_cash`` grows by the proceeds.
    The entry in ``current_buys`` is left untouched; callers remove it.
    """
    global portfolio_worth, available_cash

    position = current_buys[ticker]
    entry_value = position['price'] * position['shares']
    proceeds = price * position['shares']

    # Subtract first, then add, as two separate steps so the floating-point
    # result is unchanged.
    portfolio_worth -= entry_value
    portfolio_worth += proceeds
    available_cash += proceeds

# Walk through the test rows in (Year, Month) order and trade on the model's
# signal: open a position on a new 'Buy' signal, close it when the signal
# disappears or the stop loss is hit.
for index, row in backtest_data.iterrows():
    ticker = row['Ticker']
    prediction = row['Predicted_Buy']
    price = row['Price']

    if prediction == True:
        if ticker not in current_buys:
            # Position size: at most one fifth of portfolio_worth. Note that
            # portfolio_worth only changes when a position is sold, so open
            # positions are counted at their purchase value.
            allowed_spend = int(portfolio_worth / 5)
            
            # Never spend more than the cash that is actually available.
            if allowed_spend > available_cash:
                allowed_spend = available_cash

            # Skip the purchase when less than 2% of the portfolio can be spent.
            if allowed_spend < portfolio_worth / 50:
                continue
                
            # Only whole shares are bought.
            # NOTE(review): if price > allowed_spend this yields 0 shares, yet
            # the ticker is still recorded as an open position below.
            amount = int(allowed_spend / price)
            available_cash -= amount * price
            
            current_buys[ticker] = {'price': price, 'shares': amount, 'last_price': price}
            print(f"Added {ticker} to current_buys for {row['Year']}-{row['Month']} with price {price}")
        else:
            # Stop loss: the current price is more than 5% below the purchase
            # price (the stored 'price' is set at purchase and never updated).
            if price < current_buys[ticker]['price'] * 0.95: # Stop loss
                prev_price = current_buys[ticker]['price']
                
                sell_stock(ticker, price)
                del current_buys[ticker]
                
                print(f"Removed {ticker} from current_buys for {row['Year']}-{row['Month']} with price {price}; prev price: {prev_price}")
                print(f"New net worth: {portfolio_worth}")
            else:
                # Still held: remember the latest price for the final liquidation.
                current_buys[ticker]['last_price'] = price
            
    else:
        # The stock is no longer classified as 'Buy': close the position if held.
        if ticker in current_buys:
            prev_price = current_buys[ticker]['price']

            sell_stock(ticker, price)            
            del current_buys[ticker]
            
            print(f"Removed {ticker} from current_buys for {row['Year']}-{row['Month']} with price {price}; prev price: {prev_price}")
            print(f"New net worth: {portfolio_worth}")

# End of the backtest: sell every remaining position at its last recorded price.
# The entries are not removed from current_buys here.
for ticker in current_buys:
    sell_stock(ticker, current_buys[ticker]['last_price'])
    
    print(f"Removed {ticker} from current_buys")
    
# Final portfolio value after all positions have been closed.
print(portfolio_worth)

Added BBWI to current_buys for 2019-1 with price 18.72
Added BG to current_buys for 2019-1 with price 44.69
Added CAG to current_buys for 2019-1 with price 17.39
Added EOG to current_buys for 2019-1 with price 71.04
Added IDXX to current_buys for 2019-1 with price 182.46
Removed CAG from current_buys for 2019-2 with price 17.62; prev price: 17.39
New net worth: 1002645.0
Removed EOG from current_buys for 2019-2 with price 77.68; prev price: 71.04
New net worth: 1021336.6000000001
Added CNC to current_buys for 2019-3 with price 61.34
Added DPZ to current_buys for 2019-3 with price 236.41
Removed CNC from current_buys for 2019-4 with price 54.16; prev price: 61.34
New net worth: 997427.2000000002
Removed DPZ from current_buys for 2019-4 with price 235.82; prev price: 236.41
New net worth: 996917.4400000002
Added MNST to current_buys for 2019-4 with price 26.64
Added INTU to current_buys for 2019-5 with price 235.62
Removed MNST from current_buys for 2019-5 with price 28.97; prev price: 2