In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

c:\Users\peter\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\peter\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll


In [2]:
def normalize_dataframe(df):
    """
    Normalizes all columns in a pandas DataFrame  using MinMaxScaler.

    Args:
        df (pandas.DataFrame): The input DataFrame.

    Returns:
        pandas.DataFrame: The normalized DataFrame.
    """
    scaler = MinMaxScaler()
    columns_to_normalize = [col for col in df.columns]
    df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])
    return df

In [3]:
def add_up_column(df):
    # Create empty 'up' and 'down' columns
    df['up'] = np.nan
    
    # Loop over the rows (skipping the first row)
    for i in range(0, len(df)-1):
        if df.loc[i+1, '4. close'] > df.loc[i, '4. close']:
            df.loc[i, 'up'] = 1
        else:
            df.loc[i, 'up'] = 0
    return df

In [4]:
def combine_csvs_from_folder(folder_path):
    """
    Combines all CSV files in a folder into a single pandas DataFrame also normalizes before combining them.

    Args:
        folder_path (str): The path to the folder containing the CSV files.

    Returns:
        A pandas DataFrame containing the concatenated data from all CSV files in the input folder.
    """
    # Use a list comprehension to read all CSV files in the folder into a list of DataFrames.
    dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Use a list comprehension to get the filenames of all CSV files in the folder.
    filenames = [os.path.splitext(os.path.basename(f))[0] for f in os.listdir(folder_path) if f.endswith('.csv')]

    processed_dfs = []
    i = 0
    for df, filename in zip(dfs, filenames):
        # Dont need the date column
        df = df.drop(['date'], axis=1)
        # normalize the dataframes before combining them
        df = normalize_dataframe(df)
        # for the neural network to understand the company name we need to convert it to a number
        df['company'] = i
        i += 1
        df = add_up_column(df)
        processed_dfs.append(df)
    combined_df = pd.concat(processed_dfs, ignore_index=True)
    
    return combined_df

df_full = combine_csvs_from_folder('market_data/merged_sentiment_data')

df_full.tail()

Unnamed: 0,Chaikin A/D,ADOSC,ADX,ADXR,APO,Aroon Down,Aroon Up,AROONOSC,ATR,Real Upper Band,...,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,sentiment,company,up
8579,1.0,0.67249,0.258971,0.316359,0.75496,0.0,0.85,0.925,0.302798,0.95102,...,0.92954,0.951991,0.944602,0.965739,0.140781,0.0,0.0,0.766338,28,1.0
8580,0.998778,0.664732,0.266415,0.306323,0.732013,0.0,1.0,1.0,0.320301,0.943965,...,1.0,1.0,0.98035,1.0,0.265542,0.0,0.0,0.76478,28,0.0
8581,0.966297,0.581039,0.254699,0.296031,0.687362,0.0,0.95,0.975,0.365638,0.918791,...,0.936562,0.94135,0.893466,0.91673,0.157345,0.0,0.0,0.755167,28,0.0
8582,0.931217,0.468669,0.24695,0.288036,0.623677,1.0,0.9,0.45,0.462123,0.939905,...,0.855206,0.814194,0.785275,0.813039,0.264651,0.0,0.0,0.502183,28,0.0
8583,0.901446,0.36545,0.247488,0.285341,0.568705,1.0,0.85,0.425,0.444907,0.964661,...,0.755206,0.776368,0.733902,0.763803,0.139751,0.0,0.0,0.415032,28,


In [5]:
#we need this for later
def find_indices_of_test_rows(df):
    indices = []
    for i in range(1, len(df)):
        if np.isnan(df.loc[i, 'up']):
            indices.append(i)
    return indices
idxs = find_indices_of_test_rows(df_full)
print(idxs)

[295, 591, 887, 1183, 1479, 1775, 2071, 2367, 2663, 2959, 3255, 3551, 3847, 4143, 4439, 4735, 5031, 5327, 5623, 5919, 6215, 6511, 6807, 7103, 7399, 7695, 7991, 8287, 8583]


In [6]:
# we should one hot encode the company column
# first we need to change it to a string so we can one hot encode it
df_full['company'] = df_full['company'].astype(str)
df_full = pd.get_dummies(df_full, columns=['company'])

In [7]:
#drop nan rows
df_train = df_full.dropna()

In [8]:
df_master = df_train
df_master

Unnamed: 0,Chaikin A/D,ADOSC,ADX,ADXR,APO,Aroon Down,Aroon Up,AROONOSC,ATR,Real Upper Band,...,company_26,company_27,company_28,company_3,company_4,company_5,company_6,company_7,company_8,company_9
0,0.135312,0.619734,0.395503,0.459299,0.422455,0.85,0.35,0.250,0.603205,0.855696,...,0,0,0,0,0,0,0,0,0,0
1,0.161065,0.647807,0.407572,0.449526,0.400695,0.80,0.30,0.250,0.599784,0.843167,...,0,0,0,0,0,0,0,0,0,0
2,0.137926,0.582208,0.409131,0.430583,0.372748,0.75,0.25,0.250,0.577742,0.824053,...,0,0,0,0,0,0,0,0,0,0
3,0.121791,0.499820,0.424835,0.416918,0.317456,0.70,0.20,0.250,0.570574,0.816083,...,0,0,0,0,0,0,0,0,0,0
4,0.077162,0.354716,0.451172,0.410581,0.252092,0.65,0.15,0.250,0.596903,0.814220,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8578,0.970065,0.646422,0.270488,0.335065,0.776503,0.00,0.90,0.950,0.320660,0.959564,...,0,0,1,0,0,0,0,0,0,0
8579,1.000000,0.672490,0.258971,0.316359,0.754960,0.00,0.85,0.925,0.302798,0.951020,...,0,0,1,0,0,0,0,0,0,0
8580,0.998778,0.664732,0.266415,0.306323,0.732013,0.00,1.00,1.000,0.320301,0.943965,...,0,0,1,0,0,0,0,0,0,0
8581,0.966297,0.581039,0.254699,0.296031,0.687362,0.00,0.95,0.975,0.365638,0.918791,...,0,0,1,0,0,0,0,0,0,0


## Without sentiment analysis

In [9]:
df = df_master.drop(columns="sentiment").copy()
X_columns = [c for c in df.columns if c != "up"]
y_column = "up"

In [10]:
X_data = df[X_columns].to_numpy()
y_data = df[y_column].to_numpy()

In [11]:
def train_test(models, X, y):
    """
    input: list(tuple(str, class)) models in the following format:
        [(<name1>, <model1>), (<name2>, <model2>), ...]
    output: list of tuples of name, trained model, and accuracy
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    names = []
    trained_models = []
    accuracies = []
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc_test = accuracy_score(y_test, y_pred)
        acc_train = accuracy_score(y_train, model.predict(X_train))

        names.append(name)
        trained_models.append(model)
        accuracies.append(acc_test)
        print(f"{name} got a train acc of {acc_train} and a test acc of {acc_test}")

    return [(a,b,c) for a,b,c in zip(names, trained_models, accuracies)]

In [12]:
xgb_model = GradientBoostingClassifier()
ada_model = AdaBoostClassifier()
bag_model = BaggingClassifier()
svc_model = SVC()
sgd_model = SGDClassifier()
mlp_model = MLPClassifier(max_iter=500)
knn_model = KNeighborsClassifier()

models = [("xgb", GradientBoostingClassifier()),
           ("ada", AdaBoostClassifier()), 
            ("bag", BaggingClassifier()), 
             ("svc", SVC()), 
              ("sgd", SGDClassifier()), 
               ("mlp", MLPClassifier(max_iter=500))]
stc_model = StackingClassifier(models)

test_models = [("Grad. Boost", xgb_model),
               ("AdaBoost", ada_model),
               ("Bagging", bag_model),
               ("SVC", svc_model),
               ("SGD", sgd_model),
               ("MLP", mlp_model),
               #("KNN", knn_model),
               ("Stacking", stc_model)
               ]

tech_train_results = train_test(test_models, X_data, y_data)

Grad. Boost got a train acc of 0.7086023381608795 and a test acc of 0.580028328611898
AdaBoost got a train acc of 0.6290350724131915 and a test acc of 0.5601983002832861
Bagging got a train acc of 0.9853428720991101 and a test acc of 0.5470963172804533
SVC got a train acc of 0.6389809806316524 and a test acc of 0.5435552407932012
SGD got a train acc of 0.5534810678764613 and a test acc of 0.5276203966005666




MLP got a train acc of 0.8689582969813296 and a test acc of 0.5601983002832861




Stacking got a train acc of 0.9164194730413541 and a test acc of 0.5892351274787535


## With Sentiment Analysis

In [13]:
df = df_master.copy()
X_columns = [c for c in df.columns if c != "up"]
y_column = "up"

In [14]:
X_data = df[X_columns].to_numpy()
y_data = df[y_column].to_numpy()

In [15]:
xgb_model = GradientBoostingClassifier()
ada_model = AdaBoostClassifier()
bag_model = BaggingClassifier()
svc_model = SVC()
sgd_model = SGDClassifier()
mlp_model = MLPClassifier(max_iter=500)
knn_model = KNeighborsClassifier()

models = [("xgb", GradientBoostingClassifier()),
           ("ada", AdaBoostClassifier()), 
            ("bag", BaggingClassifier()), 
             ("svc", SVC()), 
              ("sgd", SGDClassifier()), 
               ("mlp", MLPClassifier(max_iter=500))]
stc_model = StackingClassifier(models)

test_models = [("Grad. Boost", xgb_model),
               ("AdaBoost", ada_model),
               ("Bagging", bag_model),
               ("SVC", svc_model),
               ("SGD", sgd_model),
               ("MLP", mlp_model),
               #("KNN", knn_model),
               ("Stacking", stc_model),
               ]

sentiment_train_results = train_test(test_models, X_data, y_data)

Grad. Boost got a train acc of 0.7044145873320538 and a test acc of 0.5881728045325779
AdaBoost got a train acc of 0.6234514046414238 and a test acc of 0.5626770538243626
Bagging got a train acc of 0.9848194032455069 and a test acc of 0.5506373937677054
SVC got a train acc of 0.6344442505670913 and a test acc of 0.5347025495750708
SGD got a train acc of 0.5564473913802129 and a test acc of 0.5460339943342776




MLP got a train acc of 0.8816960390856744 and a test acc of 0.5524079320113314




Stacking got a train acc of 0.860059326470075 and a test acc of 0.5828611898016998


## With Sentiment Analysis Plus Additional Previous Day's news

In [27]:
df = df_master.copy()
for i in range(1, 7):
    df[f"sentiment_{i}"] = df["sentiment"].shift(i)
df.dropna(inplace=True)

In [28]:
X_columns = [c for c in df.columns if c != "up"]
y_column = "up"

In [29]:
X_data = df[X_columns].to_numpy()
y_data = df[y_column].to_numpy()

In [30]:
xgb_model = GradientBoostingClassifier()
ada_model = AdaBoostClassifier()
bag_model = BaggingClassifier()
svc_model = SVC()
sgd_model = SGDClassifier()
mlp_model = MLPClassifier(max_iter=500)
knn_model = KNeighborsClassifier()

models = [("xgb", GradientBoostingClassifier()),
           ("ada", AdaBoostClassifier()), 
            ("bag", BaggingClassifier()), 
             ("svc", SVC()), 
              ("sgd", SGDClassifier()), 
               ("mlp", MLPClassifier(max_iter=500))]
stc_model = StackingClassifier(models)

test_models = [("Grad. Boost", xgb_model),
               ("AdaBoost", ada_model),
               ("Bagging", bag_model),
               ("SVC", svc_model),
               ("SGD", sgd_model),
               ("MLP", mlp_model),
               #("KNN", knn_model),
               ("Stacking", stc_model),
               ]

sentiment_p_train_results = train_test(test_models, X_data, y_data)

Grad. Boost got a train acc of 0.7246376811594203 and a test acc of 0.5744153082919915
AdaBoost got a train acc of 0.6280775275013096 and a test acc of 0.5655563430191354
Bagging got a train acc of 0.9830626855247075 and a test acc of 0.5712260807937632
SVC got a train acc of 0.6403003317618299 and a test acc of 0.5460666194188519
SGD got a train acc of 0.5763925266282521 and a test acc of 0.5439404677533664




MLP got a train acc of 0.8894709271870089 and a test acc of 0.5744153082919915




Stacking got a train acc of 0.9032652348524532 and a test acc of 0.5889440113394756


## Test predictions

In [20]:
# tomorrow_dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in os.listdir(folder_path) if f.endswith('.csv')]
#     for tom_df, filename in zip(tomorrow_dfs, filenames):
        
#         df = df.drop(['date'], axis=1)
#         # normalize the dataframes before combining them
#         df = normalize_dataframe(df)
#         # for the neural network to understand the company name we need to convert it to a number
#         df['company'] = i
#         i += 1
#         df = add_up_column(df)
#         processed_dfs.append(df)
#     combined_df = pd.concat(processed_dfs, ignore_index=True)
    
#     return combined_df

# df_full = combine_csvs_from_folder('market_data/merged_sentiment_data')

In [21]:
df = df_master.copy()
X_columns = [c for c in df.columns if c != "up"]
y_column = "up"

In [22]:
X_data = df[X_columns].to_numpy()
y_data = df[y_column].to_numpy()

In [26]:
test_df = df_full[df_full.isna().any(axis=1)]
test_df = test_df.drop(['up'], axis=1)
xgb_model = GradientBoostingClassifier()
xgb_model.fit(X_data, y_data)
y_pred = xgb_model.predict(test_df.to_numpy())
filenames = [os.path.splitext(os.path.basename(f))[0] for f in os.listdir("market_data/merged_sentiment_data") if f.endswith('.csv')]
for filename, pred in zip(filenames, y_pred):
    print(f"{filename}: {pred}")


AAPL: 0.0
ADBE: 1.0
AFRM: 1.0
AMD: 0.0
AMZN: 1.0
AVGO: 0.0
BAC: 1.0
BIO: 1.0
BKNG: 0.0
COIN: 1.0
DKNG: 1.0
GOOG: 1.0
JPM: 0.0
KO: 0.0
MA: 0.0
META: 0.0
MSFT: 0.0
NVDA: 0.0
OPEN: 1.0
PG: 0.0
PLUG: 1.0
RDFN: 1.0
SHOP: 1.0
SQ: 1.0
TEAM: 1.0
TSLA: 1.0
UNH: 0.0
WMT: 0.0
XOM: 1.0


In [None]:
accuracy_score(y_data, np.zeros(y_data.shape))

0.5115218154170078