In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


In [2]:
def normalize_dataframe(df):
    """
    Normalizes all columns in a pandas DataFrame  using MinMaxScaler.

    Args:
        df (pandas.DataFrame): The input DataFrame.

    Returns:
        pandas.DataFrame: The normalized DataFrame.
    """
    scaler = MinMaxScaler()
    columns_to_normalize = [col for col in df.columns]
    df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])
    return df

In [3]:
def add_up_column(df):
    # Create empty 'up' and 'down' columns
    df['up'] = np.nan
    
    # Loop over the rows (skipping the first row)
    for i in range(0, len(df)-1):
        if df.loc[i+1, '4. close'] > df.loc[i, '4. close']:
            df.loc[i, 'up'] = 1
        else:
            df.loc[i, 'up'] = 0
    return df

In [4]:
def combine_csvs_from_folder(folder_path):
    """
    Combines all CSV files in a folder into a single pandas DataFrame also normalizes before combining them.

    Args:
        folder_path (str): The path to the folder containing the CSV files.

    Returns:
        A pandas DataFrame containing the concatenated data from all CSV files in the input folder.
    """
    # Use a list comprehension to read all CSV files in the folder into a list of DataFrames.
    dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Use a list comprehension to get the filenames of all CSV files in the folder.
    filenames = [os.path.splitext(os.path.basename(f))[0] for f in os.listdir(folder_path) if f.endswith('.csv')]

    processed_dfs = []
    i = 0
    for df, filename in zip(dfs, filenames):
        # Dont need the date column
        df = df.drop(['date'], axis=1)
        # normalize the dataframes before combining them
        df = normalize_dataframe(df)
        # for the neural network to understand the company name we need to convert it to a number
        df['company'] = i
        i += 1
        df = add_up_column(df)
        processed_dfs.append(df)
    combined_df = pd.concat(processed_dfs, ignore_index=True)
    
    return combined_df

df_full = combine_csvs_from_folder('market_data/merged_sentiment_data')

df_full.tail()

Unnamed: 0,Chaikin A/D,ADOSC,ADX,ADXR,APO,Aroon Down,Aroon Up,AROONOSC,ATR,Real Upper Band,...,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,sentiment,company,up
8579,1.0,0.67249,0.258971,0.316359,0.75496,0.0,0.85,0.925,0.302798,0.95102,...,0.92954,0.951991,0.944602,0.965739,0.140781,0.0,0.0,0.766338,28,1.0
8580,0.998778,0.664732,0.266415,0.306323,0.732013,0.0,1.0,1.0,0.320301,0.943965,...,1.0,1.0,0.98035,1.0,0.265542,0.0,0.0,0.76478,28,0.0
8581,0.966297,0.581039,0.254699,0.296031,0.687362,0.0,0.95,0.975,0.365638,0.918791,...,0.936562,0.94135,0.893466,0.91673,0.157345,0.0,0.0,0.755167,28,0.0
8582,0.931217,0.468669,0.24695,0.288036,0.623677,1.0,0.9,0.45,0.462123,0.939905,...,0.855206,0.814194,0.785275,0.813039,0.264651,0.0,0.0,0.502183,28,0.0
8583,0.901446,0.36545,0.247488,0.285341,0.568705,1.0,0.85,0.425,0.444907,0.964661,...,0.755206,0.776368,0.733902,0.763803,0.139751,0.0,0.0,0.415032,28,


In [5]:
#we need this for later
def find_indices_of_test_rows(df):
    indices = []
    for i in range(1, len(df)):
        if np.isnan(df.loc[i, 'up']):
            indices.append(i)
    return indices
idxs = find_indices_of_test_rows(df_full)
print(idxs)

[295, 591, 887, 1183, 1479, 1775, 2071, 2367, 2663, 2959, 3255, 3551, 3847, 4143, 4439, 4735, 5031, 5327, 5623, 5919, 6215, 6511, 6807, 7103, 7399, 7695, 7991, 8287, 8583]


In [6]:
# we should one hot encode the company column
# first we need to change it to a string so we can one hot encode it
df_full['company'] = df_full['company'].astype(str)
df_full = pd.get_dummies(df_full, columns=['company'])

In [7]:
#drop nan rows
df_train = df_full.dropna()

In [8]:
df_master = df_train
df_master

Unnamed: 0,Chaikin A/D,ADOSC,ADX,ADXR,APO,Aroon Down,Aroon Up,AROONOSC,ATR,Real Upper Band,...,company_26,company_27,company_28,company_3,company_4,company_5,company_6,company_7,company_8,company_9
0,0.135312,0.619734,0.395503,0.459299,0.422455,0.85,0.35,0.250,0.603205,0.855696,...,0,0,0,0,0,0,0,0,0,0
1,0.161065,0.647807,0.407572,0.449526,0.400695,0.80,0.30,0.250,0.599784,0.843167,...,0,0,0,0,0,0,0,0,0,0
2,0.137926,0.582208,0.409131,0.430583,0.372748,0.75,0.25,0.250,0.577742,0.824053,...,0,0,0,0,0,0,0,0,0,0
3,0.121791,0.499820,0.424835,0.416918,0.317456,0.70,0.20,0.250,0.570574,0.816083,...,0,0,0,0,0,0,0,0,0,0
4,0.077162,0.354716,0.451172,0.410581,0.252092,0.65,0.15,0.250,0.596903,0.814220,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8578,0.970065,0.646422,0.270488,0.335065,0.776503,0.00,0.90,0.950,0.320660,0.959564,...,0,0,1,0,0,0,0,0,0,0
8579,1.000000,0.672490,0.258971,0.316359,0.754960,0.00,0.85,0.925,0.302798,0.951020,...,0,0,1,0,0,0,0,0,0,0
8580,0.998778,0.664732,0.266415,0.306323,0.732013,0.00,1.00,1.000,0.320301,0.943965,...,0,0,1,0,0,0,0,0,0,0
8581,0.966297,0.581039,0.254699,0.296031,0.687362,0.00,0.95,0.975,0.365638,0.918791,...,0,0,1,0,0,0,0,0,0,0


## Without sentiment analysis

In [9]:
df = df_master.drop(columns="sentiment")
X_columns = [c for c in df.columns if c != "up"]
y_column = "up"

In [10]:
X_data = df[X_columns].to_numpy()
y_data = df[y_column].to_numpy()

In [11]:
def train_test(models, X, y):
    """
    input: list(tuple(str, class)) models in the following format:
        [(<name1>, <model1>), (<name2>, <model2>), ...]
    output: list of tuples of name, trained model, and accuracy
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    names = []
    trained_models = []
    accuracies = []
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        names.append(name)
        trained_models.append(model)
        accuracies.append(acc)
        print(f"{name} got a training accuracy of {acc}")

    return [(a,b,c) for a,b,c in zip(names, trained_models, accuracies)]

In [12]:
xgb_model = GradientBoostingClassifier()
ada_model = AdaBoostClassifier()
bag_model = BaggingClassifier()
svc_model = SVC()
sgd_model = SGDClassifier()
mlp_model = MLPClassifier(max_iter=500)
knn_model = KNeighborsClassifier()

models = [("xgb", GradientBoostingClassifier()),
           ("ada", AdaBoostClassifier()), 
            ("bag", BaggingClassifier()), 
             ("svc", SVC()), 
              ("sgd", SGDClassifier()), 
               ("mlp", MLPClassifier(max_iter=500))]
stc_model = StackingClassifier(models)

test_models = [("Grad. Boost", xgb_model),
               ("AdaBoost", ada_model),
               ("Bagging", bag_model),
               ("SVC", svc_model),
               ("SGD", sgd_model),
               ("MLP", mlp_model),
               ("KNN", knn_model),
               ("Stacking", stc_model),
               ]

tech_train_results = train_test(test_models, X_data, y_data)

Grad. Boost got a training accuracy of 0.578257790368272
AdaBoost got a training accuracy of 0.5676345609065155
Bagging got a training accuracy of 0.5545325779036827
SVC got a training accuracy of 0.5446175637393768
SGD got a training accuracy of 0.5010623229461756
MLP got a training accuracy of 0.5516997167138811
KNN got a training accuracy of 0.4936260623229462




Stacking got a training accuracy of 0.5796742209631728


## With Sentiment Analysis

In [13]:
df = df_master
X_columns = [c for c in df.columns if c != "up"]
y_column = "up"

In [14]:
X_data = df[X_columns].to_numpy()
y_data = df[y_column].to_numpy()

In [15]:
xgb_model = GradientBoostingClassifier()
ada_model = AdaBoostClassifier()
bag_model = BaggingClassifier()
svc_model = SVC()
sgd_model = SGDClassifier()
mlp_model = MLPClassifier(max_iter=500)
knn_model = KNeighborsClassifier()

models = [("xgb", GradientBoostingClassifier()),
           ("ada", AdaBoostClassifier()), 
            ("bag", BaggingClassifier()), 
             ("svc", SVC()), 
              ("sgd", SGDClassifier()), 
               ("mlp", MLPClassifier(max_iter=500))]
stc_model = StackingClassifier(models)

test_models = [("Grad. Boost", xgb_model),
               ("AdaBoost", ada_model),
               ("Bagging", bag_model),
               ("SVC", svc_model),
               ("SGD", sgd_model),
               ("MLP", mlp_model),
               ("KNN", knn_model),
               ("Stacking", stc_model),
               ]

sentiment_train_results = train_test(test_models, X_data, y_data)

Grad. Boost got a training accuracy of 0.5970254957507082
AdaBoost got a training accuracy of 0.5828611898016998
Bagging got a training accuracy of 0.5669263456090652
SVC got a training accuracy of 0.5378895184135978
SGD got a training accuracy of 0.5432011331444759
MLP got a training accuracy of 0.5662181303116147
KNN got a training accuracy of 0.4985835694050991




Stacking got a training accuracy of 0.5938385269121813


## With Sentiment Analysis Plus Additional Previous Day's news

In [16]:
df = df_master
for i in range(1, 7):
    df[f"sentiment_{i}"] = df["sentiment"].shift(i)
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"sentiment_{i}"] = df["sentiment"].shift(i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"sentiment_{i}"] = df["sentiment"].shift(i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"sentiment_{i}"] = df["sentiment"].shift(i)
A value is trying to be set on a copy of a slice from a DataF

In [17]:
X_columns = [c for c in df.columns if c != "up"]
y_column = "up"

In [18]:
X_data = df[X_columns].to_numpy()
y_data = df[y_column].to_numpy()

In [19]:
xgb_model = GradientBoostingClassifier()
ada_model = AdaBoostClassifier()
bag_model = BaggingClassifier()
svc_model = SVC()
sgd_model = SGDClassifier()
mlp_model = MLPClassifier(max_iter=500)
knn_model = KNeighborsClassifier()

models = [("xgb", GradientBoostingClassifier()),
           ("ada", AdaBoostClassifier()), 
            ("bag", BaggingClassifier()), 
             ("svc", SVC()), 
              ("sgd", SGDClassifier()), 
               ("mlp", MLPClassifier(max_iter=500))]
stc_model = StackingClassifier(models)

test_models = [("Grad. Boost", xgb_model),
               ("AdaBoost", ada_model),
               ("Bagging", bag_model),
               ("SVC", svc_model),
               ("SGD", sgd_model),
               ("MLP", mlp_model),
               ("KNN", knn_model),
               ("Stacking", stc_model),
               ]

sentiment_p_train_results = train_test(test_models, X_data, y_data)

Grad. Boost got a training accuracy of 0.5861091424521616
AdaBoost got a training accuracy of 0.5744153082919915
Bagging got a training accuracy of 0.5609496810772502
SVC got a training accuracy of 0.5379163713678242
SGD got a training accuracy of 0.5223245924875974




MLP got a training accuracy of 0.5513819985825655
KNN got a training accuracy of 0.49751948972360027




Stacking got a training accuracy of 0.5871722182849043


## Test predictions

In [30]:
tomorrow_dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in os.listdir(folder_path) if f.endswith('.csv')]
    for tom_df, filename in zip(tomorrow_dfs, filenames):
        
        df = df.drop(['date'], axis=1)
        # normalize the dataframes before combining them
        df = normalize_dataframe(df)
        # for the neural network to understand the company name we need to convert it to a number
        df['company'] = i
        i += 1
        df = add_up_column(df)
        processed_dfs.append(df)
    combined_df = pd.concat(processed_dfs, ignore_index=True)
    
    return combined_df

df_full = combine_csvs_from_folder('market_data/merged_sentiment_data')

Unnamed: 0,Chaikin A/D,ADOSC,ADX,ADXR,APO,Aroon Down,Aroon Up,AROONOSC,ATR,Real Upper Band,...,company_6,company_7,company_8,company_9,sentiment_1,sentiment_2,sentiment_3,sentiment_4,sentiment_5,sentiment_6
10,0.038411,0.177497,0.695121,0.465075,0.240003,0.95,0.10,0.075,0.703151,0.756102,...,0,0,0,0,0.758145,0.898415,0.872104,0.821747,0.805998,0.743600
11,0.082597,0.318983,0.704401,0.451961,0.240203,0.90,0.05,0.075,0.714965,0.723169,...,0,0,0,0,0.796698,0.758145,0.898415,0.872104,0.821747,0.805998
12,0.112285,0.447096,0.707059,0.440916,0.259634,0.85,0.00,0.075,0.687340,0.686308,...,0,0,0,0,0.977138,0.796698,0.758145,0.898415,0.872104,0.821747
13,0.161053,0.600500,0.689115,0.425413,0.274767,0.80,0.00,0.100,0.685287,0.669592,...,0,0,0,0,0.873524,0.977138,0.796698,0.758145,0.898415,0.872104
14,0.179932,0.683870,0.661752,0.415688,0.285806,0.75,0.00,0.125,0.658527,0.662759,...,0,0,0,0,0.886805,0.873524,0.977138,0.796698,0.758145,0.898415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7818,0.970065,0.646422,0.270488,0.335065,0.776503,0.00,0.90,0.950,0.320660,0.959564,...,0,0,0,0,0.516550,0.739200,0.596341,0.616635,0.779338,0.664121
7819,1.000000,0.672490,0.258971,0.316359,0.754960,0.00,0.85,0.925,0.302798,0.951020,...,0,0,0,0,0.523149,0.516550,0.739200,0.596341,0.616635,0.779338
7820,0.998778,0.664732,0.266415,0.306323,0.732013,0.00,1.00,1.000,0.320301,0.943965,...,0,0,0,0,0.766338,0.523149,0.516550,0.739200,0.596341,0.616635
7821,0.966297,0.581039,0.254699,0.296031,0.687362,0.00,0.95,0.975,0.365638,0.918791,...,0,0,0,0,0.764780,0.766338,0.523149,0.516550,0.739200,0.596341


In [22]:
accuracy_score(y_data, np.zeros(y_data.shape))

0.5115218154170078