## Data Preparation and Preprocessing

In [1]:
# sets up dataframe with all given files
import pandas as pd
import time

start = time.time()
dfs = []

for year in range(1, 5):
    filename = "trades_202" + str(year) + ".csv"
    df_year = pd.read_csv(filename)
    
    dfs.append(df_year)
df = pd.concat(dfs, ignore_index=True)
df

Unnamed: 0,entry_side,date_time,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature535,feature536,feature537,feature538,feature539,feature540,feature541,feature542,feature543,profit_loss
0,1,2021-01-04 09:30:00.802000,119.60,0.11,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,,28070,118688,1.290000,0.006571,0.057300,0.02168,0.588744,-0.12,-4.560
1,1,2021-01-04 09:30:01.743000,61.65,0.11,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,,28070,118688,1.290000,0.006571,0.011100,0.02168,0.713989,-0.12,-240.262
2,1,2021-01-04 09:30:01.021000,75.50,0.11,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,,28070,118688,1.290000,0.006571,0.020150,0.02168,0.715928,-0.12,-48.134
3,1,2021-01-04 09:30:01.207000,62.59,0.11,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,,28070,118688,1.290000,0.006571,0.024300,0.02168,0.696340,-0.12,-9.184
4,1,2021-01-04 09:30:01.230000,62.23,0.11,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,,28070,118688,1.290000,0.006571,0.011100,0.02168,0.713989,-0.12,-48.804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641027,1,2024-01-03 09:30:01.249,74.09,2.00,1.000000,241839.000000,74.23,0.953913,1.405925,2.634463,...,141424.0,88402,4282,897.000000,897.000000,854.645161,648.00000,0.016124,2.00,-50.478
641028,1,2024-01-03 09:30:00.908,43.91,4.00,3.000000,11826.000000,43.91,0.678138,1.405925,2.634463,...,141424.0,88402,4282,2125.000000,2125.000000,3382.939130,529.00000,0.016124,2.00,-53.482
641029,1,2024-01-03 09:30:22.984,360.86,1.00,1.000000,23769.000000,361.45,3.421087,1.405925,2.634463,...,141424.0,88402,4282,294.571429,224.000000,2094.448276,224.00000,0.016124,2.00,130.572
641030,1,2024-01-03 09:30:01.102,29.23,,,6.000000,29.50,0.538750,1.405925,2.634463,...,141424.0,88402,4282,,,,,0.016124,2.00,7.860


In [2]:
# fills NaN values in numerical features with the mean of the feature
numeric_features = df.select_dtypes(include=['number']).columns
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())
numeric_features = numeric_features.tolist()

In [3]:
import math

# checks if number is float
def is_convertible_to_number(obj):
    try:
        _ = float(obj)  
        return True
    except ValueError:
        return False
    
# splits categorical features into numerical and categorical features
categorical_features = df.select_dtypes(include=['object'])
categorical_features = categorical_features.drop(columns=['date_time'])
new_cat = []
new_num = []
for col in categorical_features:
    numericals = []
    categories = []
    for row in categorical_features[col]:
        if is_convertible_to_number(row):
            numericals.append(row)
            categories.append(math.nan)
        else:
            categories.append(row)
            numericals.append(math.nan)
    newframe = df.copy()  # Create a copy of the DataFrame to avoid modifying the original
    newframe = pd.concat([newframe, pd.DataFrame(categories, columns=[col + 'c'], dtype='object')], axis=1)
    newframe = pd.concat([newframe, pd.DataFrame(numericals, columns=[col + 'n'], dtype='object')], axis=1)
    df = newframe.copy()
    new_cat.append(col + 'c')
    new_num.append(col + 'n')

In [4]:
# fills NaN values in numerical features with the mean of the feature 
# and categorical features with the mode of the feature
df[new_num] = df[new_num].fillna(df[new_num].mean())
numeric_features.append(new_num)
for col in new_cat:
    mode = df[col].mode()[0]
    df[col] = df[col].fillna(mode)

In [5]:
# these columns have date_time information, which we already have
df = df.drop(columns='feature308c')
new_cat.remove('feature308c')
df = df.drop(columns='feature417c')
new_cat.remove('feature417c')

In [6]:
# one-hot encodes all categorical features
for column in new_cat:
    one_hot = pd.get_dummies(df[column])
    one_hot = one_hot.astype('int')
    df = df.drop(columns=column)
    df = pd.concat([df, one_hot], axis=1)
df

Unnamed: 0,entry_side,date_time,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,Retail,Technology,Transportation,Utility,A,N,P,Q,No Sector,No Sector.1
0,1,2021-01-04 09:30:00.802000,119.60,0.110000,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,0,0,0,0,0,1,0,0,1,1
1,1,2021-01-04 09:30:01.743000,61.65,0.110000,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,0,0,0,0,0,1,0,0,1,1
2,1,2021-01-04 09:30:01.021000,75.50,0.110000,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,0,0,0,0,0,1,0,0,1,1
3,1,2021-01-04 09:30:01.207000,62.59,0.110000,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,0,0,0,0,0,1,0,0,1,1
4,1,2021-01-04 09:30:01.230000,62.23,0.110000,-0.041841,-0.038852,94155.00,16.730000,-0.004184,-0.060000,...,0,0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641027,1,2024-01-03 09:30:01.249,74.09,2.000000,1.000000,241839.000000,74.23,0.953913,1.405925,2.634463,...,0,0,0,0,0,0,0,1,1,1
641028,1,2024-01-03 09:30:00.908,43.91,4.000000,3.000000,11826.000000,43.91,0.678138,1.405925,2.634463,...,0,1,0,0,0,0,0,1,1,1
641029,1,2024-01-03 09:30:22.984,360.86,1.000000,1.000000,23769.000000,361.45,3.421087,1.405925,2.634463,...,0,0,0,0,0,1,0,0,1,1
641030,1,2024-01-03 09:30:01.102,29.23,3.959212,4.165474,6.000000,29.50,0.538750,1.405925,2.634463,...,0,0,0,0,0,0,0,1,1,1


## Type 1 Features

In [7]:
# groups records by date_time
df['date_time'] = pd.to_datetime(df['date_time'])
list_of_dfs = [group for _, group in df.groupby(pd.Grouper(key='date_time', freq='D'))]
list_of_dfs = [df_group for df_group in list_of_dfs if not df_group.empty]

In [8]:
# calculates net profit/loss of the day and keeps records in net profit days
profit = []
for day in list_of_dfs:
    net = 0
    for index, record in day.iterrows():
        net += float(record['profit_loss'])
    if net > 0:
        profit.append(day)
df = pd.concat(profit, ignore_index=True)

In [9]:
# finds Type 2 and Type 3 features
model_features = []
for feature_name, values in df.items():
    if not all(values.groupby(df['date_time']).nunique() == 1):
        model_features.append(feature_name)

In [10]:
# cleans up dataframe
df = df.loc[:, model_features]
df = df.loc[:, ~df.columns.duplicated()]
df = df.drop(columns=categorical_features.columns)
df

Unnamed: 0,entry_side,feature1,feature2,feature3,feature4,feature5,feature6,feature10,feature14,feature15,...,Trucks & Parts-Hvy Duty,Utility-Diversified,Utility-Electric Power,Utility-Gas Distribution,Utility-Water Supply,Wholesale-Food,A,N,P,Q
0,1,42.41,0.11,-0.041841,-3.885200e-02,94155.00,16.730000,0.703725,1.227750,36.649700,...,0,1,0,0,0,0,0,0,0,1
1,-1,32.52,0.11,-0.041841,-3.885200e-02,94155.00,16.730000,0.703725,1.291050,23.500000,...,0,0,0,0,0,0,0,0,0,1
2,-1,50.89,0.11,-0.041841,-3.885200e-02,94155.00,16.730000,0.703725,0.630330,59.919900,...,0,0,0,0,0,0,0,0,0,1
3,1,73.12,0.11,-0.041841,-3.885200e-02,94155.00,16.730000,0.703725,1.054120,58.025600,...,0,0,0,0,0,0,0,0,1,0
4,-1,38.36,0.11,-0.041841,-3.885200e-02,94155.00,16.730000,0.703725,1.393180,34.824000,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288004,-1,513.28,5.00,21.000000,2.695959e+06,513.29,3.494250,1.050936,1.044287,1.591275,...,0,0,0,0,0,0,0,1,0,0
288005,-1,513.26,5.00,21.000000,2.705116e+06,513.27,3.494250,1.050936,1.044287,1.591275,...,0,0,0,0,0,0,0,1,0,0
288006,-1,513.29,5.00,21.000000,2.710537e+06,513.29,3.494250,1.050936,1.044287,1.591275,...,0,0,0,0,0,0,0,1,0,0
288007,-1,513.36,5.00,21.000000,2.777902e+06,513.38,3.494250,1.050936,1.044287,1.591275,...,0,0,0,0,0,0,0,1,0,0


## Type 2 Features

In [11]:
# imports all libraries needed for neural nets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [13]:
# Sorts each feature into Type 2 and Type 3 features
feature_columns = [col for col in df.columns if 'feature' in col]
type2_features = []
type3_features = []
for col in feature_columns:
    if (df[col] >= 0).all():
        type2_features.append(col)
    else:
        type3_features.append(col)
type2_features.append(df.columns.difference(feature_columns))

In [14]:
# sets up environmental variables for neural nets
import numpy as np
import random

sample = df.sample(n=int(len(df) / 20), random_state=random.randint(0, 100))
features = pd.concat([sample[col] for col in type2_features], axis=1)
labels = sample['profit_loss'].values.ravel()
binary_labels = np.where(labels > 0, 1, 0)
features

Unnamed: 0,feature1,feature5,feature6,feature10,feature15,feature16,feature17,feature18,feature19,feature20,...,Transportation-Truck,Trucks & Parts-Hvy Duty,Utility,Utility-Diversified,Utility-Electric Power,Utility-Gas Distribution,Utility-Water Supply,Wholesale-Food,entry_side,profit_loss
50501,53.74,50448.0,30.54,1.656887,58.6394,56.5323,56.3400,56.4000,53.03,55.8100,...,0,0,0,0,0,0,0,0,1,39.004
134069,59.25,4447.0,24.71,1.094438,67.3565,71.4966,60.3000,63.7500,58.16,64.2700,...,0,0,0,0,0,0,0,0,1,-7.896
233887,61.89,8042.0,25.20,0.921875,38.5540,40.6642,59.1456,57.7470,59.91,57.7620,...,0,0,0,0,0,0,0,0,1,62.816
32661,324.07,61382.0,41.18,3.533125,290.5956,316.6005,335.0900,332.2500,325.39,329.0300,...,0,0,0,0,0,0,0,0,1,941.200
92380,80.29,46257.0,20.16,1.744375,86.8249,96.5182,88.2000,78.2700,79.45,81.3400,...,0,0,0,0,0,0,0,0,1,34.452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124445,55.00,4497.0,26.63,1.829987,57.7447,55.6256,59.3452,54.7503,55.57,56.3914,...,0,0,0,0,0,0,0,0,-1,-64.844
66474,63.32,36053.0,24.70,1.453125,51.4219,51.6329,58.1200,61.0600,62.93,62.8900,...,0,0,0,0,0,0,0,0,-1,-5.340
94821,85.40,113471.0,20.44,1.382725,82.5397,89.6758,90.1100,83.3700,84.82,82.7400,...,0,0,0,0,0,0,0,0,1,-68.838
175861,37.62,78803.0,16.43,0.653512,36.4387,34.0258,33.9800,35.7000,37.61,35.8900,...,0,0,0,0,0,0,0,0,-1,119.200


In [15]:
# runs neural net model
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

@ignore_warnings(category=ConvergenceWarning)
def run_nn():
    pl = Pipeline(steps=[('scaler', StandardScaler()), ('mlp', MLPClassifier(max_iter = 100, random_state = random.randint(0, 100)))])
    prm_grd = {'mlp__hidden_layer_sizes': [20], 'mlp__activation': ['logistic', 'tanh', 'relu']}
    grid_search = GridSearchCV(pl, prm_grd, cv=5)
    prediction = cross_val_predict(grid_search, features, binary_labels, cv = 5)
    accuracy = accuracy_score(binary_labels, prediction)
    print("Accuracy:", accuracy * 100, "%")
    return prediction
  


prediction = run_nn()
prediction

Accuracy: 94.19444444444444 %


array([1, 0, 1, ..., 0, 1, 0])

In [16]:
# creates dataframe with features predicted as a profit
type2_sample = pd.DataFrame()
for index in range(len(binary_labels)):
    if prediction[index] == 1:
        type2_sample = pd.concat([type2_sample, sample.iloc[index]], axis=1)
type2_sample = type2_sample.transpose()
type2_sample

Unnamed: 0,entry_side,feature1,feature2,feature3,feature4,feature5,feature6,feature10,feature14,feature15,...,Trucks & Parts-Hvy Duty,Utility-Diversified,Utility-Electric Power,Utility-Gas Distribution,Utility-Water Supply,Wholesale-Food,A,N,P,Q
50501,1.0,53.74,-0.19,-0.010478,0.002292,50448.0,30.54,1.656887,1.18603,58.6394,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
233887,1.0,61.89,-0.20,-0.051984,0.007937,8042.0,25.20,0.921875,2.22866,38.5540,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32661,1.0,324.07,-1.03,0.031812,-0.175206,61382.0,41.18,3.533125,1.16971,290.5956,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
92380,1.0,80.29,-0.09,-0.080357,-0.062004,46257.0,20.16,1.744375,1.26842,86.8249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
251738,-1.0,92.39,0.38,-0.060992,-0.105143,50286.0,21.97,1.500675,0.75657,111.0992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16732,1.0,85.00,-0.10,-0.105657,-0.094842,26907.0,12.02,0.683750,1.16521,113.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
140167,1.0,70.58,0.25,-0.011211,-0.024215,3253.0,22.30,0.846875,1.10343,87.7040,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34909,1.0,54.61,-0.30,0.027888,-0.073705,50804.0,35.14,1.478188,1.25551,39.2041,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
233347,1.0,49.98,-0.27,0.032793,0.021219,10082.0,25.92,0.960625,1.36329,32.8600,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Type 3 Features

In [17]:
# decision tree algorithm

# calculates entropy of the input dataset
def calc_entropy(data):
    entropy = 0.0
    num_ones = 0
    for index, row in data.iterrows():
        if row['profit_loss'] >= 0:
            num_ones += 1
    num_zeros = len(data) - num_ones
    if num_zeros != 0:
        entropy -= (num_zeros/len(data) * math.log2(num_zeros/len(data)))
    if num_ones != 0:
        entropy -= (num_ones/len(data) * math.log2(num_ones/len(data)))
    return (entropy, num_ones - num_zeros)

# splits dataset based on left less than threshold and right greater than threshold
def split_data(data, feature, threshold):
    left_split = pd.DataFrame()
    right_split = pd.DataFrame()
    left_split = data[data[feature] < threshold]
    right_split = data[data[feature] >= threshold]
    return (left_split, right_split)

# finds the best gain by calculating each split
def find_split(data, feature):
    max_gain = 0.0
    max_mean = 0.0
    curr_left_decision = None
    curr_right_decision = None
    parent_entropy, _ = calc_entropy(data)
    for i in range(len(data) - 1):
        mean = data.iloc[i][feature] + data.iloc[i+1][feature] / 2
        left, right = split_data(data, feature, mean)
        left_entropy, left_decision = calc_entropy(left)
        right_entropy, right_decision = calc_entropy(right)
        gain = parent_entropy - (((len(left)/len(data)) * left_entropy) + ((len(right)/len(data)) * right_entropy))
        if gain > max_gain:
            max_gain = gain
            max_mean = mean
            curr_left_decision = 1 if left_decision >= 0 else 0
            curr_right_decision = 1 if right_decision >= 0 else 0
    return (max_mean, curr_left_decision, curr_right_decision)

In [18]:
# decision tree simulator

# calculates accuracy of model of a given feature
def calc_accuracy(split, left, right, data, feature):
    num_accurate = 0
    for index, row in data.iterrows():
        if row[feature] and split :
            pred = left if row[feature] >= split else right
            ans = 1 if row['profit_loss'] >= 0 else 0
            if pred == ans:
                num_accurate += 1
    return num_accurate / len(data)

# tests and trains data with cross validation for optimal accuracy
def cross_validation(data, test):
    sub_len = int(round(len(data) / 5))
    index = 0
    sum_val = 0
    sum_split = 0
    for _ in range(5):
        if(index + sub_len >= len(data)):
            train_set = data.iloc[0:index]
            test_set = data.iloc[index:len(data)]
        else:
            train_set = pd.concat([data.iloc[0:index], data.iloc[index + sub_len:len(data)]], axis=0)
            test_set = data.iloc[index:index + sub_len]
        train_set = train_set.loc[:, ~train_set.columns.duplicated()]
        test_set = test_set.loc[:, ~test_set.columns.duplicated()]
        split, left, right = find_split(train_set, test)
        acc = calc_accuracy(split, left, right, test_set, test)
        sum_val += acc
        sum_split += split * acc
        index += sub_len
    return (sum_val * 20, sum_split / 5)

# runs decision tree for long and short records for each feature
def run_decision_trees(long, short):
    long_tup = []
    short_tup = []
    for col in features.columns:
        long = long.sort_values(by=col)
        short = short.sort_values(by=col)
        long_val, long_split = cross_validation(long, col)
        short_val, short_split = cross_validation(short, col)
        long_tup.append((col, long_val, long_split))
        short_tup.append((col, short_val, short_split))
    return (long_tup, short_tup)

In [19]:
# finds long and short filters with 5 sample trials
long_good = set()
short_good = set()
for i in range(5):
    new_sample = sample.sample(n=100, random_state=random.randint(0, 100))
    features = pd.concat([new_sample[col] for col in type3_features], axis=1)
    features = pd.concat([features, new_sample['profit_loss']], axis=1)
    short = pd.DataFrame()
    long = pd.DataFrame()
    short = features[new_sample['entry_side'] <= 0]
    long = features[new_sample['entry_side'] > 0]
    long_tup, short_tup = run_decision_trees(long, short)
    for index in range(len(long_tup)):
        if long_tup[index][1] >= 62.5 and short_tup[index][1] >= 62.5:
            long_good.add(long_tup[index])
            short_good.add(short_tup[index])   

In [20]:
# stores valid long and short filters into a tuple
long_filters = dict()
short_filters = dict()
for long_tup in long_good:
    short_tup = [tup for tup in short_good if tup[0] == long_tup[0]][0]
    long_filters[long_tup[0]] = (long_tup[2], 1 if long_tup[2] > short_tup[2] else 0)
    short_filters[short_tup[0]] = (short_tup[2], 1 if long_tup[2] <= short_tup[2] else 0)
print(long_filters)
print(short_filters)

{'feature379': (0.004630836363636363, 1), 'feature423': (9.016249566666668, 1), 'feature424': (12.839054545454541, 1)}
{'feature379': (0.001326512698412698, 0), 'feature423': (0.0270378303030303, 0), 'feature424': (5.139161587301587, 0)}


In [21]:
# runs filters on the entire sample
type3_sample = pd.DataFrame()
for index, row in sample.iterrows():
    add = True
    for key in long_filters:
        if long_filters[key][1] == 1 and short_filters[key][1] == 0:
            if row[key] >= long_filters[key][0] and row['entry_side'] == 1:
                pass
            elif row[key] <= short_filters[key][0] and row['entry_side'] == -1:
                pass
            else:
                add = False
                break
        elif long_filters[key][1] == 0 and short_filters[key][1] == 1:
            if row[key] <= long_filters[key][0] and row['entry_side'] == 1:
                pass
            elif row[key] >= short_filters[key][0] and row['entry_side'] == -1:
                pass
            else:
                add = False
                break
        else:
            add = False
            break
    if add:
        type3_sample = pd.concat([type3_sample, row], axis=1)
type3_sample = type3_sample.transpose()
type3_sample

Unnamed: 0,entry_side,feature1,feature2,feature3,feature4,feature5,feature6,feature10,feature14,feature15,...,Trucks & Parts-Hvy Duty,Utility-Diversified,Utility-Electric Power,Utility-Gas Distribution,Utility-Water Supply,Wholesale-Food,A,N,P,Q
120289,-1.0,51.98,-0.21,-0.048033,-0.065839,9949.0,24.150000,1.138000,1.043060,53.30840,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
24984,-1.0,92.73,-0.14,0.028265,0.009747,73094.0,10.260000,0.329375,0.809680,86.09000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
150864,-1.0,49.88,-0.15,-0.082653,-0.076531,3368.0,19.600000,0.630625,0.762440,57.47690,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
286600,1.0,31.49,1.00,1.000000,33.000000,32.0,0.648163,2.294318,1.694531,2.51983,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
20287,-1.0,55.84,0.03,-0.042594,-0.112294,38533.0,10.330000,0.463125,0.625770,53.56400,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41184,-1.0,40.09,-0.10,-0.101557,-0.085647,40369.0,29.540000,1.679875,0.689030,31.26440,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
56915,-1.0,52.00,0.16,-0.021739,-0.055731,102423.0,25.300000,0.929500,1.161780,48.30000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
204641,-1.0,130.73,0.16,-0.055125,-0.083549,42364.0,11.610000,0.493625,0.781800,164.02610,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
124445,-1.0,55.00,-0.28,-0.040556,0.010514,4497.0,26.630000,1.829987,0.786840,57.74470,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Results

In [22]:
# Combines Type 2 and Type 3 dataframes
final = type2_sample[type2_sample.isin(type3_sample).all(axis=1)]
length = len(final)
final = pd.concat([final, sample['profit_loss']], axis = 1)
final = final.iloc[:length]
final

Unnamed: 0,entry_side,feature1,feature2,feature3,feature4,feature5,feature6,feature10,feature14,feature15,...,Utility-Diversified,Utility-Electric Power,Utility-Gas Distribution,Utility-Water Supply,Wholesale-Food,A,N,P,Q,profit_loss
120289,-1.0,51.98,-0.21,-0.048033,-0.065839,9949.0,24.15,1.138000,1.04306,53.3084,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14.260
24984,-1.0,92.73,-0.14,0.028265,0.009747,73094.0,10.26,0.329375,0.80968,86.0900,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-2.904
20287,-1.0,55.84,0.03,-0.042594,-0.112294,38533.0,10.33,0.463125,0.62577,53.5640,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.878
265286,-1.0,77.45,0.05,-0.052733,-0.108682,22011.0,15.55,0.401563,0.42364,66.4900,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,15.264
80815,-1.0,41.50,-0.67,-0.083767,-0.217227,164859.0,21.13,2.578125,1.23046,43.2700,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94020,-1.0,116.50,0.86,-0.037075,0.013713,547382.0,19.69,1.358350,0.70679,83.6951,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.832
132739,-1.0,76.88,-1.14,-0.001831,-0.032494,1218.0,21.85,0.827988,0.66285,73.8800,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.624
113092,-1.0,73.45,0.20,-0.015391,-0.005919,3998.0,25.34,1.825000,2.03044,64.9536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,115.808
56915,-1.0,52.00,0.16,-0.021739,-0.055731,102423.0,25.30,0.929500,1.16178,48.3000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-2.464


In [31]:
# prints results
loss = pd.Series((final['profit_loss'] < 0).sum()).values[0]
profit = pd.Series((final['profit_loss'] >= 0).sum()).values[0]
print("Loss:", loss)
print("Profit:", profit)
print("Accuracy: ", profit / length * 100, "%")

Loss: 68
Profit: 1191
Accuracy:  94.59888800635426 %


In [26]:
# time in seconds
end = time.time()
print ('Time taken:', end - start)

Time taken: 1790.081228017807
