In [1]:
import numpy as np
import pandas as pd

In [2]:
df_full = pd.read_parquet("stocks_df_combined_2024_05_07.parquet.brotli")
df_full.head()

Unnamed: 0,Open,High,Low,Close,Adj Close_x,Volume,Ticker,Year,Month,Weekday,...,growth_brent_oil_7d,growth_brent_oil_30d,growth_brent_oil_90d,growth_brent_oil_365d,growth_btc_usd_1d,growth_btc_usd_3d,growth_btc_usd_7d,growth_btc_usd_30d,growth_btc_usd_90d,growth_btc_usd_365d
0,0.088542,0.101563,0.088542,0.097222,0.060163,1031789000.0,MSFT,1986,1986-03-01,3,...,,,,,,,,,,
1,0.097222,0.102431,0.097222,0.100694,0.062311,308160000.0,MSFT,1986,1986-03-01,4,...,,,,,,,,,,
2,0.100694,0.103299,0.100694,0.102431,0.063386,133171200.0,MSFT,1986,1986-03-01,0,...,,,,,,,,,,
3,0.102431,0.103299,0.098958,0.099826,0.061774,67766400.0,MSFT,1986,1986-03-01,1,...,,,,,,,,,,
4,0.099826,0.100694,0.097222,0.09809,0.0607,47894400.0,MSFT,1986,1986-03-01,2,...,,,,,,,,,,


In [3]:
GROWTH = [g for g in df_full.keys() if (g.find('growth_')==0)&(g.find('future')<0)]

In [4]:
OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']

In [5]:
CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type']

In [6]:
TO_PREDICT = [g for g in df_full.keys() if (g.find('future')>=0)]

In [7]:
TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV

In [8]:
df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))

  df_full['ln_volume'] = df_full.Volume.apply(lambda x: np.log(x))


In [9]:
# manually defined features
CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']

In [10]:
# All Supported Ta-lib indicators: https://github.com/TA-Lib/ta-lib-python/blob/master/docs/funcs.md

TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
 'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
 'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
 'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
 'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
 'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
 'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
 'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
 'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']

In [11]:
TECHNICAL_PATTERNS = [g for g in df_full.keys() if g.find('cdl')>=0]
print(f'Technical patterns count = {len(TECHNICAL_PATTERNS)}, examples = {TECHNICAL_PATTERNS[0:5]}')


Technical patterns count = 61, examples = ['cdl2crows', 'cdl3blackrows', 'cdl3inside', 'cdl3linestrike', 'cdl3outside']


In [12]:
MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
 'DGS1', 'DGS5', 'DGS10']

In [13]:
NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO

In [14]:
# CHECK: NO OTHER INDICATORS LEFT
OTHER = [k for k in df_full.keys() if k not in OHLCV + CATEGORICAL + NUMERICAL + TO_DROP]
OTHER

['growth_future_5d', 'is_positive_growth_5d_future']

In [15]:
df_full.Ticker.nunique()

33

In [16]:
# truncated df_full with 25 years of data (and defined growth variables)
df = df_full[df_full.Date>='2000-01-01']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 203 entries, Open to ln_volume
dtypes: datetime64[ns](3), float64(129), int32(64), int64(5), object(2)
memory usage: 239.7+ MB


In [17]:
# dummy variables are not generated from Date and numeric variables
df.loc[:,'Month'] = df.Month.dt.strftime('%B')
df.loc[:,'Weekday'] = df.Weekday.astype(str)

  df.loc[:,'Month'] = df.Month.dt.strftime('%B')
  df.loc[:,'Weekday'] = df.Weekday.astype(str)


In [18]:
# Generate dummy variables (no need for bool, let's have int32 instead)
dummy_variables = pd.get_dummies(df[CATEGORICAL], dtype='int32')

## Question 1

In [22]:
# TODO 1: define more categorical features, e.g. all combinations for <September+weekday>  (you'll see that September is actually an important dummy in one of the models)
wom = ((df["Date"].dt.day - 1) // 7) + 1
month_wom = df["Date"].dt.month_name().str.cat(wom.astype(str), sep="_w")
month_wom

3490    January_w1
3491    January_w1
3492    January_w1
3493    January_w1
3494    January_w1
           ...    
5422      April_w5
5423        May_w1
5424        May_w1
5425        May_w1
5426        May_w1
Name: Date, Length: 182675, dtype: object

In [26]:
mwom_dummy = pd.get_dummies(month_wom)
correlations = mwom_dummy.corrwith(df["is_positive_growth_5d_future"])
correlations.head()

April_w1   -0.004023
April_w2   -0.000227
April_w3    0.009226
April_w4   -0.001560
April_w5   -0.008793
dtype: float64

In [27]:
correlations[[correlations.abs().idxmax()]]

September_w3   -0.034537
dtype: float64

In [30]:
dummy_variables = pd.concat([dummy_variables, mwom_dummy.astype(int)], axis=1)

In [31]:
dummy_variables.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 115 entries, Month_April to September_w5
dtypes: int32(55), int64(60)
memory usage: 127.4 MB


In [32]:
# get dummies names in a list
DUMMIES = dummy_variables.keys().to_list()

In [33]:
# Concatenate the dummy variables with the original DataFrame
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

In [34]:
df_with_dummies[NUMERICAL+DUMMIES].info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 299 entries, growth_1d to September_w5
dtypes: float64(121), int32(117), int64(61)
memory usage: 340.6 MB


In [42]:
def temporal_split(df, min_date, max_date, train_prop=0.7, val_prop=0.15, test_prop=0.15):
    """
    Splits a DataFrame into three buckets based on the temporal order of the 'Date' column.

    Args:
        df (DataFrame): The DataFrame to split.
        min_date (str or Timestamp): Minimum date in the DataFrame.
        max_date (str or Timestamp): Maximum date in the DataFrame.
        train_prop (float): Proportion of data for training set (default: 0.6).
        val_prop (float): Proportion of data for validation set (default: 0.2).
        test_prop (float): Proportion of data for test set (default: 0.2).

    Returns:
        DataFrame: The input DataFrame with a new column 'split' indicating the split for each row.
    """
    # Define the date intervals
    train_end = min_date + pd.Timedelta(days=(max_date - min_date).days * train_prop)
    val_end = train_end + pd.Timedelta(days=(max_date - min_date).days * val_prop)

    # Assign split labels based on date ranges
    split_labels = []
    for date in df['Date']:
        if date <= train_end:
            split_labels.append('train')
        elif date <= val_end:
            split_labels.append('validation')
        else:
            split_labels.append('test')

    # Add 'split' column to the DataFrame
    df['split'] = split_labels

    return df

In [43]:
min_date_df = df_with_dummies.Date.min()
max_date_df = df_with_dummies.Date.max()

df_with_dummies = temporal_split(df_with_dummies,
                                 min_date = min_date_df,
                                 max_date = max_date_df)

In [44]:
df_with_dummies['split'].value_counts()/len(df_with_dummies)

split
train         0.675834
test          0.163290
validation    0.160876
Name: count, dtype: float64

In [77]:
# remove the "segmentation" problem (warning message on df performance after many joins and data transformations)
new_df = df_with_dummies.copy()

In [78]:
# Full dataframe (transformed and truncated to 25 years)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182675 entries, 3490 to 5426
Columns: 319 entries, Open to split
dtypes: datetime64[ns](2), float64(129), int32(118), int64(65), object(5)
memory usage: 367.8+ MB


In [79]:
# generate manual predictions
# Let's label all prediction features with prefix "pred"
new_df['pred0_manual_cci'] = (new_df.cci>200).astype(int)
new_df['pred1_manual_prev_g1'] = (new_df.growth_1d>1).astype(int)
new_df['pred2_manual_prev_g1_and_snp'] = ((new_df['growth_1d'] > 1) & (new_df['growth_snp500_1d'] > 1)).astype(int)

## Question 2

In [105]:
new_df["pred3_manual_gdp_fastd"] = (new_df["gdppot_us_yoy"] <= 0.027) & (new_df["fastd"] >= 0.251)
new_df["pred4_manual_gpd_wti_oil"] = (new_df["gdppot_us_yoy"] >= 0.027) & (new_df["growth_wti_oil_30d"] <= 1.005)

acc_3 = (new_df["pred3_manual_gdp_fastd"] * new_df["is_positive_growth_5d_future"]).sum() / new_df["pred3_manual_gdp_fastd"].sum()
acc_4 = (new_df["pred4_manual_gpd_wti_oil"] * new_df["is_positive_growth_5d_future"]).sum() / new_df["pred4_manual_gpd_wti_oil"].sum()

acc_3, acc_4

(0.5522010081688141, 0.5374581350255596)

In [106]:
# Considering only test
test_data = new_df[new_df["split"] == "test"]
acc_3 = (test_data["pred3_manual_gdp_fastd"] * test_data["is_positive_growth_5d_future"]).sum() / test_data["pred3_manual_gdp_fastd"].sum()
# pred4 doesnt predict 1 for any value in the test partition.
# acc_4 = (test_data["pred4_manual_gpd_wti_oil"] * test_data["is_positive_growth_5d_future"]).sum() / test_data["pred4_manual_gpd_wti_oil"].sum()

acc_3, acc_4

  acc_4 = (test_data["pred4_manual_gpd_wti_oil"] * test_data["is_positive_growth_5d_future"]).sum() / test_data["pred4_manual_gpd_wti_oil"].sum()


(0.5552947488431359, nan)

In [101]:
PREDICTIONS = new_df.columns[new_df.columns.str.startswith("pred")]
PREDICTIONS

Index(['pred0_manual_cci', 'pred1_manual_prev_g1',
       'pred2_manual_prev_g1_and_snp', 'pred3_manual_gdp_fastd',
       'pred4_manual_gpd_wti_oil'],
      dtype='object')

In [102]:
# generate columns is_correct_
for pred in PREDICTIONS:
  part1 = pred.split('_')[0] # first prefix before '_'
  new_df[f'is_correct_{part1}'] =  (new_df[pred] == new_df.is_positive_growth_5d_future).astype(int)

In [103]:
# IS_CORRECT dataset
IS_CORRECT =  [k for k in new_df.keys() if k.startswith('is_correct_')]
IS_CORRECT

['is_correct_pred0',
 'is_correct_pred1',
 'is_correct_pred2',
 'is_correct_pred3',
 'is_correct_pred4']

In [104]:
# define "Precision" for ALL predictions on a Test dataset (~4 last years of trading)
for i,column in enumerate(IS_CORRECT):
  prediction_column = PREDICTIONS[i]
  is_correct_column = column
  filter = (new_df.split=='test') & (new_df[prediction_column]==1)
  print(f'Prediction column:{prediction_column} , is_correct_column: {is_correct_column}')
  print(new_df[filter][is_correct_column].value_counts())
  print(new_df[filter][is_correct_column].value_counts()/len(new_df[filter]))

  print('---------')

Prediction column:pred0_manual_cci , is_correct_column: is_correct_pred0
is_correct_pred0
1    455
0    344
Name: count, dtype: int64
is_correct_pred0
1    0.569462
0    0.430538
Name: count, dtype: float64
---------
Prediction column:pred1_manual_prev_g1 , is_correct_column: is_correct_pred1
is_correct_pred1
1    8621
0    6980
Name: count, dtype: int64
is_correct_pred1
1    0.552593
0    0.447407
Name: count, dtype: float64
---------
Prediction column:pred2_manual_prev_g1_and_snp , is_correct_column: is_correct_pred2
is_correct_pred2
1    5726
0    4729
Name: count, dtype: int64
is_correct_pred2
1    0.547681
0    0.452319
Name: count, dtype: float64
---------
Prediction column:pred3_manual_gdp_fastd , is_correct_column: is_correct_pred3
is_correct_pred3
1    16560
0    13262
Name: count, dtype: int64
is_correct_pred3
1    0.555295
0    0.444705
Name: count, dtype: float64
---------
Prediction column:pred4_manual_gpd_wti_oil , is_correct_column: is_correct_pred4
Series([], Name: coun

In [109]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

### 1.4.1) Define dataframes AND perform data cleaning
* define X_train (dataframe), X_test (dataframe), y_train (series), y_test (series)
* replace +-inf. with 0
* fill NaNs with 0 (you can drop it too, but will loose a lot of data in our case
* remove 1-2% outliers (in each dimension, or only in variable to_predict :: we won't use it for a Decision Tree

In [114]:
# Decision Tree doesn't like too large and inf. values
import numpy as np

def remove_infinite_values(X):
    """
    Remove infinite values from the input array.

    Parameters:
    - X: Input array (NumPy array or array-like)

    Returns:
    - Array with infinite values removed
    """
    return X[np.isfinite(X).all(axis=1)]

# Example usage:
# Assuming X is your input data
# filtered_X = remove_infinite_values(X)

In [116]:
# Split the data into training and testing sets based on the split date
features_list = NUMERICAL+DUMMIES
to_predict = 'is_positive_growth_5d_future'

train_df = new_df[new_df.split.isin(['train','validation'])].copy(deep=True)
test_df = new_df[new_df.split.isin(['test'])].copy(deep=True)

# ONLY numerical Separate features and target variable for training and testing sets
# need Date and Ticker later when merging predictions to the dataset
X_train = train_df[features_list+[to_predict,'Date','Ticker']]
X_test = test_df[features_list+[to_predict,'Date','Ticker']]

print(f'length: X_train {X_train.shape},  X_test {X_test.shape}')


length: X_train (152846, 302),  X_test (29829, 302)


In [117]:
# Can't have +-inf values . E.g. ln(volume)=-inf when volume==0 => substitute with 0

# Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

print(f'length: X_train_imputed {X_train.shape},  X_test_imputed {X_test.shape}')

length: X_train_imputed (152846, 302),  X_test_imputed (29829, 302)


In [118]:
# you may want to remove 1-2% outliers based on percentile ==> not used here in Decision Trees
def remove_outliers_percentile(X, lower_percentile=1, upper_percentile=99):
    """
    Remove outliers from the input array based on percentiles.

    Parameters:
    - X: Input array (NumPy array or array-like)
    - lower_percentile: Lower percentile threshold (float, default=1)
    - upper_percentile: Upper percentile threshold (float, default=99)

    Returns:
    - Array with outliers removed
    """
    lower_bound = np.percentile(X, lower_percentile, axis=0)
    upper_bound = np.percentile(X, upper_percentile, axis=0)
    mask = np.logical_and(np.all(X >= lower_bound, axis=1), np.all(X <= upper_bound, axis=1))
    return X[mask]

# Example usage:
# Assuming X is your input data
# filtered_X = remove_outliers_percentile(X, lower_percentile=1, upper_percentile=99)

In [119]:
X_train_imputed = X_train # we won't use outliers removal to save more data to train: remove_outliers_percentile(X_train)
X_test_imputed = X_test # we won't use outliers removal to save more data to test: remove_outliers_percentile(X_test)

In [120]:
# same shape
print(f'length: X_train_imputed {X_train_imputed.shape},  X_test_imputed {X_test_imputed.shape}')

length: X_train_imputed (152846, 302),  X_test_imputed (29829, 302)


In [121]:
y_train = X_train_imputed[to_predict]
y_test = X_test_imputed[to_predict]

# remove y_train, y_test from X_ dataframes
del X_train_imputed[to_predict]
del X_test_imputed[to_predict]

### 1.4.2 Estimation of a Decision Tree model

In [122]:
# INPUTS:
# X_train_imputed : CLEAN dataFrame with only numerical features (train+validation periods)
# X_test_imputed : CLEAN dataFrame with only numerical features (test periods)

# y_train : true values for the train period
# y_test  : true values for the test period

In [123]:
# estimation/fit function (using dataframe of features X and what to predict y) --> optimising total accuracy
# max_depth is hyperParameter
def fit_decision_tree(X, y, max_depth=20):
# Initialize the Decision Tree Classifier
  clf = DecisionTreeClassifier(max_depth=max_depth)

  # Fit the classifier to the training data
  clf.fit(X, y)
  return clf, X.columns

In [125]:
%%time
clf_10, train_columns = fit_decision_tree(X=X_train_imputed.drop(['Date','Ticker'],axis=1),
                           y=y_train,
                           max_depth=10)

CPU times: user 16.3 s, sys: 137 ms, total: 16.4 s
Wall time: 16.6 s


In [None]:
# TODO 3: TRAIN only on train dataset, experiment with trees with depth 1..20 --> find the best one on VALID dataset
#       for the "best" tree model: find precision on the TEST set

### 1.4.3 Inference for a Decision Tree

In [166]:
def predict_decision_tree(clf:DecisionTreeClassifier, df_X:pd.DataFrame, y_true: pd.Series):
  # Predict the target variable on the test data
  y_pred = clf.predict(df_X)

  max_depth = clf.tree_.max_depth
  # Print the maximum depth
  print("Maximum depth of the decision tree:", max_depth)

  # Calculate the accuracy/precision of the model
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  print(f'Accuracy ={accuracy}, precision = {precision}')

  # resulting df
  result_df = pd.concat([df_X, y_true, pd.Series(y_pred, index=df_X.index, name='pred_')], axis=1)

  return result_df

In [135]:
# Feautures importance function to predict future returns (based on the classifier)
# get feature importance from 'clf' (classifier) and 'train_columns' (column names)

def get_importances(clf, train_columns):
  # Assuming clf is your trained DecisionTreeClassifier
  feature_importance = clf.feature_importances_

  # Assuming X_train is your training features
  feature_names = train_columns

  # Create a DataFrame to store feature importance
  feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

  # Sort the DataFrame by importance in descending order
  feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

  # Print or display the feature importance DataFrame
  # print(feature_importance_df)
  return feature_importance_df

## Question 3

In [134]:
# TODO 4: JOIN predictions with the original dataframe (define a new column):
#  so, that there are columns pred_tree_clf10 AND pred_tree_clf20

In [148]:
test_pred = clf_10.predict(X_test_imputed.drop(['Date','Ticker'],axis=1))
tp_series = pd.Series(test_pred, dtype=int)
tp_series.head()

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [159]:
test_mask = new_df["split"] == "test"
new_df.loc[test_mask, "pred5_clf_10"] = tp_series

In [165]:
pred0_to_4 = new_df.loc[test_mask, new_df.columns.str.contains(r"^pred[0-4]")]

In [181]:
manual_all_wrong = (pred0_to_4.values != new_df.loc[test_mask, to_predict].values[:, np.newaxis]).all(axis=1)
pred_5_correct = new_df.loc[test_mask, to_predict].values == new_df.loc[test_mask, "pred5_clf_10"].values

In [183]:
(manual_all_wrong & pred_5_correct).sum()

1

In [202]:
pred_cols = new_df.columns[new_df.columns.str.contains(r"^pred")].to_list()
new_df.loc[test_mask, pred_cols + [to_predict]][manual_all_wrong & pred_5_correct]

Unnamed: 0,pred0_manual_cci,pred1_manual_prev_g1,pred2_manual_prev_g1_and_snp,pred3_manual_gdp_fastd,pred4_manual_gpd_wti_oil,pred5_clf_10,is_positive_growth_5d_future
4,0,0,0,False,False,1.0,1


## Question 4

In [216]:
# ONLY numerical Separate features and target variable for training and testing sets
# need Date and Ticker later when merging predictions to the dataset
X_train = new_df.loc[new_df["split"] == "train", features_list+[to_predict]]
X_valid = new_df.loc[new_df["split"] == "validation", features_list+[to_predict]]
X_train.shape, X_valid.shape

((123458, 300), (29388, 300))

In [217]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)

# Need to fill NaNs somehow
X_train.fillna(0, inplace=True)
X_valid.fillna(0, inplace=True)

y_train = X_train[to_predict]
X_train.drop(to_predict, axis="columns", inplace=True)
y_valid = X_valid[to_predict]
X_valid.drop(to_predict, axis="columns", inplace=True)

In [219]:
accuracies = np.zeros((20, ))
precisions = np.zeros((20, ))
for i, max_depth in enumerate(range(1, 21)):
    print(f"{max_depth=}")
    clf = DecisionTreeClassifier(max_depth=max_depth)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_valid)
    accuracies[i] = accuracy_score(y_valid, y_pred)
    precisions[i] = precision_score(y_valid, y_pred)
    print(f"{accuracies[i]:.3f}\t{precisions[i]:.3f}")

accuracies, precisions

max_depth=1
0.572	0.572
max_depth=2
0.572	0.572
max_depth=3
0.570	0.571
max_depth=4
0.570	0.571
max_depth=5
0.570	0.571
max_depth=6
0.494	0.562
max_depth=7
0.466	0.555
max_depth=8
0.526	0.570
max_depth=9
0.498	0.562
max_depth=10
0.510	0.564
max_depth=11
0.493	0.559
max_depth=12
0.502	0.573
max_depth=13
0.504	0.569
max_depth=14
0.490	0.561
max_depth=15
0.498	0.568
max_depth=16
0.486	0.558
max_depth=17
0.495	0.566
max_depth=18
0.493	0.562
max_depth=19
0.499	0.568
max_depth=20
0.499	0.567


(array([0.57172996, 0.57172996, 0.5695522 , 0.5695522 , 0.5695522 ,
        0.49397713, 0.46566626, 0.5258609 , 0.49806043, 0.51031033,
        0.49288825, 0.50248401, 0.50387913, 0.4904723 , 0.4978903 ,
        0.48618484, 0.49506601, 0.49258201, 0.4992514 , 0.49867293]),
 array([0.57172996, 0.57172996, 0.57102778, 0.57102778, 0.57102778,
        0.56184742, 0.55509877, 0.56963193, 0.56172134, 0.56370554,
        0.559111  , 0.57276306, 0.56926434, 0.56101469, 0.5678292 ,
        0.55792268, 0.56551632, 0.56174049, 0.56802765, 0.56732834]))

In [222]:
range(1, 21)[precisions.argmax()]

12

In [224]:
clf = DecisionTreeClassifier(max_depth=12)
clf.fit(X_train, y_train)

In [235]:
test_pred = clf.predict(X_test_imputed.drop(['Date','Ticker'],axis=1))
train_pred = clf.predict(X_train)
valid_pred = clf.predict(X_valid)

pred6_best_clf = np.concatenate([train_pred, valid_pred, test_pred])
new_df["pred6_best_clf"] = pred6_best_clf
new_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close_x,Volume,Ticker,Year,Month,Weekday,...,pred2_manual_prev_g1_and_snp,pred3_manual_gdp_fastd,pred4_manual_gpd_wti_oil,is_correct_pred0,is_correct_pred1,is_correct_pred2,is_correct_pred3,is_correct_pred4,pred5_clf_10,pred6_best_clf
3490,58.6875,59.3125,56.0,58.28125,36.065567,53228400.0,MSFT,2000,January,0,...,0,False,False,1,1,1,1,1,,1
3491,56.78125,58.5625,56.125,56.3125,34.847271,54119000.0,MSFT,2000,January,1,...,0,False,False,1,1,1,1,1,,1
3492,55.5625,58.1875,54.6875,56.90625,35.214706,64059600.0,MSFT,2000,January,2,...,1,False,False,1,0,0,1,1,,1
3493,56.09375,56.9375,54.1875,55.0,34.035072,54976600.0,MSFT,2000,January,3,...,0,False,False,1,1,1,1,1,,1
3494,54.3125,56.125,53.65625,55.71875,34.479843,62013600.0,MSFT,2000,January,4,...,1,False,False,0,1,1,0,0,,1
