# trying to get better model

In [2]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def get_featured_frame(file_path):
    df = pd.read_csv(file_path)
    DAY_VALUES = 1
    MONTH_VALUES = 30
    WEEK_VALUES = DAY_VALUES*7
    YEAR_VALUES = DAY_VALUES*365
    df['Time'] = pd.to_datetime(df.Time)
    df = df.set_index('Time')
    df = df.resample('D', convention='start').mean()
    df['date'] = df.index.values
    df['Avg'] =(df['Low'] + df['High'])/2
    # time features
    df['year'] = df['date'].apply(lambda x: x.year)
    df['month'] = df['date'].apply(lambda x: x.month)
    df['day'] = df['date'].apply(lambda x: x.day)
    df['hour'] = df['date'].apply(lambda x: x.hour)
    df['minute'] = df['date'].apply(lambda x: x.minute)
    # Lagged Values
    for unit, amount, shift_values in zip(['day', 'day', 'day', 'day', 'week', 'week', 'week', 'month', 'month', 'month', 'year'],[1,2,3,4,1,2,3,1,2,1],[DAY_VALUES, DAY_VALUES, DAY_VALUES, DAY_VALUES, WEEK_VALUES, WEEK_VALUES, WEEK_VALUES, MONTH_VALUES, MONTH_VALUES, YEAR_VALUES]):
        for col in ['Open', 'Close', 'High', 'Low', 'Volume', 'Avg']:
            new_col = "{}_{}{}_before".format(col, amount, unit)
            df[new_col] = df[col].shift(amount*shift_values)
    # Summary of values
    for unit, amount, win_size in zip(['day', 'day', 'week', 'week','month', 'month', 'month'],[1,1,1,1,1,1,1],[2,5,2,3,1,2,3]):
        for col in ['Open', 'Close', 'High', 'Low', 'Volume']:
            roll_col = "{}_av_{}{}_before_{}roll".format(col, amount, unit, win_size)
            shifted = "{}_{}{}_before".format(col, amount, unit)
            df[roll_col] = (df[shifted].rolling(window=win_size)).mean()
    # some stat of the values
    for col in ['Open', 'Close', 'High', 'Low']:
        window = df[col].expanding()
        df["{}_max".format(col)] = window.max()
        df["{}_min".format(col)] = window.min()
        df["{}_avg".format(col)] = window.mean()
    df = df.drop("date", axis=1)
    # create the prediction column
    df['next_rate'] = np.where(df['Avg'].shift(-1) > df['Avg'],1,-1)
    df = df.dropna()
    # sclae the values
    scaler = MinMaxScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])
    print(df.info())
    return df

In [3]:
df = get_featured_frame("../res/EURUSD_15m_BID_01.01.2010-31.12.2016.csv")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2190 entries, 2011-01-03 to 2016-12-31
Freq: D
Columns: 113 entries, Open to next_rate
dtypes: float64(113)
memory usage: 1.9 MB
None


# First we need to have some sort of auto_ML to find the best model
## Let's build it

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

def auto_ml(X, Y):
    X = StandardScaler().fit_transform(X)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=0)
    clfs = {
        "LogisticRegression": GridSearchCV(LogisticRegression(), {'C':[.001, .01, .1, 1, 10, 100],
                                                               'penalty':['l2', 'l1']}, cv=5, n_jobs=-1),
        "SGDClassifier": GridSearchCV(SGDClassifier(), {'max_iter':[1e1,1e2, 1e3],
                                                     'loss':['log', 'modified_huber',
                                                             'squared_hinge', 'perceptron'],
                                                      'penalty':['elasticnet', 'l2', 'l1']}, cv=5, n_jobs=-1),
        "SVC":GridSearchCV(SVC(), {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}, cv=5, n_jobs=-1),
        "LinearSVC": GridSearchCV(LinearSVC(), {'C':[1, 10, 100, 1000]}, cv=5, n_jobs=-1),
        "GaussianNB": GridSearchCV(GaussianNB(),{'priors':[None]}, cv=5, n_jobs=-1),
        "KNeighborsClassifier": GridSearchCV(KNeighborsClassifier(), {'n_neighbors':[1, 3, 5, 10, 100, 200],
                     'weights':['uniform', 'distance'],
                      'algorithm':['ball_tree', 'kd_tree', 'brute'], 
                      'p':[1, 2]}, cv=5, n_jobs=-1),
        "RandomForestClassifier": GridSearchCV(RandomForestClassifier(), 
                       {"n_estimators":[10, 50, 100, 200],
                       "criterion":["gini", "entropy"]}, cv=5, n_jobs=-1),
        "AdaBoostClassifier": GridSearchCV(AdaBoostClassifier(), 
                       {"n_estimators":[50, 100, 200]}, cv=5, n_jobs=-1),
        "GradientBoostingClassifier": GridSearchCV(GradientBoostingClassifier(), 
                       {"loss":['deviance', 'exponential']}, cv=5, n_jobs=-1),
        "DecisionTreeClassifier": GridSearchCV(DecisionTreeClassifier(), 
                       {"criterion":["gini", "entropy"]}, cv=5, n_jobs=-1),
    }
    print("| model | train score | test score|")
    print("| --- | --- | --- |")
    best_score, best_test, best_clf = 0, 0, None
    for clf in clfs.keys():
        clfs[clf].fit(x_train, y_train)
        score = clfs[clf].best_score_
        test_score = clfs[clf].score(x_test, y_test)
        print("| {} | {:.2f}% | {:.2f}% |".format(clf, score, test_score))
        if score > best_score and test_score > best_test:
            best_score = score
            best_test = test_score
            best_clf = clfs[clf].best_estimator_
    print("\n\n# the best classifier is `{}` with train score of {} and test score of {}".format(best_clf, 
                                                                                          best_score, best_test))

## Let's try the data with the given classifiers

In [64]:
%%time
auto_ml(df.drop('next_rate', axis=1).values, df['next_rate'].values)

| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.56% |
| SGDClassifier | 0.59% | 0.56% |
| SVC | 0.58% | 0.57% |
| LinearSVC | 0.60% | 0.55% |
| GaussianNB | 0.48% | 0.49% |
| KNeighborsClassifier | 0.57% | 0.53% |
| RandomForestClassifier | 0.58% | 0.58% |
| AdaBoostClassifier | 0.52% | 0.52% |
| GradientBoostingClassifier | 0.53% | 0.53% |
| DecisionTreeClassifier | 0.55% | 0.52% |


# the best classifier is `DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')` with train score of 0 and test score of 0
CPU times: user 23.6 s, sys: 944 ms, total: 24.6 s
Wall time: 3min 20s


| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.56% |
| SGDClassifier | 0.59% | 0.58% |
| SVC | 0.58% | 0.57% |
| LinearSVC | 0.60% | 0.56% |
| GaussianNB | 0.48% | 0.49% |
| KNeighborsClassifier | 0.57% | 0.53% |
| RandomForestClassifier | 0.58% | 0.58% |
| AdaBoostClassifier | 0.52% | 0.52% |
| GradientBoostingClassifier | 0.52% | 0.53% |
| DecisionTreeClassifier | 0.54% | 0.56% |


# the best classifier is `DecisionTreeClassifier` with train score of 0 and test score of 0

## Let's see the correlation between the features
    

we are trying to see which varaibles have high correlation with the `Avg` column

In [40]:
corr = df.corr()
corrs = corr['Avg']

In [45]:
len(corrs)

116

In [47]:
len(corrs[abs(corrs)>0.5])

87

In [86]:
(corrs[abs(corrs)>0.5].index)

Index(['Open', 'High', 'Low', 'Close', 'Avg', 'year', 'Open_1day_before',
       'Close_1day_before', 'High_1day_before', 'Low_1day_before',
       ...
       'Close_avg^2', 'Close_avg^3', 'High_min^2', 'High_min^3', 'High_avg^2',
       'High_avg^3', 'Low_min^2', 'Low_min^3', 'Low_avg^2', 'Low_avg^3'],
      dtype='object', length=210)

SO we can see that only 87 features have more than abs(0.5) correlation, but wait, maybe the other values needs to be fitted from another degree !

## Generating polynomials for the model

In [4]:
def generate_poly_feats(df, degree=3, y_col='next_rate'):
    from sklearn.preprocessing import PolynomialFeatures
    df.dropna(inplace=True)
    for feature in df.columns:
        if feature == 'next_rate':
            continue
        poly_gen = PolynomialFeatures(degree=degree, include_bias=False)
        polys = poly_gen.fit_transform(df[[feature]])
        for column in range(degree):
            if column > 0:
                new_col_name = "{}^{}".format(feature, column+1)
                df[new_col_name] = polys[:, column]
    return df

In [68]:
generate_poly_feats(df.copy(), 3).head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Avg,year,month,day,hour,...,High_min^2,High_min^3,High_avg^2,High_avg^3,Low_max^2,Low_max^3,Low_min^2,Low_min^3,Low_avg^2,Low_avg^3
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-03,0.659563,0.65949,0.659557,0.659548,0.11387,0.659523,0.0,0.0,0.066667,0.0,...,1.0,1.0,0.393609,0.246943,0.0,0.0,1.0,1.0,0.391333,0.244805
2011-01-04,0.663954,0.664138,0.66373,0.663851,0.147426,0.663934,0.0,0.0,0.1,0.0,...,1.0,1.0,0.393887,0.247205,0.0,0.0,1.0,1.0,0.3916,0.245056
2011-01-05,0.63512,0.635156,0.63478,0.634727,0.138942,0.634968,0.0,0.0,0.133333,0.0,...,1.0,1.0,0.393697,0.247026,0.0,0.0,1.0,1.0,0.391401,0.244868
2011-01-06,0.605992,0.605905,0.605549,0.605563,0.126467,0.605727,0.0,0.0,0.166667,0.0,...,1.0,1.0,0.393039,0.246407,0.0,0.0,1.0,1.0,0.390733,0.244242
2011-01-07,0.578792,0.579032,0.578433,0.578604,0.122968,0.578733,0.0,0.0,0.2,0.0,...,1.0,1.0,0.391955,0.245388,0.0,0.0,1.0,1.0,0.389636,0.243214


## Now let's see the correlation again

In [73]:
df_poly_3 = generate_poly_feats(df.copy(), 3)
corr = df_poly_3.corr()
corrs = corr['Avg']

In [74]:
len(corrs)

337

In [75]:
len(corrs[abs(corrs)>0.5])

210

## SO the poly features might be a good way for the model to get an idea about the data !
## let's see the affect of the polys on the model

In [77]:
# generate the poly features
df_poly_2 = generate_poly_feats(df.copy(), 2)

# testing on second poly degree features

In [81]:
auto_ml(df_poly_2.drop('next_rate', 1), df_poly_2['next_rate'])

| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.60% |
| SGDClassifier | 0.59% | 0.59% |
| SVC | 0.57% | 0.55% |
| LinearSVC | 0.59% | 0.61% |
| GaussianNB | 0.49% | 0.49% |
| KNeighborsClassifier | 0.58% | 0.59% |
| RandomForestClassifier | 0.57% | 0.59% |
| AdaBoostClassifier | 0.52% | 0.52% |
| GradientBoostingClassifier | 0.53% | 0.53% |
| DecisionTreeClassifier | 0.54% | 0.55% |


# the best classifier is `DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')` with train score of 0 and test score of 0


| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.60% |
| SGDClassifier | 0.59% | 0.59% |
| SVC | 0.57% | 0.55% |
| LinearSVC | 0.59% | 0.61% |
| GaussianNB | 0.49% | 0.49% |
| KNeighborsClassifier | 0.58% | 0.59% |
| RandomForestClassifier | 0.57% | 0.59% |
| AdaBoostClassifier | 0.52% | 0.52% |
| GradientBoostingClassifier | 0.53% | 0.53% |
| DecisionTreeClassifier | 0.54% | 0.55% |

# testing on third poly degree features

In [83]:
auto_ml(df_poly_3.drop('next_rate', 1), df_poly_3['next_rate'])

| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.59% | 0.58% |
| SGDClassifier | 0.58% | 0.58% |
| SVC | 0.57% | 0.54% |
| LinearSVC | 0.58% | 0.58% |
| GaussianNB | 0.49% | 0.50% |
| KNeighborsClassifier | 0.58% | 0.58% |
| RandomForestClassifier | 0.57% | 0.59% |
| AdaBoostClassifier | 0.52% | 0.52% |
| GradientBoostingClassifier | 0.53% | 0.54% |
| DecisionTreeClassifier | 0.54% | 0.54% |


# the best classifier is `DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')` with train score of 0 and test score of 0


| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.59% | 0.58% |
| SGDClassifier | 0.58% | 0.58% |
| SVC | 0.57% | 0.54% |
| LinearSVC | 0.58% | 0.58% |
| GaussianNB | 0.49% | 0.50% |
| KNeighborsClassifier | 0.58% | 0.58% |
| RandomForestClassifier | 0.57% | 0.59% |
| AdaBoostClassifier | 0.52% | 0.52% |
| GradientBoostingClassifier | 0.53% | 0.54% |
| DecisionTreeClassifier | 0.54% | 0.54% |


## Now let's try with only the values that has high correlation

In [92]:
%%time
for corr_threshold in [0.5, 0.7, 0.8, 0.9]:
    # get feautres with high corr
    corr_mat = df_poly_3.corr()['Avg']
    feats = (corr_mat[abs(corr_mat)>=corr_threshold].index)
    # now train 
    print("# training using `{}` features with correlation >= {}".format(len(feats), corr_threshold))
    auto_ml(df_poly_3[feats], df_poly_3['next_rate'])
    print("\n---\n")

# training using `210` features with correlation >= 0.5
| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.57% |
| SGDClassifier | 0.59% | 0.59% |
| SVC | 0.58% | 0.57% |
| LinearSVC | 0.58% | 0.61% |
| GaussianNB | 0.50% | 0.49% |
| KNeighborsClassifier | 0.60% | 0.57% |
| RandomForestClassifier | 0.58% | 0.56% |
| AdaBoostClassifier | 0.53% | 0.48% |
| GradientBoostingClassifier | 0.56% | 0.52% |
| DecisionTreeClassifier | 0.57% | 0.52% |


# the best classifier is `LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)` with train score of 0.6004566210045662 and test score of 0.5730593607305936

---

# training using `210` features with correlation >= 0.7
| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% 

# training using `210` features with correlation >= 0.5
| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.57% |
| SGDClassifier | 0.59% | 0.59% |
| SVC | 0.58% | 0.57% |
| LinearSVC | 0.58% | 0.61% |
| GaussianNB | 0.50% | 0.49% |
| KNeighborsClassifier | 0.60% | 0.57% |
| RandomForestClassifier | 0.58% | 0.56% |
| AdaBoostClassifier | 0.53% | 0.48% |
| GradientBoostingClassifier | 0.56% | 0.52% |
| DecisionTreeClassifier | 0.57% | 0.52% |


# the best classifier is `LogisticRegression` with train score of 0.60 and test score of 0.57

model specs : LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l1', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

---

# training using `210` features with correlation >= 0.7
| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.57% |
| SGDClassifier | 0.59% | 0.58% |
| SVC | 0.58% | 0.57% |
| LinearSVC | 0.59% | 0.59% |
| GaussianNB | 0.50% | 0.49% |
| KNeighborsClassifier | 0.60% | 0.57% |
| RandomForestClassifier | 0.58% | 0.54% |
| AdaBoostClassifier | 0.53% | 0.50% |
| GradientBoostingClassifier | 0.57% | 0.52% |
| DecisionTreeClassifier | 0.57% | 0.51% |


# the best classifier is `LogisticRegression` with train score of 0.599 and test score of 0.57

model specs : LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

---

# training using `206` features with correlation >= 0.75
| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.58% |
| SGDClassifier | 0.59% | 0.58% |
| SVC | 0.58% | 0.56% |
| LinearSVC | 0.57% | 0.60% |
| GaussianNB | 0.51% | 0.49% |
| KNeighborsClassifier | 0.60% | 0.57% |
| RandomForestClassifier | 0.58% | 0.55% |
| AdaBoostClassifier | 0.53% | 0.48% |
| GradientBoostingClassifier | 0.56% | 0.51% |
| DecisionTreeClassifier | 0.56% | 0.50% |


# the best classifier is `LogisticRegression` with train score of 0.60 and test score of 0.577

model specs : LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

---

# training using `198` features with correlation >= 0.8
| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.59% |
| SGDClassifier | 0.57% | 0.60% |
| SVC | 0.58% | 0.58% |
| LinearSVC | 0.59% | 0.56% |
| GaussianNB | 0.50% | 0.48% |
| KNeighborsClassifier | 0.60% | 0.57% |
| RandomForestClassifier | 0.57% | 0.54% |
| AdaBoostClassifier | 0.55% | 0.47% |
| GradientBoostingClassifier | 0.56% | 0.50% |
| DecisionTreeClassifier | 0.54% | 0.50% |


# the best classifier is `LogisticRegression` with train score of 0.597 and test score of 0.589

model specs : LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

---

# training using `180` features with correlation >= 0.9
| model | train score | test score|
| --- | --- | --- |
| LogisticRegression | 0.60% | 0.58% |
| SGDClassifier | 0.59% | 0.57% |
| SVC | 0.59% | 0.58% |
| LinearSVC | 0.57% | 0.59% |
| GaussianNB | 0.50% | 0.49% |
| KNeighborsClassifier | 0.60% | 0.55% |
| RandomForestClassifier | 0.57% | 0.52% |
| AdaBoostClassifier | 0.54% | 0.47% |
| GradientBoostingClassifier | 0.55% | 0.51% |
| DecisionTreeClassifier | 0.55% | 0.48% |


# the best classifier is `LogisticRegression` with train score of 0.599 and test score of 0.579

model specs : LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

---

CPU times: user 4min 39s, sys: 5.18 s, total: 4min 44s
Wall time: 25min

# Get the classification report of the chosen model


In [14]:
from sklearn.metrics import classification_report

model = LogisticRegression(C=100, penalty='l1')

df = get_featured_frame('../res/EURUSD_15m_BID_01.01.2010-31.12.2016.csv')

df = generate_poly_feats(df, degree=2)

corr_mat = df.corr()['Avg']
feats = (corr_mat[abs(corr_mat)>=0.8].index)
    


# split the data 

x_train, x_test, y_train, y_test = train_test_split(df[feats], df['next_rate'],
                                                    test_size=.2, random_state=0)

# let's get the cross validation score
scores = cross_val_score(model, x_train, y_train, cv=5, n_jobs=-1)

print("model LogisticRegression has accuracy of {:.2f}(+/-){:.2f}%".format(scores.mean(), scores.std()))

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2190 entries, 2011-01-03 to 2016-12-31
Freq: D
Columns: 113 entries, Open to next_rate
dtypes: float64(113)
memory usage: 1.9 MB
None
model LogisticRegression has accuracy of 0.58(+/-)0.02%
             precision    recall  f1-score   support

        0.0       0.57      0.67      0.62       214
        1.0       0.62      0.53      0.57       224

avg / total       0.60      0.60      0.59       438



# Conclusion

from the previous experiments we can say that the best model is Logistic Regression, as it can actually find a good separation between the classes, and the best setup for the model is as follows :

# Model : LogisticRegression

## Parameters :

`C` = 100
`penalty` = `l1`

# Features

## Poly features 

we used the second degree polynomials to fit the non linear features

## High correlation features

we use the features with correlation above `0.8` with the dependent column

# Scores

## Best score using 5-fold cross validation

the best score we have is `60%` accuracy

## Precision and recall

the model has a ***precision*** score of `0.60` and ***recall*** of `0.60`, and has an ***f1-score*** of `0.59`


