# In this notebook we try to transfer the problem into a classification one, using the features we gathered in the regression version

In [1]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [33]:
def get_featured_frame(file_path):
    df = pd.read_csv(file_path)
    DAY_VALUES = 1
    MONTH_VALUES = 30
    WEEK_VALUES = DAY_VALUES*7
    YEAR_VALUES = DAY_VALUES*365
    df['Time'] = pd.to_datetime(df.Time)
    df = df.set_index('Time')
    df = df.resample('D', convention='start').mean()
    df['date'] = df.index.values
    df['Avg'] =(df['Low'] + df['High'])/2
    # time features
    df['year'] = df['date'].apply(lambda x: x.year)
    df['month'] = df['date'].apply(lambda x: x.month)
    df['day'] = df['date'].apply(lambda x: x.day)
    df['hour'] = df['date'].apply(lambda x: x.hour)
    df['minute'] = df['date'].apply(lambda x: x.minute)
    # Lagged Values
    for unit, amount, shift_values in zip(['day', 'day', 'day', 'day', 'week', 'week', 'week', 'month', 'month', 'month', 'year'],[1,2,3,4,1,2,3,1,2,1],[DAY_VALUES, DAY_VALUES, DAY_VALUES, DAY_VALUES, WEEK_VALUES, WEEK_VALUES, WEEK_VALUES, MONTH_VALUES, MONTH_VALUES, YEAR_VALUES]):
        for col in ['Open', 'Close', 'High', 'Low', 'Volume', 'Avg']:
            new_col = "{}_{}{}_before".format(col, amount, unit)
            df[new_col] = df[col].shift(amount*shift_values)
    # Summary of values
    for unit, amount, win_size in zip(['day', 'day', 'week', 'week','month', 'month', 'month'],[1,1,1,1,1,1,1],[2,5,2,3,1,2,3]):
        for col in ['Open', 'Close', 'High', 'Low', 'Volume']:
            roll_col = "{}_av_{}{}_before_{}roll".format(col, amount, unit, win_size)
            shifted = "{}_{}{}_before".format(col, amount, unit)
            df[roll_col] = (df[shifted].rolling(window=win_size)).mean()
    # some stat of the values
    for col in ['Open', 'Close', 'High', 'Low']:
        window = df[col].expanding()
        df["{}_max".format(col)] = window.max()
        df["{}_min".format(col)] = window.min()
        df["{}_avg".format(col)] = window.mean()
    df = df.drop("date", axis=1)
    # create the prediction column
    df['next_rate'] = np.where(df['Avg'].shift(-1) > df['Avg'],1,-1)
    df = df.dropna()
    # sclae the values
    scaler = MinMaxScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])
    print(df.info())
    return df

In [35]:
%%time
df = get_featured_frame("EURUSD_15m_BID_01.01.2010-31.12.2016.csv")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2190 entries, 2011-01-03 to 2016-12-31
Freq: D
Columns: 113 entries, Open to next_rate
dtypes: float64(113)
memory usage: 1.9 MB
None
CPU times: user 492 ms, sys: 12 ms, total: 504 ms
Wall time: 501 ms


In [36]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Avg,year,month,day,hour,...,Close_max,Close_min,Close_avg,High_max,High_min,High_avg,Low_max,Low_min,Low_avg,next_rate
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-03,0.659563,0.65949,0.659557,0.659548,0.11387,0.659523,0.0,0.0,0.066667,0.0,...,0.0,1.0,0.62663,0.0,1.0,0.627382,0.0,1.0,0.625566,1.0
2011-01-04,0.663954,0.664138,0.66373,0.663851,0.147426,0.663934,0.0,0.0,0.1,0.0,...,0.0,1.0,0.626846,0.0,1.0,0.627604,0.0,1.0,0.62578,0.0
2011-01-05,0.63512,0.635156,0.63478,0.634727,0.138942,0.634968,0.0,0.0,0.133333,0.0,...,0.0,1.0,0.626687,0.0,1.0,0.627453,0.0,1.0,0.62562,0.0
2011-01-06,0.605992,0.605905,0.605549,0.605563,0.126467,0.605727,0.0,0.0,0.166667,0.0,...,0.0,1.0,0.626155,0.0,1.0,0.626928,0.0,1.0,0.625086,0.0
2011-01-07,0.578792,0.579032,0.578433,0.578604,0.122968,0.578733,0.0,0.0,0.2,0.0,...,0.0,1.0,0.625281,0.0,1.0,0.626063,0.0,1.0,0.624209,0.0


# Now into classification itself

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier

In [37]:
df.next_rate.value_counts()

0.0    1119
1.0    1071
Name: next_rate, dtype: int64

In [38]:
x_train, x_test, y_train, y_test = train_test_split(df.drop('next_rate', 1), df['next_rate'], test_size=.2)

# A linear SVC takes 2min 11s and F1 score - 60% 

In [39]:
%%time
# let the training begins
clf = LinearSVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        0.0       0.63      0.60      0.61       230
        1.0       0.58      0.61      0.59       208

avg / total       0.60      0.60      0.60       438

CPU times: user 460 ms, sys: 24 ms, total: 484 ms
Wall time: 473 ms


# Trainig a SGD Classifier best case ~ 5.07s - 59% f1

* 100k iterations takes ~ 50.8s >> f1 = 60%
* 10k iterations takes ~ 5.07s  >> f1 = 59% `best option`
* 1k iteration takes ~ 504ms    >> f1 = 54%

In [45]:
%%time
# let the training begins
clf = SGDClassifier(n_jobs=-1, max_iter=10000)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        0.0       0.64      0.56      0.60       230
        1.0       0.57      0.65      0.61       208

avg / total       0.61      0.60      0.60       438

CPU times: user 5.04 s, sys: 56 ms, total: 5.1 s
Wall time: 5.04 s


# Training a random forest best case ~ 490ms - 61% f1

* 10 estimator takess ~ 236ms   >> f1 = 56%
* 100 estimator takes ~ 490ms   >> f1 = 61% `best option`
* 1k  estimator takes ~ 2.89s   >> f1 = 60%

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
%%time
# let the training begins
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        0.0       0.61      0.61      0.61       230
        1.0       0.57      0.57      0.57       208

avg / total       0.59      0.59      0.59       438

CPU times: user 1.75 s, sys: 8 ms, total: 1.76 s
Wall time: 484 ms


# Trying Adaboost best case 813ms - 54% f1

* 50 estimator takes 813ms  >> f1 = 54%
* 100 estimator takes 1.16s >> f1 = 53%

In [55]:
from sklearn.ensemble import AdaBoostClassifier

In [57]:
%%time
# let the training begins
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        0.0       0.55      0.60      0.57       230
        1.0       0.51      0.46      0.48       208

avg / total       0.53      0.53      0.53       438

CPU times: user 1.6 s, sys: 4 ms, total: 1.61 s
Wall time: 1.61 s


# so we will go with the random forest model