In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('ibm.us.txt')

In [3]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,1962-01-02,6.413,6.413,6.3378,6.3378,467056,0
1,1962-01-03,6.3378,6.3963,6.3378,6.3963,350294,0
2,1962-01-04,6.3963,6.3963,6.3295,6.3295,314365,0
3,1962-01-05,6.3211,6.3211,6.1958,6.2041,440112,0
4,1962-01-08,6.2041,6.2041,6.0373,6.087,655676,0


## Feature Engineering

* Historique des prix
* Moyennes mobiles
* retours sur investissement
* Volatilités

$$R_t =\frac{P_t -P_{t-1}}{P_{t-1}}$$

$$V_t(7) = \text{std}(R_{t}, R_{t-1}, ..., R_{t-7})$$

In [4]:
# Historique des prix 

P= range(1,11)

for p in P:
    df['Open_'+ str(p)]=df['Open'].shift(p)
    
#df.head()

# Moyennes mobiles 

M=[5,10,15,20]

for m in M:
    df['MA_'+str(m)]=df['Open'].rolling(m).mean()
    

# Retours sur investissement 

R=[1,5,10,15]

for r in R:
    df['R_'+str(r)]=df['Open'].pct_change(r)
    
# Volatilité 

V=[5,10,15]

for v in V:
    df['V_'+str(v)]=df['R_1'].rolling(v).std()


df['target']=df['V_5'].shift(-5)

df['target']=df['target'].apply(lambda x : 1 if x>0.011 else 0)

df=df.dropna()

In [5]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,Open_1,Open_2,Open_3,...,MA_15,MA_20,R_1,R_5,R_10,R_15,V_5,V_10,V_15,target
19,1962-01-29,5.9951,6.0373,5.8952,5.8952,700585,0,6.0287,6.0956,6.0624,...,6.144040,6.191645,-0.005573,-0.023186,-0.043966,-0.033687,0.006540,0.006214,0.007786,1
20,1962-01-30,5.8952,5.9285,5.8201,5.8201,889207,0,5.9951,6.0287,6.0956,...,6.129000,6.165755,-0.016664,-0.036858,-0.059897,-0.036858,0.008251,0.007060,0.008154,1
21,1962-01-31,5.8784,6.0036,5.8784,6.0036,916151,0,5.8952,5.9951,6.0287,...,6.109513,6.142785,-0.002850,-0.030351,-0.049956,-0.047369,0.008370,0.006644,0.007607,1
22,1962-02-01,6.0703,6.1621,6.0703,6.1457,772443,0,5.8784,5.8952,5.9951,...,6.101700,6.126485,0.032645,-0.004151,-0.009594,-0.018941,0.019369,0.013439,0.011953,0
23,1962-02-02,6.1457,6.1875,6.1208,6.1875,610767,0,6.0703,5.8784,5.8952,...,6.094460,6.117715,0.012421,0.019407,0.002708,-0.017364,0.019085,0.014088,0.012077,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14054,2017-11-06,150.2700,150.3200,148.7900,149.3500,4543499,0,151.9000,152.6500,152.5700,...,153.980667,152.038500,-0.010731,-0.012940,-0.063447,0.030941,0.005519,0.008324,0.021546,0
14055,2017-11-07,149.8700,150.0100,149.0100,149.8500,3729501,0,150.2700,151.9000,152.6500,...,154.293333,152.219500,-0.002662,-0.019175,-0.052176,0.032305,0.004316,0.007888,0.021521,0
14056,2017-11-08,150.1000,150.2800,148.7900,150.0700,4679520,0,149.8700,150.2700,151.9000,...,153.928667,152.378000,0.001535,-0.016189,-0.028290,-0.035161,0.004904,0.005062,0.009872,0
14057,2017-11-09,149.9300,151.8000,149.8600,150.3000,4776388,0,150.1000,149.8700,150.2700,...,153.376000,152.569500,-0.001133,-0.017819,-0.018654,-0.052395,0.004634,0.004198,0.008318,0


## Création de la pipeline

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer,StandardScaler

def drop_columns(df):
    to_drop=['Date','High','Low','Close','Volume','OpenInt']
    
    return df.drop(to_drop,axis=1)


col_dropper=FunctionTransformer(drop_columns)

scaler=StandardScaler()

model=GradientBoostingClassifier()


pipeline=Pipeline([
    ("drop_columns",col_dropper),
    ("scaling",scaler),
    ("model",model)
])

In [32]:
from joblib import dump,load 

dump(pipeline.steps[1][1],"scal.joblib")
dump(pipeline.steps[2][1],"model.joblib")


['model.joblib']

In [7]:
X=df.drop('target',axis=1)
y=df.target

X_train=X.iloc[:int(len(X)*0.7)]
y_train=y.iloc[:int(len(y)*0.7)]

X_test=X.iloc[int(len(X)*0.7):]
y_test=y.iloc[int(len(y)*0.7):]

In [8]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('drop_columns',
                 FunctionTransformer(func=<function drop_columns at 0x7f16e2d241f0>)),
                ('scaling', StandardScaler()),
                ('model', GradientBoostingClassifier())])

In [9]:
from sklearn.metrics import classification_report as cr

In [10]:
y_pred=pipeline.predict(X_test)

In [11]:
print(cr(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.73      0.49      0.58      2463
           1       0.51      0.75      0.61      1749

    accuracy                           0.60      4212
   macro avg       0.62      0.62      0.60      4212
weighted avg       0.64      0.60      0.59      4212



In [12]:
dir(GradientBoostingClassifier())

['_SUPPORTED_LOSS',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_initialized',
 '_check_n_features',
 '_check_params',
 '_clear_state',
 '_compute_partial_dependence_recursion',
 '_estimator_type',
 '_fit_stage',
 '_fit_stages',
 '_get_param_names',
 '_get_tags',
 '_init_state',
 '_is_initialized',
 '_make_estimator',
 '_more_tags',
 '_raw_predict',
 '_raw_predict_init',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_resize_state',
 '_staged_raw_predict',
 '_validate_data',
 '_validate_es

In [16]:
from sklearn.model_selection import GridSearchCV,TimeSeriesSplit


param = {"model__n_estimators":[100,200,300], "model__learning_rate":np.logspace(-1,-5,5)}

grid = GridSearchCV(pipeline,param,cv=TimeSeriesSplit(3),n_jobs=3).fit(X_train,y_train)

In [17]:
grid.best_params_

{'model__learning_rate': 0.01, 'model__n_estimators': 200}

In [18]:
col_dropper=FunctionTransformer(drop_columns)

scaler=StandardScaler()

model=GradientBoostingClassifier(n_estimators=200,learning_rate=0.01)


pipeline=Pipeline([
    ("drop_columns",col_dropper),
    ("scaling",scaler),
    ("model",model)
])

pipeline.fit(X_train,y_train)

pred_best = pipeline.predict(X_test)

cr(y_test,pred_best)

'              precision    recall  f1-score   support\n\n           0       0.67      0.08      0.14      2463\n           1       0.42      0.95      0.58      1749\n\n    accuracy                           0.44      4212\n   macro avg       0.54      0.51      0.36      4212\nweighted avg       0.56      0.44      0.32      4212\n'

In [20]:
print(cr(y_test,pred_best))

              precision    recall  f1-score   support

           0       0.67      0.08      0.14      2463
           1       0.42      0.95      0.58      1749

    accuracy                           0.44      4212
   macro avg       0.54      0.51      0.36      4212
weighted avg       0.56      0.44      0.32      4212



In [21]:
col_dropper=FunctionTransformer(drop_columns)

scaler=StandardScaler()

model=GradientBoostingClassifier()


pipeline=Pipeline([
    ("drop_columns",col_dropper),
    ("scaling",scaler),
    ("model",model)
])

pipeline.fit(X_train,y_train)

pred_best = pipeline.predict(X_test)

print(cr(y_test,pred_best))

              precision    recall  f1-score   support

           0       0.72      0.51      0.60      2463
           1       0.51      0.72      0.60      1749

    accuracy                           0.60      4212
   macro avg       0.61      0.61      0.60      4212
weighted avg       0.63      0.60      0.60      4212



In [23]:
from backtesting import Backtest, Strategy
from backtesting.lib import crossover

from backtesting.test import SMA, GOOG


class MaStrategie(Strategy):

    def init(self):
        self.model = pipeline
        
    def next(self):
        jour_en_cours = self.data.df.iloc[-1:]
        
        volatilite = self.model.predict(jour_en_cours)
        
        if volatilite == 1:
            if jour_en_cours.Open.iloc[0] > jour_en_cours.Open_1.iloc[0]:
                self.buy()
            else: 
                self.sell()

bt = Backtest(X_test, MaStrategie,
              cash=10000, commission=.002,
              exclusive_orders=True)

output = bt.run()
bt.plot()



  bt = Backtest(X_test, MaStrategie,


In [24]:
output

Start                                  9847.0
End                                   14058.0
Duration                               4211.0
Exposure Time [%]                   66.690408
Equity Final [$]                   129.330696
Equity Peak [$]                  11554.170038
Return [%]                         -98.706693
Buy & Hold Return [%]               52.992461
Return (Ann.) [%]                         0.0
Volatility (Ann.) [%]                     NaN
Sharpe Ratio                              NaN
Sortino Ratio                             NaN
Calmar Ratio                              0.0
Max. Drawdown [%]                  -98.884206
Avg. Drawdown [%]                  -20.240618
Max. Drawdown Duration                 4181.0
Avg. Drawdown Duration             701.166667
# Trades                               2433.0
Win Rate [%]                        42.622277
Best Trade [%]                      13.053822
Worst Trade [%]                    -33.836485
Avg. Trade [%]                    

In [None]:
model.predict()