In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('ibm.us.txt')

In [3]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,1962-01-02,6.413,6.413,6.3378,6.3378,467056,0
1,1962-01-03,6.3378,6.3963,6.3378,6.3963,350294,0
2,1962-01-04,6.3963,6.3963,6.3295,6.3295,314365,0
3,1962-01-05,6.3211,6.3211,6.1958,6.2041,440112,0
4,1962-01-08,6.2041,6.2041,6.0373,6.087,655676,0


## Feature Engineering

* Historique des prix
* Moyennes mobiles
* retours sur investissement
* Volatilités

$$R_t =\frac{P_t -P_{t-1}}{P_{t-1}}$$

$$V_t(7) = \text{std}(R_{t}, R_{t-1}, ..., R_{t-7})$$

In [28]:
# Historique des prix 

P= range(1,11)

for p in P:
    df['Open_'+ str(p)]=df['Open'].shift(p)
    
#df.head()

# Moyennes mobiles 

M=[5,10,15,20]

for m in M:
    df['MA_'+str(m)]=df['Open'].rolling(m).mean()
    

# Retours sur investissement 

R=[1,5,10,15]

for r in R:
    df['R_'+str(r)]=df['Open'].pct_change(r)
    
# Volatilité 

V=[5,10,15]

for v in V:
    df['V_'+str(v)]=df['R_1'].rolling(v).std()


df['target']=df['V_5'].shift(-5)

df['target']=df['target'].apply(lambda x : 1 if x>0.011 else 0)

df=df.dropna()

In [29]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,Open_1,Open_2,Open_3,...,MA_15,MA_20,R_1,R_5,R_10,R_15,V_5,V_10,V_15,target
19,1962-01-29,5.9951,6.0373,5.8952,5.8952,700585,0,6.0287,6.0956,6.0624,...,6.144040,6.191645,-0.005573,-0.023186,-0.043966,-0.033687,0.006540,0.006214,0.007786,1
20,1962-01-30,5.8952,5.9285,5.8201,5.8201,889207,0,5.9951,6.0287,6.0956,...,6.129000,6.165755,-0.016664,-0.036858,-0.059897,-0.036858,0.008251,0.007060,0.008154,1
21,1962-01-31,5.8784,6.0036,5.8784,6.0036,916151,0,5.8952,5.9951,6.0287,...,6.109513,6.142785,-0.002850,-0.030351,-0.049956,-0.047369,0.008370,0.006644,0.007607,1
22,1962-02-01,6.0703,6.1621,6.0703,6.1457,772443,0,5.8784,5.8952,5.9951,...,6.101700,6.126485,0.032645,-0.004151,-0.009594,-0.018941,0.019369,0.013439,0.011953,0
23,1962-02-02,6.1457,6.1875,6.1208,6.1875,610767,0,6.0703,5.8784,5.8952,...,6.094460,6.117715,0.012421,0.019407,0.002708,-0.017364,0.019085,0.014088,0.012077,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14054,2017-11-06,150.2700,150.3200,148.7900,149.3500,4543499,0,151.9000,152.6500,152.5700,...,153.980667,152.038500,-0.010731,-0.012940,-0.063447,0.030941,0.005519,0.008324,0.021546,0
14055,2017-11-07,149.8700,150.0100,149.0100,149.8500,3729501,0,150.2700,151.9000,152.6500,...,154.293333,152.219500,-0.002662,-0.019175,-0.052176,0.032305,0.004316,0.007888,0.021521,0
14056,2017-11-08,150.1000,150.2800,148.7900,150.0700,4679520,0,149.8700,150.2700,151.9000,...,153.928667,152.378000,0.001535,-0.016189,-0.028290,-0.035161,0.004904,0.005062,0.009872,0
14057,2017-11-09,149.9300,151.8000,149.8600,150.3000,4776388,0,150.1000,149.8700,150.2700,...,153.376000,152.569500,-0.001133,-0.017819,-0.018654,-0.052395,0.004634,0.004198,0.008318,0


## Création de la pipeline

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer,StandardScaler

def drop_columns(df):
    to_drop=['Date','High','Low','Close','Volume','OpenInt']
    
    return df.drop(to_drop,axis=1)


col_dropper=FunctionTransformer(drop_columns)

scaler=StandardScaler()

model=GradientBoostingClassifier()


pipeline=Pipeline([
    ("drop_columns",col_dropper),
    ("scaling",scaler),
    ("model",model)
])

In [36]:
X=df.drop('target',axis=1)
y=df.target

X_train=X.iloc[:int(len(X)*0.7)]
y_train=y.iloc[:int(len(y)*0.7)]

X_test=X.iloc[int(len(X)*0.7):]
y_test=y.iloc[int(len(y)*0.7):]

In [37]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('drop_columns',
                 FunctionTransformer(func=<function drop_columns at 0x128db85e0>)),
                ('scaling', StandardScaler()),
                ('model', GradientBoostingClassifier())])

In [38]:
from sklearn.metrics import classification_report as cr

In [39]:
y_pred=pipeline.predict(X_test)

In [40]:
print(cr(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.72      0.52      0.60      2463
           1       0.51      0.72      0.60      1749

    accuracy                           0.60      4212
   macro avg       0.62      0.62      0.60      4212
weighted avg       0.63      0.60      0.60      4212

