## Import des bibliothèques

In [1]:
pip install googleapis-common-protos protobuf grpcio pandas systemathics.apis statsmodels matplotlib seaborn  

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re
import grpc
import pandas as pd
import seaborn
import statsmodels.api as sm
import google.protobuf as pb
import systemathics.apis.services.static_data.v1.static_data_pb2 as static_data
import systemathics.apis.services.static_data.v1.static_data_pb2_grpc as static_data_service
import numpy as np
from statsmodels.tsa.stattools import adfuller,coint
import matplotlib.pyplot as plt
import systemathics.apis.type.shared.v1.identifier_pb2 as identifier
import systemathics.apis.services.daily.v1.daily_prices_pb2 as daily_prices
import systemathics.apis.services.daily.v1.daily_prices_pb2_grpc as daily_prices_service
from datetime import datetime
import itertools
import copy

## Authentification

In [3]:
token = f"Bearer {os.environ['AUTH0_TOKEN']}"
display(token)

'Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6ImpwZDhjS2Z5Zi13QXkzOURpNENqWSJ9.eyJpc3MiOiJodHRwczovL2dhbnltZWRlLXByb2QuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDYxNmQ4NzI5NWQzZDlkMDA3MGVkYmIxNCIsImF1ZCI6WyJodHRwczovL3Byb2QuZ2FueW1lZGUtcHJvZCIsImh0dHBzOi8vZ2FueW1lZGUtcHJvZC5ldS5hdXRoMC5jb20vdXNlcmluZm8iXSwiaWF0IjoxNjQxMjQ0ODc5LCJleHAiOjE2NDM4MzY4NzksImF6cCI6Ijl5R0tzbGtFczFWNm9xRk9aa0h0a1V0NWkyNTVackpJIiwic2NvcGUiOiJvcGVuaWQgcHJvZmlsZSBlbWFpbCIsInBlcm1pc3Npb25zIjpbInNlcnZpY2VzOmJhc2ljIl19.q5iE74mLR6xWwMEQvbuOWgym_XB-H3GVOHDRm33fAAzTS9-G4BzxyLWGio7aKBPO2oXvnbtyiXru8K08Qso_gFkhOZEpcLxZQlUDRmqz2O7dwn6Sqz0zmh6tdxEBOR1PliB_X9EiLUQTWZLpTUwUKgWQrOi1DjkZ5M1FsM4m-QyKTxNAv8NJfyY5Vhfbldvj94ZasJHaueeYhOh0wEhojxMTeYap9TJLDKXXkxb3sfnqJKvrHnftxKFC-61mtx7SAk9n4JOvXBYt9cbsXS7U4n7rMogJ51l4p0RJE2iqUW5DqUxhabISfKt9M0eQ7NLnCVD-KPFw1Og524AGhkHiuA'

# Sélection des paires

## Choix des indicateurs de sélection

### Correlation

In [4]:
def correlation(timeseries1,timeseries2):
    return np.corrcoef(sample1_sum,sample2_sum)[0,1]
# Correlation need to be near 1

### Stationarity

In [5]:
def stationarity_test_bool(timeseries,cutoff=0.01):
    # H_0 in adfuller is unit root exists (non-stationary)
    # We must observe significant p-value to convince ourselves that the series is stationary
    pvalue = adfuller(timeseries)[1]
    return True if pvalue<cutoff else False

In [6]:
def stationarity_test_pvalue(timeseries,cutoff=0.01):
    # H_0 in adfuller is unit root exists (non-stationary)
    # We must observe significant p-value to convince ourselves that the series is stationary
    return adfuller(timeseries)[1]

### Cointegration

In [7]:
def cointegration_test(timeseries1,timeseries2):
    return coint(timeseries1,timeseries2)[1]    # return the p-value of the test

# Low pvalue means high cointegration!

### Standardized data

In [8]:
def mean_norm(df_input):
    df_input.loc[:, df_input.columns != "Date"] = df_input.loc[:, df_input.columns != "Date"].apply(lambda x: (x-x.mean())/ x.std(), axis=0)
    return df_input
#We have to choose the best way to standardized the dataframe here

## Application des indicateurs de sélection

### Recueil des données

#### Recueil des tickers

In [9]:
# define a method to handle the equities reponse using a Pandas dataframe
def get_equities_dataframe(response):
    identifier = ['{0}|{1}'.format(equity.identifier.ticker, equity.identifier.exchange) for equity in response.equities]
    type = [equity.type for equity in response.equities]
    country = [equity.country for equity in response.equities]
    name = [equity.name for equity in response.equities]
    currency = [equity.currency for equity in response.equities]
    primary = [equity.primary for equity in response.equities]
    tick_size_rule = [equity.tick_size_rule for equity in response.equities]
    mapping = [get_mapping(equity.mapping) for equity in response.equities]
    index = [equity.index for equity in response.equities]
    open = [equity.open for equity in response.equities]
    close = [equity.close for equity in response.equities]
    time_zone = [equity.time_zone for equity in response.equities]
    lot_size = [equity.lot_size for equity in response.equities]
    point_value = [equity.point_value for equity in response.equities]
    isin = [equity.isin for equity in response.equities]
    cusip = [equity.cusip for equity in response.equities]
    sedol = [equity.sedol for equity in response.equities]
    sectors = [get_sectors(equity.sectors) for equity in response.equities]
    capitalization = [equity.capitalization.value for equity in response.equities]
    
    # Create pandas dataframe
    d = {'Identifier': identifier, 'Type': type, 'Country': country, 'Name': name, 'Currency': currency, 'Primary': primary, 'TickSizeRule': tick_size_rule, 'Mapping':mapping, 'Index': index, 'Open': open, 'Close': close, 'Time zone': time_zone, 'Lot size': lot_size, 'PointValue': point_value, 'Isin': isin, 'Cusip': cusip, 'Sedol': sedol, 'Sectors': sectors, 'Capitalization': capitalization}
    df = pd.DataFrame(data=d)
    return df

In [10]:
# define methods to handle identifiers mapping and sectors display as a string
def get_mapping(d):
    res=''
    for key, value in d.items():
        res = res + '['+key+'='+value+']'
    return res

def get_sectors(d):
    res=''
    for key, value in d.items():
        res = res + '['+key+','+value+']'
    return res

def get_identifier(d):
    res=''
    for key, value in d.items():
        res = res + '['+key+'='+value+']'
    return res

In [11]:
# generate static data request
request = static_data.StaticDataRequest( 
    asset_type = static_data.AssetType.ASSET_TYPE_EQUITY
)

request.index.value = 'NASDAQ 100'
request.exchange.value = 'XNGS'     # Requête qui ne filtre que la bourse primaire mais pas la bourse réelle
request.count.value = 1000

In [12]:
# open a gRPC channel
with open(os.environ['SSL_CERT_FILE'], 'rb') as f:
    credentials = grpc.ssl_channel_credentials(f.read())
with grpc.secure_channel(os.environ['GRPC_APIS'], credentials) as channel:
    
    # instantiate the static data service
    service = static_data_service.StaticDataServiceStub(channel)
    
    # process the request
    response = service.StaticData(request = request, metadata = [('authorization', token)])

# visualize request results
data = get_equities_dataframe(response)

def drop_others_exch(data):
    count = 0
    for i in range(len(data)):
        exch = data.iloc[count]['Identifier'].split('|')[1]
        if exch != request.exchange.value:
            data.drop(i, inplace = True)
            count -= 1
        count += 1  


drop_others_exch(data)       # Cette fonction réctifie le problème du filtre de la bourse dans la requete qui ne filtre pas complètement
display(data.sort_values(['Identifier']))

Unnamed: 0,Identifier,Type,Country,Name,Currency,Primary,TickSizeRule,Mapping,Index,Open,Close,Time zone,Lot size,PointValue,Isin,Cusip,Sedol,Sectors,Capitalization
70,AAPL|XNGS,Equity,US,Apple Inc Common Stock,USD,XNGS,[0:0.0001][1:0.01],[Esignal=AAPL][Idc|564=564|AAPL][Bloomberg=AAP...,Composite|Industrials|Nasdaq 100|Nasdaq Compos...,,,,1,1.0,US0378331005,037833100,2046251,"[SIC,3571 Electronic Computers][Nasdaq,Compute...",3.078591e+12
38,ADBE|XNGS,Equity,US,Adobe Inc,USD,XNGS,[0:0.0001][1:0.01],[Bloomberg=ADBE US Equity][Idc|564=564|ADBE][F...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US00724F1012,00724F101,2008154,"[Nasdaq,Computer Software: Prepackaged Softwar...",2.698071e+11
80,ADI|XNGS,Equity,US,Analog Devices Inc,USD,XNGS,[0:0.0001][1:0.01],[Figic=BBG000BB6G37][Idc|564=564|ADI][Figi=BBG...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US0326541051,032654105,2032067,"[Nasdaq,Semiconductors][SIC,3674 Semiconductor...",9.233737e+10
60,ADP|XNGS,Equity,US,Automatic Data Processing Inc,USD,XNGS,[0:0.0001][1:0.01],[Figi=BBG000JG0547][Bloomberg=ADP US Equity][E...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US0530151036,053015103,2065308,"[SIC,7374 Services-Computer Processing & Data ...",1.039049e+11
87,ADSK|XNGS,Equity,US,Autodesk Inc,USD,XNGS,[0:0.0001][1:0.01],[Idc|564=564|ADSK][Figi=BBG000BM7HL0][Figic=BB...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US0527691069,052769106,2065159,"[SIC,7372 Services-Prepackaged Software][Nasda...",6.185433e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,WBA|XNGS,Equity,US,Walgreens Boots Alliance Inc,USD,XNGS,[0:0.0001][1:0.01],[Esignal=WBA][Figic=BBG000BWLMJ4][Bloomberg=WB...,Composite|Industrials|Nasdaq 100|Nasdaq Compos...,,,,1,1.0,US9314271084,931427108,BTN1Y44,"[SIC,5912 Retail-Drug Stores and Proprietary S...",4.506336e+10
10,WDAY|XNGS,Equity,US,Workday Inc,USD,XNGS,[0:0.0001][1:0.01],[Figic=BBG000VC0T95][Figi=BBG000VC0T95][Bloomb...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US98138H1014,98138H101,B8K6ZD1,"[SIC,7374 Services-Computer Processing & Data ...",6.829500e+10
25,XEL|XNGS,Equity,US,Xcel Energy Inc,USD,XNGS,[0:0.0001][1:0.01],[Figi=BBG000BCTQ65][Idc|564=564|XEL][Figic=BBG...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US98389B1008,98389B100,2614807,"[SIC,4931 Electric & Other Services Combined][...",3.646834e+10
52,XLNX|XNGS,Equity,US,Xilinx Inc Common Stock,USD,XNGS,[0:0.0001][1:0.01],[Esignal=XLNX][Figi=BBG000C0F570][Idc|564=564|...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US9839191015,983919101,2985677,"[SIC,3674 Semiconductors & Related Devices][Na...",5.255808e+10


### Recupération des SIC (secteur)

In [13]:
def get_sic(data):      # Fonction qui permet de récupérer le code SIC d'un tableau d'equities contenant la colone ['Sectors']
    sic = []            # Cette fonction ajoute au dataframe une nouvelle colone appelée ['SIC'] au dataframe d'equities
    for i in range(len(data)):
        ligne = data.iloc[i]['Sectors']
        match = re.search(r"SIC,([0-9]{2})", ligne)
        sic.append(match.group().split(",")[1])
    data['SIC'] = sic
    return data

In [14]:
data = get_sic(data)
data

Unnamed: 0,Identifier,Type,Country,Name,Currency,Primary,TickSizeRule,Mapping,Index,Open,Close,Time zone,Lot size,PointValue,Isin,Cusip,Sedol,Sectors,Capitalization,SIC
0,MELI|XNGS,Equity,US,Mercadolibre Inc,USD,XNGS,[0:0.0001][1:0.01],[Bloomberg=MELI US Equity][Figic=BBG000GQPB11]...,Nasdaq 100|Nasdaq Composite,,,,1,1.0,US58733R1023,58733R102,B23X1H3,"[Nasdaq,Catalog/Specialty Distribution][SIC,73...",6.803886e+10,73
1,CHKP|XNGS,Equity,IL,Check Point Software Technologies Ltd,USD,XNGS,[0:0.0001][1:0.01],[Bloomberg=CHKP US Equity][Esignal=CHKP][Idc|5...,Nasdaq 100|Nasdaq Composite,,,,1,1.0,IL0010824113,M22465104,2181334,"[SIC,7372 Services-Prepackaged Software][Nasda...",1.547896e+10,73
2,OKTA|XNGS,Equity,US,Okta Inc Cl A,USD,XNGS,[0:0.0001][1:0.01],[Figic=BBG001YV1SM4][Bloomberg=OKTA US Equity]...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US6792951054,679295105,BDFZSP1,"[Nasdaq,EDP Services][SIC,7372 Services-Prepac...",3.486826e+10,73
3,SIRI|XNGS,Equity,US,Sirius Xm Holdings Inc,USD,XNGS,[0:0.0001][1:0.01],[Figic=BBG000BT0093][Bloomberg=SIRI US Equity]...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US82968B1035,82968B103,BGLDK10,"[SIC,4832 Radio Broadcasting Stations][Nasdaq,...",2.539679e+10,48
4,MRVL|XNGS,Equity,US,Marvell Technology Inc,USD,XNGS,[0:0.0001][1:0.01],[Idc|564=564|MRVL][Bloomberg=MRVL US Equity][F...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US5738741041,573874104,BNKJSM5,"[Nasdaq,Semiconductors][SIC,3674 Semiconductor...",7.382406e+10,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,AMD|XNGS,Equity,US,Advanced Micro Devices Inc,USD,XNGS,[0:0.0001][1:0.01],[Bloomberg=AMD US Equity][Figic=BBG000BBQCY0][...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US0079031078,007903107,2007849,"[Nasdaq,Semiconductors][SIC,3674 Semiconductor...",1.737751e+11,36
92,LRCX|XNGS,Equity,US,Lam Research Corp,USD,XNGS,[0:0.0001][1:0.01],[Bloomberg=LRCX US Equity][Figi=BBG000BNFLM9][...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US5128071082,512807108,2502247,"[Nasdaq,Industrial Machinery/Components][SIC,3...",1.012554e+11,35
93,ILMN|XNGS,Equity,US,Illumina Inc,USD,XNGS,[0:0.0001][1:0.01],[Esignal=ILMN][Figi=BBG000DSMS70][Idc|564=564|...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US4523271090,452327109,2613990,"[Nasdaq,Medical Specialities][SIC,3826 Laborat...",5.946277e+10,38
94,INTU|XNGS,Equity,US,Intuit Inc Common Stock,USD,XNGS,[0:0.0001][1:0.01],[Figi=BBG000BH5DV1][Esignal=INTU][Figic=BBG000...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US4612021034,461202103,2459020,"[SIC,7372 Services-Prepackaged Software][Nasda...",1.821386e+11,73


### Liste de Dataframes par SIC

In [15]:
def sep_secteur(data):
    groups = data.groupby(['SIC'])
    liste_sic = data['SIC'].unique()
    liste_sic.sort()
    df_SIC = []
    for i in range(len(liste_sic)):
        df_SIC.append(groups.get_group(liste_sic[i]))
    return df_SIC

In [16]:
data_sec = sep_secteur(data)   # Liste de DF avec entreprises par secteur (selon le SIC)

In [17]:
data_sec[3]

Unnamed: 0,Identifier,Type,Country,Name,Currency,Primary,TickSizeRule,Mapping,Index,Open,Close,Time zone,Lot size,PointValue,Isin,Cusip,Sedol,Sectors,Capitalization,SIC
68,CSCO|XNGS,Equity,US,Cisco Systems Inc Common Stock De,USD,XNGS,[0:0.0001][1:0.01],[Esignal=CSCO][Figi=BBG000C3J3C9][Figic=BBG000...,Composite|Industrials|Nasdaq 100|Nasdaq Compos...,,,,1,1.0,US17275R1023,17275R102,2198163,"[Nasdaq,Computer peripheral equipment][SIC,357...",267269700000.0,35
70,AAPL|XNGS,Equity,US,Apple Inc Common Stock,USD,XNGS,[0:0.0001][1:0.01],[Esignal=AAPL][Idc|564=564|AAPL][Bloomberg=AAP...,Composite|Industrials|Nasdaq 100|Nasdaq Compos...,,,,1,1.0,US0378331005,037833100,2046251,"[SIC,3571 Electronic Computers][Nasdaq,Compute...",3078591000000.0,35
92,LRCX|XNGS,Equity,US,Lam Research Corp,USD,XNGS,[0:0.0001][1:0.01],[Bloomberg=LRCX US Equity][Figi=BBG000BNFLM9][...,Nasdaq 100|Nasdaq Composite|Russell 1000|Russe...,,,,1,1.0,US5128071082,512807108,2502247,"[Nasdaq,Industrial Machinery/Components][SIC,3...",101255400000.0,35


In [18]:
def get_prices_df(equity_data):     # equity_data est un tableau de d'equities
    liste_df = []
    for i in range(len(equity_data)):
        id = equity_data.iloc[i]['Identifier'].split('|')
        ticker, exchange = id[0],id[1]
        SIC = equity_data.iloc[i]['SIC']
        request = daily_prices.DailyPricesRequest( identifier = identifier.Identifier(exchange = exchange, ticker = ticker))
        
        # open a gRPC channel
        with open(os.environ['SSL_CERT_FILE'], 'rb') as f:
            credentials = grpc.ssl_channel_credentials(f.read())
        with grpc.secure_channel(os.environ['GRPC_APIS'], credentials) as channel:

            # instantiate the daily prices service
            service = daily_prices_service.DailyPricesServiceStub(channel)

            # process the daily prices request
            response = service.DailyPrices(
            request = request, 
            metadata = [('authorization', token)]
            )
        
        # prepare the dataframe content
        dates=[datetime(p.date.year, p.date.month, p.date.day) for p in response.data]
        prices = [p.price for p in response.data]

        d = {'Date': dates, f'{ticker}': prices}
        liste_df.append(pd.DataFrame(data=d))
    return [SIC, liste_df]

### Application de la séparation par SIC (secteur)

In [19]:
# On cherche à obtenir les prix de chaque action en les classant par secteur
df = []
for i in range(len(data_sec)):    
    df.append(get_prices_df(data_sec[i]))     # df est une liste de dataframe contenant les prix des equities par secteur ainsi que le sic
                                              # df est de la forme [[SIC,[df,df]],[SIC,[df,df,df]],[SIC,[df,df]].....]

# Puis on merge les DF au sein des listes     
# Pour former  liste_df [[SIC,df],[SIC,df],[SIC,df],[SIC,df]]
liste_df = []
for i in range(len(df)):
    concat = df[i][1][0]
    for j in range(1,len(df[i][1])):
        concat = concat.merge(df[i][1][j], on = "Date")
    liste_df.append([df[i][0], concat])

In [20]:
print(df[6])

['38', [           Date    DXCM
0    2005-04-14   11.74
1    2005-04-15   10.25
2    2005-04-18   10.50
3    2005-04-19   10.58
4    2005-04-20   10.60
...         ...     ...
4206 2021-12-28  529.50
4207 2021-12-29  535.40
4208 2021-12-30  541.31
4209 2021-12-31  536.95
4210 2022-01-03  520.79

[4211 rows x 2 columns],            Date     ALGN
0    2001-01-26   17.313
1    2001-01-29   18.063
2    2001-01-30   16.875
3    2001-01-31   13.500
4    2001-02-01   14.188
...         ...      ...
5263 2021-12-28  656.370
5264 2021-12-29  653.340
5265 2021-12-30  662.220
5266 2021-12-31  657.180
5267 2022-01-03  648.050

[5268 rows x 2 columns],            Date        ISRG
0    2001-01-02    1.812444
1    2001-01-03    1.944444
2    2001-01-04    1.764000
3    2001-01-05    1.722222
4    2001-01-08    1.666667
...         ...         ...
5280 2021-12-28  364.540000
5281 2021-12-29  365.270000
5282 2021-12-30  363.300000
5283 2021-12-31  359.300000
5284 2022-01-03  360.000000

[5285 rows x 2 

## Création de toutes les paires possibles

#### Récupération des paires dans chaque secteur

In [21]:
def get_list_paires_possibles(liste_df):       # On cherche les paires possibles pour chaque secteur
    paires = []
    for i in range(len(liste_df)):       # Pour chaque secteur
        paires.append(liste_df[i][1].columns.tolist()[1:])
    return paires

#### Récupération des combinaisons dans chaque secteur

In [22]:
def get_combinations(liste_paires):   # On fait une combinaison de toutes les facons possibles de faire des paires
    combi = []
    for i in range(len(liste_paires)):
        combi.append(list(itertools.combinations(liste_paires[i],2)))
    return combi

In [23]:
liste_paires = get_list_paires_possibles(liste_df)  # de la forme [['AAPL','GOOGL'], ['NVDA','AMD'], ...]
combinaisons = get_combinations(liste_paires)  # Listes de tuples avec toutes les combinaisons possibles

### Requete de prix pour chaque paire possible

#### Fonction qui va requêter les prix à travers l'API

In [24]:
def df_paire(paire, exchange):
    request1 = daily_prices.DailyPricesRequest( identifier = identifier.Identifier(exchange = exchange, ticker = paire[0]))
    request2 = daily_prices.DailyPricesRequest( identifier = identifier.Identifier(exchange = exchange, ticker = paire[1]))
        
    # open a gRPC channel
    with open(os.environ['SSL_CERT_FILE'], 'rb') as f:
        credentials = grpc.ssl_channel_credentials(f.read())
    with grpc.secure_channel(os.environ['GRPC_APIS'], credentials) as channel:

        # instantiate the daily prices service
        service = daily_prices_service.DailyPricesServiceStub(channel)

        # process the daily prices request
        response1 = service.DailyPrices(request = request1, metadata = [('authorization', token)])
        response2 = service.DailyPrices(request = request2, metadata = [('authorization', token)])

    # prepare the dataframe content
    dates1 = [datetime(p.date.year, p.date.month, p.date.day) for p in response1.data]
    dates2 = [datetime(p.date.year, p.date.month, p.date.day) for p in response2.data]
    if (len(dates1) <= len(dates2)):
        dates = dates1
    else:
        dates = dates2
    prices1 = [p.price for p in response1.data][-len(dates):]       # So all arrays have the same length
    prices2 = [p.price for p in response2.data][-len(dates):]
    
    d = {'Date': dates, f'{paire[0]}': prices1, f'{paire[1]}': prices2}
    return pd.DataFrame(data=d)

In [25]:
df_paire(combinaisons[0][0],'XNGS')

Unnamed: 0,Date,KDP,KHC
0,2018-07-10,22.19,64.00
1,2018-07-11,24.00,63.74
2,2018-07-12,25.00,63.64
3,2018-07-13,24.25,63.85
4,2018-07-16,24.80,62.64
...,...,...,...
874,2021-12-28,36.37,35.86
875,2021-12-29,36.58,35.81
876,2021-12-30,36.68,35.66
877,2021-12-31,36.86,35.90


#### Fonction qui agrège les requêtes entre elle pour classer les paires suivant leur groupe

In [26]:
def get_combinaisons_df_opti(combinaisons, exchange):
    return [[df_paire(x, exchange) for x in groupe] for groupe in combinaisons]

In [140]:
%%time
df_combi = get_combinaisons_df_opti(combinaisons, 'XNGS')  # Représente une liste de liste avec les paires triés par groupe [#Groupe1[df,df,df,df], #Groupe2[df,df,df,df], [df,df], ...] Les groupes représentant les secteurs

CPU times: user 25.1 s, sys: 0 ns, total: 25.1 s
Wall time: 7min 36s


### Calcul du nombre de Paires crées

In [28]:
nb = 0
for i in range(len(df_combi)):
    nb += len(df_combi[i])
"Nombre de paires = " + str(nb)   # Nombre de paires

'Nombre de paires = 576'

## Séparation entre les données de test et celles d'entrainement

In [135]:
def sep_tableau_train_test(df):    # Séparation entre les données de test et de train
    sub_train = []
    sub_test = []
    test = []
    train = []
    for i in range(len(df)):
        for j in range(len(df[i])):
            long = df[i][j].shape[0]
            sub_train.append(df[i][j].iloc[0 : int(long/2)])   #int(long/2)
            sub_test.append(df[i][j].iloc[int(long/2)+1 : long-1])
        train.append(sub_train)
        test.append(sub_test)
    train_df = copy.deepcopy(train)
    test_df = copy.deepcopy(test)
    return train_df, test_df

In [141]:
train, test = sep_tableau_train_test(df_combi)

## Fonctions de Sélections des paires

#### Calcul de la cointégration

In [180]:
def coint_df(df):
    df.loc[df.index[0], 'Cointégration'] = cointegration_test(df.iloc[:,1], df.iloc[:,2])
    global i
    print(f"{list(df.columns)[1:3]} & i = {i}")
    i += 1
    return df

In [123]:
def add_coint(df_combi):
    return [[coint_df(x) for x in groupe] for groupe in df_combi]

#### Calcul de la stationnarité

In [33]:
def statio_df(df, column_name, indicateur):
    df[column_name] = pd.Series(stationarity_test_pvalue(df[indicateur]), index = df.index[[0]])
    return df

In [34]:
def add_statio(df, column_name, indicateur):
    return [[statio_df(x, column_name, indicateur) for x in groupe] for groupe in df]

#### Calcul du ratio

In [35]:
def ratio_df(df):
    df['Ratio'] = df.iloc[:,1] / df.iloc[:,2]
    return df

In [36]:
def add_ratio_list_df(df):
    return add_statio(df, 'Statio_Ratio', 'Ratio')

#### Calcul du log-ratio

In [37]:
def log_ratio_df(df):
    df['Log_Ratio'] = np.log(df.iloc[:,1] / df.iloc[:,2])
    return df

In [38]:
def add_log_list_df(df):
    return add_statio(df, 'Statio_Log_Ratio', 'Log_Ratio')

##### Ajout des ratios au df

In [39]:
def add_ratios(df):
    df = [[ratio_df(x) for x in groupe] for groupe in df]
    return [[log_ratio_df(x) for x in groupe] for groupe in df]

#### Fonction qui enlève les paires peu cointégrées

In [40]:
def drop_faible_coint(df_combi, threshold):    # Retourne une liste par secteur de paires suffisament cointégrés
    df_combi_copy = []
    for i in range(len(df_combi)):
        df_combi_copy.append([x for x in df_combi[i] if x['Cointégration'][0] < threshold])
    for x in df_combi_copy:
        if x == []:
            df_combi_copy.remove([])
    return df_combi_copy

#### Fonction qui enlève les paires dont les 2 indicateurs sont non-stationnaires

In [41]:
def drop_faible_statio(df_combi, threshold):    # Retourne une liste par secteur de paires suffisament cointégrés
    df_combi_copy = []
    count_ratio = 0
    count_log = 0
    same = 0
    for i in range(len(df_combi)):
        df_combi_copy.append([x for x in df_combi[i] if (x['Statio_Log_Ratio'][0] < threshold or x['Statio_Ratio'][0] < threshold)])
        for x in df_combi[i]:
            if x['Statio_Ratio'][0] < threshold:     # On compte le nombre de stationnarité acceptable entre le ratio classique et le log_ratio
                count_ratio += 1
            if x['Statio_Log_Ratio'][0] < threshold:
                count_log += 1
            if x['Statio_Log_Ratio'][0] < threshold and x['Statio_Ratio'][0] < threshold:
                same += 1
    for x in df_combi_copy:
        if x == []:
            df_combi_copy.remove([])
    return df_combi_copy, count_ratio, count_log, same

#### Cointégration

In [181]:
i = 0

In [None]:
%%time
df_coint = add_coint(train)

['KDP', 'KHC'] & i = 0
['KDP', 'PEP'] & i = 1
['KDP', 'MDLZ'] & i = 2
['KDP', 'MNST'] & i = 3
['KHC', 'PEP'] & i = 4
['KHC', 'MDLZ'] & i = 5
['KHC', 'MNST'] & i = 6
['PEP', 'MDLZ'] & i = 7
['PEP', 'MNST'] & i = 8
['MDLZ', 'MNST'] & i = 9
['LULU', 'CTAS'] & i = 10
['SGEN', 'GILD'] & i = 11
['SGEN', 'BIIB'] & i = 12
['SGEN', 'MRNA'] & i = 13
['SGEN', 'AMGN'] & i = 14
['SGEN', 'IDXX'] & i = 15
['SGEN', 'VRTX'] & i = 16
['SGEN', 'REGN'] & i = 17
['GILD', 'BIIB'] & i = 18
['GILD', 'MRNA'] & i = 19
['GILD', 'AMGN'] & i = 20
['GILD', 'IDXX'] & i = 21
['GILD', 'VRTX'] & i = 22
['GILD', 'REGN'] & i = 23
['BIIB', 'MRNA'] & i = 24
['BIIB', 'AMGN'] & i = 25
['BIIB', 'IDXX'] & i = 26
['BIIB', 'VRTX'] & i = 27
['BIIB', 'REGN'] & i = 28
['MRNA', 'AMGN'] & i = 29
['MRNA', 'IDXX'] & i = 30
['MRNA', 'VRTX'] & i = 31
['MRNA', 'REGN'] & i = 32
['AMGN', 'IDXX'] & i = 33
['AMGN', 'VRTX'] & i = 34
['AMGN', 'REGN'] & i = 35
['IDXX', 'VRTX'] & i = 36
['IDXX', 'REGN'] & i = 37
['VRTX', 'REGN'] & i = 38
['CSCO',

In [148]:
df_coint[0][0]

Unnamed: 0,Date,KDP,KHC,Cointégration
0,2018-07-10,22.19,64.0,0.9859
1,2018-07-11,24.0,63.74,
2,2018-07-12,25.0,63.64,
3,2018-07-13,24.25,63.85,
4,2018-07-16,24.8,62.64,
5,2018-07-17,24.96,63.05,
6,2018-07-18,24.79,61.59,
7,2018-07-19,24.43,61.34,
8,2018-07-20,24.57,60.69,
9,2018-07-23,24.28,60.3,


### Fonction qui agrège les différentes fonctions de sélection (Log-Ratio, Ratio, Stationnarité des indicateurs) et qui enlève les paires qui ne valident pas les critères.

In [149]:
def pair_selection(df_coint):
    df = drop_faible_coint(df_coint, 0.05)
    df = add_ratios(df)
    df = add_log_list_df(df)
    df = add_ratio_list_df(df)
    return drop_faible_statio(df, 0.05)

In [150]:
%%time
df_final, ratio_score, log_score, same = pair_selection(df_coint)

CPU times: user 798 ms, sys: 0 ns, total: 798 ms
Wall time: 802 ms


## Score de la sélection des paires

In [151]:
"Score ratio = " + str(ratio_score) + " Score Log_Ratio = " + str(log_score) + " Same = " + str(same)

'Score ratio = 0 Score Log_Ratio = 0 Same = 0'

In [152]:
#We plot a random cointegrate pair to verify if everything is ok
plt.figure(figsize=(25, 10))
plt.plot('Date', df_final[3][0].columns[1], data=df_final[3][0], marker='', color='blue', linewidth=1, alpha = 0.6, label=df_final[3][0].columns[1])
plt.plot('Date', df_final[3][0].columns[2], data=df_final[3][0], marker='', color='red', linewidth=1, label=df_final[3][0].columns[2])
plt.ylabel('Price')
plt.xlabel('Date')
plt.title("{} and {} prices".format(df_final[3][0].columns[1],df_final[3][0].columns[2]))
plt.show()

IndexError: list index out of range

<Figure size 1800x720 with 0 Axes>

### Signals

In [None]:
def linear_regression(df): #prends en entrée un dataframe représentant une paire
# Engle-Granger method (spread method)
    S1=df[df.columns[1]]
    S2=df[df.columns[2]]
    S1 = sm.add_constant(S1)
    results = sm.OLS(S2, S1).fit()
    S1 = S1[df.columns[1]]
    b = results.params[df.columns[1]]
    spread = S2 - b * S1
    df['Spread']=spread
    ''' Plot for seeing 
    spread.plot(figsize=(12,6))
    plt.axhline(spread.mean(), color='black')
    plt.legend(['Spread']);
    '''
    return df

In [None]:
def zscore(df): #we standardized
    df["Zscore"]=(df["Spread"] - df["Spread"].mean()) / np.std(df["Spread"])
    return df

In [None]:
def rendement(df,j):
    #calculate the yield of the last j days
    rend1=[0 for i in range(j)]
    rend2=[0 for i in range(j)]
    for i in range(j,df.shape[0]):
        rend1.append((df.loc[i,df.columns[1]]-df.loc[i-j,df.columns[1]]) / df.loc[i-j,df.columns[1]])
        rend2.append((df.loc[i,df.columns[2]]-df.loc[i-j,df.columns[2]]) / df.loc[i-j,df.columns[2]])
    df["Rend_" + df.columns[1]]=rend1 
    df["Rend_" + df.columns[2]]=rend2 
    return df

In [None]:
def volatility(df,j):
    vol1=[0 for i in range(j)]
    vol2=[0 for i in range(j)]
    for i in range(j,df.shape[0]):
        vol1.append(df[df.columns[1]][i-j:i].std())
        vol2.append(df[df.columns[2]][i-j:i].std())
    df["Vol_" + df.columns[1]]=vol1 
    df["Vol_" + df.columns[2]]=vol2 
    return df
    

In [None]:
 find_stationarity(list_pairs) # All pair1-pair2 are not stationary (it makes sense)

In [None]:
for i in range(len(list_pairs)):
    if len(list_pairs[i])!=0:
        for j in range(len(list_pairs[i])):
            list_pairs[i][j]=linear_regression(list_pairs[i][j])
            list_pairs[i][j]=zscore(list_pairs[i][j])
            if list_pairs[i][j] is False: #we verify that each zscore series is stationary
                print(list_pairs[i][j].columns[2]+" - "+ list_pairs[i][j].columns[1] + " not stationary")
            list_pairs[i][j]=volatility(list_pairs[i][j],30)
            

            
            
list_pairs[13][0]["Zscore"].plot(figsize=(12,6))
plt.axhline(list_pairs[13][0]["Zscore"].mean())
plt.axhline(list_pairs[13][0]["Zscore"].std(), color='red')
plt.axhline(-list_pairs[13][0]["Zscore"].std(), color='green')
plt.show()

In [None]:
print(list_pairs[13][0])
print(adfuller(list_pairs[13][0]["Zscore"])[1])