In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import dateutil
import re
import os

In [2]:
# Base folder of the data files; used below to build the glob pattern.
boursorama_folder = 'data/boursorama'

In [3]:
# Number of files matching the 2019 "compA" name pattern.
compA_files = glob.glob(f'{boursorama_folder}/2019/compA 2019-*')
len(compA_files)

19948

## Loading files

In [4]:
# List the per-year sub-folders of the data directory, in numeric order.
# Only all-decimal entries are kept, so stray entries such as macOS
# '.DS_Store' are skipped whether or not they exist. The previous
# unconditional remove('.DS_Store') raised ValueError when the file was
# missing, and int() crashed on any other non-numeric entry.
years = sorted(int(entry) for entry in os.listdir('data/boursorama/') if entry.isdecimal())
years = [str(year) for year in years]

# One folder path per year, used as the prefix of the glob patterns below.
all_paths = ['./data/boursorama/' + year + '/' for year in years]
all_paths

['./data/boursorama/2019/',
 './data/boursorama/2020/',
 './data/boursorama/2021/',
 './data/boursorama/2022/',
 './data/boursorama/2023/']

In [None]:
%%time
# Load every compA file of every year into one frame, keyed by the timestamp
# parsed from the file name (the text after 'compA', minus its last 4
# characters -- presumably the file extension).
# Fix: this cell used to glob and split on 'amsterdam', which made `compA` a
# duplicate of the `amsterdam` frame loaded further down.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
compA = pd.concat({dateutil.parser.parse(f.split('compA')[1][:-4]): pd.read_pickle(f) for year_action in all_paths for f in glob.glob(year_action + 'compA*')})
compA.sort_index(inplace=True) # chronological order
compA.head()

In [6]:
# Build one frame from all compB files: the outer index level is the
# timestamp parsed from each file name, and rows are sorted chronologically.
compB = pd.concat({
    dateutil.parser.parse(pickle_path.split('compB')[1][:-4]): pd.read_pickle(pickle_path)
    for year_dir in all_paths
    for pickle_path in glob.glob(year_dir + 'compB*')
}).sort_index()
compB.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,last,volume,symbol,name
Unnamed: 0_level_1,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01 09:05:02.607291,1rPAAF,42.0,0,1rPAAF,ALAIN AFFLELOU SA
2019-01-01 09:05:02.607291,1rPAB,3.468,90743,1rPAB,AB SCIENCE
2019-01-01 09:05:02.607291,1rPABCA,6.08,20318,1rPABCA,ABC ARBITRAGE
2019-01-01 09:05:02.607291,1rPABEO,31.8,869,1rPABEO,ABEO
2019-01-01 09:05:02.607291,1rPABIO,18.9,6643,1rPABIO,ALBIOMA


In [None]:
# Build one frame from all amsterdam files: the outer index level is the
# timestamp parsed from each file name, and rows are sorted chronologically.
amsterdam = pd.concat({
    dateutil.parser.parse(pickle_path.split('amsterdam')[1][:-4]): pd.read_pickle(pickle_path)
    for year_dir in all_paths
    for pickle_path in glob.glob(year_dir + 'amsterdam*')
}).sort_index()
amsterdam.head()

## Prepare Dataframe

`last` should be numeric but isn't; the other columns already have the correct type.

In [None]:
# Missing-value count per column, shown together with the dtype summary.
# info() prints its report itself, so the second tuple element displays as None.
compA.isna().sum(), compA.info()

In [None]:
def floatify(x) -> float:
    """
    Convert a string to a float, dropping every character that is not a digit or a dot.
    Used because .str.replace(' ', '').astype(float) + pd.to_numeric worked only on strings.
    Doing the operation on a float would result in a NaN.

    Handle:
    - regular numeric (13, 0.14, 2.343): returned unchanged
    - string ('13.0', '1321.491823', '12  222.222', '34.23 (c)'): cleaned, then parsed

    Note that a minus sign is stripped as well, since only ``0-9`` and ``.`` are kept.

    :param x: str|float
    :return: the parsed float for a parsable string; otherwise ``x`` itself, unchanged
             (non-string input, or a string that is still not a valid float after cleaning)
    """
    # Non-strings (floats, ints, None, NaN, ...) pass through untouched. This
    # replaces the former reliance on a bare ``except`` catching the TypeError
    # raised by re.sub, which also swallowed KeyboardInterrupt and real bugs.
    if not isinstance(x, str):
        return x
    try:
        return float(re.sub(r'[^0-9.]', '', x))
    except ValueError:
        # Nothing parsable is left (e.g. '' or '1.2.3'): hand the value back as is.
        return x

In [None]:
# Table-driven sanity checks for floatify: (input, expected value).
for _raw, _expected in (('13', 13), ('13.0', 13), ('13.2 (c)', 13.2), (13.0, 13)):
    assert floatify(_raw) == _expected, f"Expected {_expected}, got {floatify(_raw)} with type {type(floatify(_raw))}"

In [None]:
%%time
# Clean each 'last' value with floatify, then cast the whole column to float.
compA['last'] = compA['last'].map(floatify).astype('float64')
# Per-column count of missing values after the conversion.
compA.isna().sum()

In [None]:
axa_symbol = '1rPCS'
# Plot the raw 'volume' column for the rows of that symbol.
compA[compA['symbol'].eq(axa_symbol)].plot(title=axa_symbol, y='volume')

## Compute volume at time T instead of cumulative volume of the day

The volumes in the dataframe are cumulative intra-day totals: the counter is reset to 0 every day.
The volume traded since the previous timestamp would be more useful.

In [None]:
# Volume traded since the previous row of the same symbol on the same calendar day.
# Fix: the fill step used the misspelled key 'voxlume_diff', which matches no
# column, so nothing was filled and the first row of every (symbol, day)
# group kept a NaN diff.
compA['volume_diff'] = (
    compA.groupby([compA.index.get_level_values('symbol'), compA.index.get_level_values(0).date])['volume']
    .diff()
    # diff() is NaN on the first row of each group: use that row's own volume there.
    .fillna(compA['volume'])
)
# Rows where the cumulative volume went down within a day.
compA[compA['volume_diff'] < 0]

In [None]:
# Plot 'volume_diff' for the rows of the AXA symbol.
compA[compA['symbol'].eq(axa_symbol)].plot(title=axa_symbol, y='volume_diff')

Some values are negative, which should not happen. Let's remove them.\
While we are at it, let's also check that the volume is correctly reset each day.

In [None]:
# AXA rows whose index label lies between 2019-01-03 17:35:00 and 2019-01-04 9:20:00.
compA[compA['symbol'] == axa_symbol].loc['2019-01-03 17:35:00':'2019-01-04 9:20:00']

In [None]:
# Keep only rows with a non-negative volume_diff (rows where it is NaN are dropped too).
compA = compA.loc[compA['volume_diff'].ge(0)]
compA.shape

In [None]:
# Same AXA plot as before, on the filtered frame.
compA[compA['symbol'].eq(axa_symbol)].plot(title="No more shit volume value", y='volume_diff')

Do we still have invalid volumes after removing them? Let's redo the diff to check.

In [None]:
# Recompute the per-(symbol, day) volume difference on the filtered frame;
# the first row of each group (NaN after diff) takes its own volume.
compA['volume_diff'] = (
    compA.groupby([compA.index.get_level_values('symbol'), compA.index.get_level_values(0).date])['volume']
    .diff()
    .fillna(compA['volume'])
)
# Rows that still show a negative difference.
compA[compA['volume_diff'] < 0]

Negative values show up again: removing a row changes the diff of the row that follows it.

In [None]:
# Repeat "recompute the per-day diff, drop the rows with a negative diff"
# until no negative diff is left: dropping a row changes the diff of the row
# after it, so a single pass is not enough.
# Fix: the count used to be taken *after* the `>= 0` filter, so it was always
# 0 and the loop stopped after one pass whatever the data looked like.
# Each pass that finds bad rows removes at least one row, so the loop ends.
while True:
    compA['volume_diff'] = compA.groupby([compA.index.get_level_values('symbol'), compA.index.get_level_values(0).date])['volume'].diff()
    compA.fillna({'volume_diff': compA.volume}, inplace=True)

    # Count on the freshly recomputed diff, before filtering.
    nb_bad_values = len(compA.loc[compA.volume_diff < 0])
    if nb_bad_values == 0:
        break
    print(nb_bad_values)

    # `>= 0` also discards rows whose diff is still NaN, as the original filter did.
    compA = compA[compA['volume_diff'] >= 0]

In [None]:
# Plot 'volume_diff' for the rows of symbol '1rPMC' after the clean-up loop.
compA[compA['symbol'].eq('1rPMC')].plot(title="No more shit volume value", y='volume_diff')

In [None]:
# Count the missing values left in each column after the volume clean-up.
compA.isna().sum()

# Day open/close/high/low

In [None]:
# Group the rows by (symbol index level, calendar date of the timestamp level)
# and derive open / high / low / close of 'last' for every group.
grouped = compA.groupby(
    [
        compA.index.get_level_values('symbol'),
        compA.index.get_level_values(0).date,
    ]
)
ohlc = grouped['last'].ohlc()
ohlc

Let's drop the rows that contain NaN values.

In [None]:
# Discard the (symbol, day) rows that contain a NaN in any OHLC column.
ohlc = ohlc.dropna(axis=0, how='any')
ohlc.tail()

Let's check that the values are correct for AXA on 2019-01-03.

In [None]:
ohlc.loc[axa_symbol].head()
# Reference values to compare against for 2019-01-03 (open, high, low, close):
# 18.664 18.928 18.646 18.650

In [None]:
# Recompute open / high / low / close by hand from the AXA 'last' values of
# 2019-01-03 (first value, max, min, last value) to cross-check the ohlc frame.
axa_20190103 = compA.loc[compA.symbol == axa_symbol].loc['2019-01-03']['last'].to_numpy()
print("Expected: 18.664, 18.928, 18.646,18.650")
# Fix: the label was misspelled "Acutal".
"Actual: ", axa_20190103[0], axa_20190103.max(), axa_20190103.min(), axa_20190103[-1]


# Let's plot candlesticks (and Bollinger bands)

In [None]:
import plotly.graph_objects as go
import plotly.offline as pyo

In [None]:
def candlestick(symbol: str) -> go.Candlestick:
    """
    Build the candlestick trace of one symbol.

    The symbol's rows are read from the module-level ``ohlc`` frame; its index
    supplies the x axis and its open/high/low/close columns the candles.

    :param symbol: key looked up in ``ohlc``; also used as the trace text and name
    :return: the plotly candlestick trace (not yet attached to a figure)
    """
    daily = ohlc.loc[symbol]
    price_columns = {column: daily[column] for column in ('open', 'high', 'low', 'close')}
    return go.Candlestick(x=daily.index, text=symbol, name=symbol, **price_columns)

In [None]:
symbols = ['1rPEDFPFF'] # Could we also add the company name here, as an optimisation?

In [None]:
# Work on an explicit copy: adding columns to a selection of `ohlc` triggers
# pandas' SettingWithCopyWarning and leaves it unclear which object is modified.
stock = ohlc.loc['1rPEDFPFF'].copy()
# bollinger bands: 20-row rolling mean of the close, +/- 2 rolling standard deviations
stock['MA20'] = stock.close.rolling(window=20).mean()
stock['STD20'] = stock.close.rolling(window=20).std()
stock['upper'] = stock['MA20'] + (stock['STD20'] * 2)
stock['lower'] = stock['MA20'] - (stock['STD20'] * 2)
# Display only the rows that have no exact duplicate (keep=False drops every copy);
# `stock` itself is left unchanged.
stock.drop_duplicates(keep=False)


# Remove symbols ending with NV

In [None]:
# English summary of the (French) notes kept verbatim in the string below:
#   1rPVK     -> name changes VALOUREC -> SRDVALOUREC; from 2022 on it no longer changes
#   1rPEDFPFF -> behaves strangely -> require a minimum number of points on the curve?
#   FF11_FP   -> odd: 3 different kinds of values, but duplicated -> require a minimum number of points?
#   1rPEUCAR  -> big gaps in 2020-2021
"""
    1rPVK  -> Change name VALOUREC -> SRDVALOUREC à partir de 2022 ça change pas
    1rPEDFPFF -> Fais un truc chelou -> condition d'un minimum de pts sur la courbe ?
    FF11_FP -> Bizarre 3 types de valeurs differentes mais dupliquer -> condition d'un minimum de pts sur la courbe ?
    1rPEUCAR -> GROS TROUS en 2020-2021
    FIXME
"""
# Distinct values of the 'symbol' column.
all_symbols = set(compA['symbol'].values)

# Keep the symbols that do NOT end with "NV".
pattern = r'.*NV$'
filtered_symbols = {candidate for candidate in all_symbols if re.match(pattern, candidate) is None}

# Plot several figures

In [None]:
def fill_stock(symbol: str) -> pd.DataFrame :
    """
    Return the OHLC rows of one symbol with Bollinger-band columns added.

    Added columns, all computed from ``close`` over a 20-row rolling window:
    ``MA20`` (mean), ``STD20`` (standard deviation), ``upper`` and ``lower``
    (``MA20`` plus / minus two ``STD20``). The first 19 rows are NaN in these
    columns because the window is not full yet.

    :param symbol: key looked up in the module-level ``ohlc`` frame
    :return: a new DataFrame; ``ohlc`` is not modified
    """
    # Explicit copy: adding columns to a selection of `ohlc` triggers pandas'
    # SettingWithCopyWarning, and callers later mutate the result in place.
    stock = ohlc.loc[symbol].copy()
    # bollinger bands
    stock['MA20'] = stock.close.rolling(window=20).mean()
    stock['STD20'] = stock.close.rolling(window=20).std()
    stock['upper'] = stock['MA20'] + (stock['STD20'] * 2)
    stock['lower'] = stock['MA20'] - (stock['STD20'] * 2)
    return stock

In [None]:
# Last 'name' seen for every symbol, built in a single pass over compA (later
# rows overwrite earlier ones). This replaces a full scan of compA per plotted
# symbol and yields the same value as `...['name'].values[-1]`.
last_name_by_symbol = dict(zip(compA['symbol'].to_numpy(), compA['name'].to_numpy()))

# Sorted so the figures come out in the same order on every run (set order is arbitrary).
for symbol in sorted(filtered_symbols):

    stock = fill_stock(symbol)

    # Drop every row that has an exact duplicate (keep=False removes all copies),
    # then skip symbols left with at most one row.
    stock.drop_duplicates(keep=False, inplace=True)
    if stock.shape[0] <= 1:
        continue
    print(stock.shape[0])
    fig = go.Figure(
        data=[
            candlestick(symbol),
            # go.Line is deprecated in plotly; go.Scatter is the trace type it resolved to.
            go.Scatter(x=stock.index, y=stock['MA20'], name=last_name_by_symbol[symbol]),
            go.Scatter(x=stock.index, y=stock['upper'], name='upper bollinger', fill='tonexty', line_color='lightblue', opacity=0.3),
            go.Scatter(x=stock.index, y=stock['lower'], name='lower bollinger', fill='tonexty', line_color='lightblue', opacity=0.3),
        ]
    )
    fig.show()

In [None]:
# Rows of symbol '1rASGO' with exact duplicate rows removed.
compA.loc[compA['symbol'].eq('1rASGO')].drop_duplicates()