In [None]:
import pandas as pd
from datetime import timedelta
import numpy as np

In [None]:
# Locations of the two input workbooks.
path_df_tickers = ".../2 NER/data/parsed_and_extracted_tickers.xlsx"
path_df_candles = "data/sechist_to_Arthur.xlsx"


def _read_sheet_without_first_column(path):
    """Load an Excel workbook and return every column except the first one."""
    sheet = pd.read_excel(path)
    return sheet.iloc[:, 1:]


# Telegram chat messages in which specific tickers were mentioned and identified.
df_tickers = _read_sheet_without_first_column(path_df_tickers)
# Time series of candle data for the tickers.
df_candles = _read_sheet_without_first_column(path_df_candles)

In [None]:
# Keep only candles of tickers that were mentioned on Telegram and whose
# list level is greater than 1 (i.e. level 2 and numerically higher).
considered_tickers = df_tickers['security_id'].unique()

# Vectorised membership test / comparison instead of a Python lambda per row.
filter_1 = df_candles['securityid'].isin(considered_tickers)
filter_2 = df_candles['listlevel'] > 1

# .copy() detaches the result from the loaded frame, so columns can be added
# to it afterwards without pandas raising SettingWithCopyWarning.
df_candles = df_candles[filter_1 & filter_2].copy()

In [None]:
# For each ticker, normalise the current high and low prices by the closing
# price of the previous row of the same ticker.
# The previous close is computed once, on a view sorted by trade date, so the
# result no longer depends on the row order of the input file. The stable sort
# keeps the existing order among equal dates. pandas aligns the shifted values
# back to df_candles by index label (the index is expected to be unique).
prev_close = (
    df_candles
    .sort_values('tradedate', kind='stable')
    .groupby('securityid')['closeprice']
    .shift(1)
)

# assign() builds a new frame, so no SettingWithCopyWarning is raised even when
# df_candles is a filtered slice of the loaded data.
df_candles = df_candles.assign(
    deviation_high=df_candles['highprice'] / prev_close,
    deviation_low=df_candles['lowprice'] / prev_close,
)

In [None]:
# Discard every row that still contains a missing value, then attach to each
# row its ticker's 90th-percentile high deviation and 10th-percentile low deviation.
df_candles = df_candles.dropna()

by_ticker = df_candles.groupby(['securityid'])
upper_threshold = by_ticker['deviation_high'].transform(lambda s: s.quantile(0.9))
lower_threshold = by_ticker['deviation_low'].transform(lambda s: s.quantile(0.1))

df_candles['quantile_high'] = upper_threshold
df_candles['quantile_low'] = lower_threshold

In [None]:
# A candle counts as an anomaly when its high deviation lies above the ticker's
# 90th percentile or its low deviation lies below the ticker's 10th percentile.
breaks_upper = df_candles['deviation_high'].gt(df_candles['quantile_high'])
breaks_lower = df_candles['deviation_low'].lt(df_candles['quantile_low'])
filter_ = breaks_upper | breaks_lower

df_filtered = df_candles.loc[filter_].reset_index(drop=True)
df_filtered

Unnamed: 0,boardid,securityid,tradedate,openprice,highprice,lowprice,closeprice,listlevel,deviation_high,deviation_low,quantile_high,quantile_low
0,TQBR,ABIO,2024-01-04,104.100,110.980,103.500,109.440,2.0,1.074763,1.002324,1.050060,0.961169
1,TQBR,ABIO,2024-01-15,109.880,115.320,109.120,111.500,2.0,1.057206,1.000367,1.050060,0.961169
2,TQBR,ABIO,2024-01-25,111.820,113.800,106.300,107.240,2.0,1.017707,0.950635,1.050060,0.961169
3,TQBR,ABIO,2024-02-21,105.000,105.140,99.160,102.600,2.0,0.994138,0.937595,1.050060,0.961169
4,TQBR,ABIO,2024-03-20,102.160,115.000,102.160,109.800,2.0,1.125685,1.000000,1.050060,0.961169
...,...,...,...,...,...,...,...,...,...,...,...,...
2015,TQBR,KLVZ,2024-07-04,6.878,6.930,6.520,6.726,3.0,1.007560,0.947950,1.045459,0.953315
2016,TQBR,KLVZ,2024-07-05,6.720,6.740,6.400,6.506,3.0,1.002081,0.951531,1.045459,0.953315
2017,TQBR,KLVZ,2024-07-11,6.048,6.340,6.000,6.310,3.0,1.048280,0.992063,1.045459,0.953315
2018,TQBR,KLVZ,2024-07-19,6.130,6.628,6.118,6.300,3.0,1.083715,1.000327,1.045459,0.953315


In [None]:
# Count how often each ticker was mentioned per chat and day, most frequent
# first, and keep only the combinations with more than 5 mentions.
mention_counts = (
    df_tickers
    .groupby(['chat', 'date', 'security_id'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# The mask is named explicitly; calling it `filter` would shadow the Python
# builtin for every cell that runs afterwards.
frequently_mentioned = mention_counts['count'] > 5

df_anomalys = mention_counts[frequently_mentioned].reset_index(drop=True)
df_anomalys.head(10)

Unnamed: 0,chat,date,security_id,count
0,BigBro_moex,2024-07-18,SNGS,65
1,BigBro_moex,2024-06-20,TATN,22
2,BigBro_moex,2024-05-15,SNGS,22
3,BigBro_moex,2024-05-31,MGNT,20
4,BigBro_moex,2024-06-13,MOEX,16
5,BigBro_moex,2024-07-17,SNGS,15
6,BigBro_moex,2024-05-21,MGNT,12
7,BigBro_moex,2024-06-03,MGNT,11
8,BigBro_moex,2024-07-09,SBER,10
9,BigBro_moex,2024-05-13,SNGS,10


In [None]:
# Pair every frequently discussed (chat, day, ticker) with the anomalous candle
# of the following calendar day, to gauge the effect of the messages one day later.
# NOTE(review): the shift is a plain calendar day and the join is an inner one,
# so a discussion whose next day has no row in df_filtered (for instance when
# no candle exists for that date) drops out silently. Confirm this is intended.
df_anomalys = (
    df_anomalys
    .rename(columns={'security_id': 'securityid', 'date': 'date_of_discussion'})
    # Vectorised date arithmetic instead of a per-element lambda.
    .assign(tradedate=lambda frame: frame['date_of_discussion'] + timedelta(days=1))
)

merged_data = df_anomalys.merge(df_filtered, on=["tradedate", "securityid"])
merged_data

Unnamed: 0,chat,date_of_discussion,securityid,count,tradedate,boardid,openprice,highprice,lowprice,closeprice,listlevel,deviation_high,deviation_low,quantile_high,quantile_low
0,birzhevikstocksofficial2,2024-06-17,UNAC,4,2024-06-18,TQBR,0.9985,1.0015,0.9315,0.9505,3.0,1.003005,0.932899,1.054073,0.944349
1,BigBro_moex,2024-06-19,SNGS,3,2024-06-20,TQBR,27.0,28.385,26.86,28.33,2.0,1.049741,0.993343,1.031627,0.966223
2,BigBro_moex,2024-07-15,MGNT,3,2024-07-16,TQBR,5895.0,5942.5,5616.0,5912.0,3.0,1.008913,0.95348,1.023422,0.973725
3,BigBro_moex,2024-07-25,SIBN,3,2024-07-26,TQBR,705.3,709.2,684.0,684.4,3.0,1.005458,0.969731,1.023951,0.971971
4,birzhevikstocksofficial2,2024-05-16,ROSB,3,2024-05-17,TQBR,122.2,123.8,116.4,120.2,3.0,1.009788,0.949429,1.042891,0.962577
5,smartlabnews,2024-05-16,ROSB,2,2024-05-17,TQBR,122.2,123.8,116.4,120.2,3.0,1.009788,0.949429,1.042891,0.962577
6,kuzmlab,2024-05-13,SPBE,2,2024-05-14,TQBR,108.2,113.8,108.0,112.7,3.0,1.074599,1.01983,1.06614,0.945617
7,araketa,2024-07-23,NKHP,2,2024-07-24,TQBR,890.0,949.0,852.5,853.0,3.0,1.069899,0.961105,1.064705,0.955743
8,BigBro_moex,2024-06-04,SIBN,2,2024-06-05,TQBR,675.0,699.85,670.0,690.9,3.0,1.038122,0.993844,1.023951,0.971971
9,birzhevikstocksofficial2,2024-06-18,RUSI,2,2024-06-19,TQBR,104.7,104.8,99.0,99.3,3.0,1.000955,0.945559,1.060904,0.952923


In [None]:
# Mark deviations where anomalies occurred.
def _highlight_anomalous_deviation(row):
    """Return one CSS string per cell of ``row`` for ``Styler.apply(axis=1)``.

    The ``deviation_high`` cell is shaded when it exceeds ``quantile_high`` and
    the ``deviation_low`` cell is shaded when it is below ``quantile_low``.
    All other cells get an empty style. Columns are addressed by name, so the
    highlighting stays correct if the column order of the frame changes.
    """
    above_upper = row['deviation_high'] > row['quantile_high']
    below_lower = row['deviation_low'] < row['quantile_low']

    styles = []
    for column in row.index:
        hit = (column == 'deviation_high' and above_upper) or (
            column == 'deviation_low' and below_lower
        )
        styles.append("background: lightgreen" if hit else "")
    return styles


merged_data.style.apply(_highlight_anomalous_deviation, axis=1)

Unnamed: 0,chat,date_of_discussion,securityid,count,tradedate,boardid,openprice,highprice,lowprice,closeprice,listlevel,deviation_high,deviation_low,quantile_high,quantile_low
0,birzhevikstocksofficial2,2024-06-17 00:00:00,UNAC,4,2024-06-18 00:00:00,TQBR,0.9985,1.0015,0.9315,0.9505,3.0,1.003005,0.932899,1.054073,0.944349
1,BigBro_moex,2024-06-19 00:00:00,SNGS,3,2024-06-20 00:00:00,TQBR,27.0,28.385,26.86,28.33,2.0,1.049741,0.993343,1.031627,0.966223
2,BigBro_moex,2024-07-15 00:00:00,MGNT,3,2024-07-16 00:00:00,TQBR,5895.0,5942.5,5616.0,5912.0,3.0,1.008913,0.95348,1.023422,0.973725
3,BigBro_moex,2024-07-25 00:00:00,SIBN,3,2024-07-26 00:00:00,TQBR,705.3,709.2,684.0,684.4,3.0,1.005458,0.969731,1.023951,0.971971
4,birzhevikstocksofficial2,2024-05-16 00:00:00,ROSB,3,2024-05-17 00:00:00,TQBR,122.2,123.8,116.4,120.2,3.0,1.009788,0.949429,1.042891,0.962577
5,smartlabnews,2024-05-16 00:00:00,ROSB,2,2024-05-17 00:00:00,TQBR,122.2,123.8,116.4,120.2,3.0,1.009788,0.949429,1.042891,0.962577
6,kuzmlab,2024-05-13 00:00:00,SPBE,2,2024-05-14 00:00:00,TQBR,108.2,113.8,108.0,112.7,3.0,1.074599,1.01983,1.06614,0.945617
7,araketa,2024-07-23 00:00:00,NKHP,2,2024-07-24 00:00:00,TQBR,890.0,949.0,852.5,853.0,3.0,1.069899,0.961105,1.064705,0.955743
8,BigBro_moex,2024-06-04 00:00:00,SIBN,2,2024-06-05 00:00:00,TQBR,675.0,699.85,670.0,690.9,3.0,1.038122,0.993844,1.023951,0.971971
9,birzhevikstocksofficial2,2024-06-18 00:00:00,RUSI,2,2024-06-19 00:00:00,TQBR,104.7,104.8,99.0,99.3,3.0,1.000955,0.945559,1.060904,0.952923


In [None]:
# Find the messages that mention a security on a discussion day that was
# matched to an anomaly, i.e. whose (date, security_id) pair occurs in merged_data.
merged_data = merged_data.loc[:, ['date_of_discussion', 'securityid']]

# One vectorised pair lookup over all messages replaces a Python loop that
# rebuilt a full-length mask (and printed its value counts) for every pair.
# Pairs that occur several times (same ticker and day in more than one chat)
# are collapsed first.
anomaly_pairs = pd.MultiIndex.from_frame(merged_data.drop_duplicates())
message_pairs = pd.MultiIndex.from_frame(df_tickers[['date', 'security_id']])

# Always a boolean Series aligned with df_tickers, also when merged_data is empty.
filter_final = pd.Series(message_pairs.isin(anomaly_pairs), index=df_tickers.index)

# A single summary of how many messages were selected.
filter_final.value_counts()

False    4371
True        4
Name: count, dtype: int64
False    4372
True        3
Name: count, dtype: int64
False    4370
True        5
Name: count, dtype: int64
False    4371
True        4
Name: count, dtype: int64
False    4370
True        5
Name: count, dtype: int64
False    4370
True        5
Name: count, dtype: int64
False    4372
True        3
Name: count, dtype: int64
False    4372
True        3
Name: count, dtype: int64
False    4373
True        2
Name: count, dtype: int64
False    4373
True        2
Name: count, dtype: int64
False    4373
True        2
Name: count, dtype: int64
False    4371
True        4
Name: count, dtype: int64
False    4372
True        3
Name: count, dtype: int64
False    4373
True        2
Name: count, dtype: int64
False    4373
True        2
Name: count, dtype: int64


In [None]:
# Messages posted on the day before an anomaly. The export is currently
# disabled; uncomment the next line to write the selection to disk.
# df_tickers[filter_final].to_excel('data/messages_day_before_anomalys.xlsx')
df_tickers.loc[filter_final].reset_index(drop=True)

Unnamed: 0,chat,date,time,text,text_id,security_id,matching_score
0,moexwolf,2024-05-13,14:05:26,Соболезную многим Евроклировским \n\nАкции Yan...,-1001511455705_12979,SPBE,0.888889
1,kuzmlab,2024-05-13,14:07:13,Спб биржа поэтому растет на 3%? Типа хоть что-...,-1001241408755_7613,SPBE,0.888889
2,kuzmlab,2024-05-13,12:29:12,"Ясно, а то думал торгануть эппл, теслу и Гонко...",-1001241408755_7610,SPBE,1.0
3,birzhevikstocksofficial2,2024-05-16,14:23:57,💬 Мнение: до чего ж непристойно отрастёт прибы...,-1001313313883_21582,ROSB,1.0
4,birzhevikstocksofficial2,2024-05-16,14:23:57,💬 Мнение: до чего ж непристойно отрастёт прибы...,-1001313313883_21582,ROSB,0.933333
5,birzhevikstocksofficial2,2024-05-16,14:23:57,💬 Мнение: до чего ж непристойно отрастёт прибы...,-1001313313883_21582,ROSB,1.0
6,smartlabnews,2024-05-16,18:53:29,Отчётности управляют котировками🔥Акции и инвес...,-1001063908560_12379,ROSB,1.0
7,smartlabnews,2024-05-16,18:53:29,Отчётности управляют котировками🔥Акции и инвес...,-1001063908560_12379,ROSB,0.933333
8,BigBro_moex,2024-05-28,13:22:27,"SIBN #SIBN 720,6₽\n""Газпром нефть"" начала пром...",-1001666192661_55122,SIBN,1.0
9,BigBro_moex,2024-05-28,12:45:59,"SIBN #SIBN 716,1₽\n""ГАЗПРОМ НЕФТЬ"" НАЧАЛА ПРОМ...",-1001666192661_55120,SIBN,1.0


In [None]:
# Example: print the full text of the selected message that gets label 12
# once the selection is renumbered from zero.
selected_texts = df_tickers.loc[filter_final, 'text'].reset_index(drop=True)
print(selected_texts[12])

MGNT    Магнит ао 🐳
🔴Аномальный объем!
62М₽ (7 935 лот.)
🔸Изм. цены -0,90% ⚡⚡ Цена 7 749,5₽
Покупки 19% Продажи 81%

Итого за день: 
цена -1,6% ATR 82%
объем 335М₽, покупки 36%
кол-во сигн. пок. 0 прод. 1
30.05.24  11:42:37  #MGNT
Отправлено с задержкой.
Подключай полную версию
@BigBro_robot
