In [1]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import polars as pl

In [2]:
news_embeddings = pl.read_parquet('../artifacts/russian_news_embeddings.parquet')
news_embeddings.head()

text,timestamp,embedding
str,str,list[f64]
"""В 1930-е годы Советский Союз о…","""2020-08-30T00:01:00+03:00""","[1.074591, -1.174109, … -0.41263]"
"""Олимпийская чемпионка по фигур…","""2020-08-31T20:04:00+03:00""","[-0.124826, -0.899925, … -0.113143]"
"""Российский врач-диетолог Римма…","""2020-08-31T20:07:00+03:00""","[0.705892, -1.752971, … -0.415909]"
"""В 2019 году телеканал «Ю» запу…","""2020-08-30T00:04:00+03:00""","[0.944776, -1.135635, … -0.15538]"
"""Актер Михаил Ефремов система…","""2020-08-31T18:27:00+03:00""","[0.129445, -0.584635, … 0.095618]"


In [57]:
from pathlib import Path
from tqdm.notebook import tqdm
import scipy

def convert_russian_timestamps(timestamp):
    month_map = {
        'января': '01',
        'февраля': '02',
        'марта': '03',
        'апреля': '04',
        'мая': '05',
        'июня': '06',
        'июля': '07',
        'августа': '08',
        'сентября': '09',
        'октября': '10',
        'ноября': '11',
        'декабря': '12'
    }
    
    if "+" in timestamp:
        timestamp = timestamp[:-6]

    try:
        time_part, date_part = timestamp.split(',', 1)
        hour_min = time_part.strip()
        
        date_components = date_part.strip().split()
        day = date_components[0].zfill(2)
        month_ru = date_components[1]
        year = date_components[2]
        
        month_num = month_map.get(month_ru.lower())
        if not month_num:
            raise ValueError(f"Unknown month: {month_ru}")
        
        formatted_ts = f"{year}-{month_num}-{day}T{hour_min}:00"
        return formatted_ts

    except Exception as exc:
        pass

    try:
        time_part, date_part = timestamp.split(' ')[:2]
        hour_min = time_part.strip()
        
        date_components = date_part.strip().split('.')
        day = date_components[0].zfill(2)
        month = date_components[1]
        year = date_components[2]
        
        formatted_ts = f"{year}-{month}-{day}T{hour_min}:00"

        return formatted_ts

    except Exception as exc:
        pass
    
    return timestamp

def create_CAR_edges(news_embeddings: pl.DataFrame, path_to_stock_data: str, alpha: float = 0.05) -> pl.DataFrame:
    def calculate_p_value(t: float, df: int) -> float:
        if df < 1:
            return 1.0
        return 2 * scipy.stats.t.sf(abs(t), df)
    ticker_files = list(Path(path_to_stock_data).glob('*.csv'))
    ticker_data = []
    
    if "id" not in news_embeddings.columns:
        news_embeddings.insert_column(0, pl.Series("id", list(range(news_embeddings.shape[0]))))

    for file_path in tqdm(ticker_files):
        if "all_stocks" in str(file_path): 
            continue

        ticker_name = file_path.stem
        try:
            df = pl.read_csv(file_path)
            
            df = df.with_columns([
                pl.col('close').pct_change().alias('price_change'),
                pl.col('volume').pct_change().alias('volume_change')
            ])
            
            df = df.fill_null(0)
            df.insert_column(0, pl.Series("stock_name", [ticker_name] * df.shape[0]))
            ticker_data.append(df)

        except Exception as e:
            print(f"Error processing {ticker_name}: {str(e)}")
            continue
    
    ticker_data = pl.concat(ticker_data, how="vertical")

    ticker_data = ticker_data.with_columns(
        (pl.col("close") / pl.col("open")).log().alias("log_return")
    )

    news_embeddings = news_embeddings.with_columns(
        timestamp=pl.col("timestamp")
            .map_elements(convert_russian_timestamps)
            .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S")
    )

    news_embeddings = news_embeddings.with_columns(
        event_start=pl.col("timestamp").dt.ceil("1h"),
        estimation_start=pl.col("timestamp").dt.ceil("1h") - pl.duration(hours=24),
        estimation_end=pl.col("timestamp").dt.ceil("1h") - pl.duration(hours=1)
    )

    unique_stocks = ticker_data.select(pl.col("stock_name").unique())
    news_with_stocks = news_embeddings.join(unique_stocks, how="cross")

    estimation = news_with_stocks.join(
        ticker_data.select(["stock_name", "begin", "log_return"]),
        on="stock_name"
    ).filter(
        pl.col("begin").is_between(pl.col("estimation_start"), pl.col("estimation_end"))
    ).group_by(["id", "stock_name"]).agg(
        mu=pl.mean("log_return"),
        sigma=pl.std("log_return"),
        count=pl.count()
    ).filter(
        pl.col("count") >= 2
    )

    event = news_with_stocks.join(
        ticker_data.select(["stock_name", "begin", "log_return"]),
        on="stock_name"
    ).filter(
        (pl.col("begin") >= pl.col("event_start")) &
        (pl.col("begin") < pl.col("event_start") + pl.duration(hours=6))
    ).group_by(["id", "stock_name"]).agg(
        event_returns=pl.col("log_return"),
        event_count=pl.count()
    ).filter(
        pl.col("event_count") == 6
    )

    combined = estimation.join(
        event,
        on=["id", "stock_name"],
        how="inner"
    ).with_columns(
        CAR=pl.col("event_returns").list.sum() - 6 * pl.col("mu"),
        t=pl.col("CAR") / (pl.col("sigma") * (6 ** 0.5))
    )

    combined = combined.with_columns(
        p_value=pl.struct(["t_stat", "df"]).map_elements(
            lambda x: calculate_p_value(x["t_stat"], x["df"]),
            return_dtype=pl.Float64
        )
    )
    links = combined.with_columns(
        link=pl.when(
            (pl.col("p_value") < alpha) & (pl.col("CAR") > 0)
        ).then(1).when(
            (pl.col("p_value") < alpha) & (pl.col("CAR") < 0)
        ).then(-1).otherwise(0)
    ).select(
        pl.col("id").alias("news_id"),
        pl.col("stock_name"),
        pl.col("link")
    )

    return links

In [58]:
links = create_CAR_edges(news_embeddings, "../ticker_data")

  0%|          | 0/214 [00:00<?, ?it/s]



InvalidOperationError: conversion from `str` to `datetime[μs]` failed in column 'timestamp' for 15624 out of 21673 values: ["21:06 31.08.2020  (обновлено: 21:13 31.08.2020)", "18:46 31.08.2020  (обновлено: 18:47 31.08.2020)", … "19:22 31.08.2020"]

You might want to try:
- setting `strict=False` to set values that cannot be converted to `null`
- using `str.strptime`, `str.to_date`, or `str.to_datetime` and providing a format string

In [None]:
def create_ticker_embeddings(path: str) -> pl.DataFrame:
    ticker_files = list(Path(path).glob('*.csv'))
    ticker_embeddings = {}
    
    for file_path in tqdm(ticker_files):
        if "all_stocks" in str(file_path): 
            continue

        ticker_name = file_path.stem
        try:
            df = pl.read_csv(file_path)
            
            df = df.with_columns([
                pl.col('close').pct_change().alias('price_change'),
                pl.col('volume').pct_change().alias('volume_change')
            ])
            
            df = df.fill_null(0)
            
            features = [
                df['price_change'].mean(),
                df['price_change'].std(),
                df['price_change'].skew(),
                df['price_change'].kurtosis(),
                df['price_change'].rolling_median(3).mean(),
                df['price_change'].rolling_median(5).mean(),
                df['price_change'].rolling_median(12).mean(),
                df['volume_change'].mean(),
                df['volume_change'].std(),
                df['volume_change'].skew(),
                df['volume_change'].kurtosis(),
                df['volume_change'].median(),
                df['volume_change'].rolling_median(3).mean(),
                df['volume_change'].rolling_median(5).mean(),
                df['volume_change'].rolling_median(12).mean(),
            ]
            
            ticker_embeddings[ticker_name] = np.array(features, dtype=np.float32)
            
        except Exception as e:
            print(f"Error processing {ticker_name}: {str(e)}")
            continue
    
    return ticker_embeddings