In [115]:
import sys

sys.path.insert(0, '../')

In [122]:
from urllib.parse import urlparse

from utils.parsers import PolymarketAPIClient, PriceHistoryFetcher
from utils.base import DateConverter
from utils.parsers import fetch_google_news_rss, get_real_url_async

from pathlib import Path

import os
import random
import asyncio
import itertools
import json
import logging
from datetime import datetime, timedelta

url = 'https://polymarket.com/event/largest-company-end-of-may'
o = urlparse(url)

slug = o.path.split('/')[2]

event_api = PolymarketAPIClient("https://gamma-api.polymarket.com/events")
price_api = PriceHistoryFetcher(PolymarketAPIClient("https://clob.polymarket.com/prices-history/"))

res = event_api.get_events(params={'slug': slug})

In [123]:
ROOT =  f"../data/inference_data/"
event = res[0]
event_id = event['id']
event_path = f'{ROOT}{event_id}'
Path(f'{event_path}/sentence_embeddings/').mkdir(parents=True, exist_ok=True)


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=f'../logs/{event_id}.log'
)

handler = logging.StreamHandler()
handler.setLevel(logging.WARNING)
logging.getLogger().addHandler(handler)
logger = logging.getLogger(__name__)

# Сбор отдельного ивента
now = datetime.now()
end_ts = now.timestamp()
start_ts = max(
    (now - timedelta(days=14)).timestamp(), 
    DateConverter().iso_or_yy_mm_dd_to_unix(res[0]['markets'][0]['startDate'])
)

for m in event['markets']:
    
    prices = []
    try:
        tid = m['clobTokenIds']
    except KeyError:
        continue
    for market_id in eval(tid):

        price = price_api.fetch_prices_history(
            market_id=market_id,
            start_ts=start_ts,
            end_ts=end_ts,
            fidelity=60
        )

        prices.append(price)
        
    m['prices_history'] = prices


with open(f'{event_path}/event.json', 'w') as f:
    json.dump(event, f)

SSLError: HTTPSConnectionPool(host='clob.polymarket.com', port=443): Max retries exceeded with url: /prices-history/?market=26840969418190051427599175382220987005102672149573610325117060079934763731527&startTs=1746304013.249933&endTs=1747513613.249933&fidelity=60 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import json
import spacy
import logging


with open(f"{event_path}/event.json", "r") as f:
    event = json.load(f)

# Configuration
entities_path = f"{event_path}/entities.json"


# Load NLP models
try:
    spacy_nlp = spacy.load("en_core_web_sm")
except Exception as e:
    logger.error(f"Failed to load spaCy model: {e}")
    exit(1)

try:
    tokenizer = AutoTokenizer.from_pretrained(
        "dslim/bert-base-NER", tokenizer_args={"do_basic_tokenize": False}
    )
    bert_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
    bert_ner_pipeline = pipeline("ner", model=bert_model, tokenizer=tokenizer, aggregation_strategy="first")
except Exception as e:
    logger.error(f"Failed to load BERT NER model: {e}")
    exit(1)

# Dictionary to hold final results
event_entities = {}

# Load SpaCy for stopword detection
nlp = spacy.load("en_core_web_sm")

# Process each event
try:
    if not event:
        logger.warning("Missing 'event' key in event.")

    event_id = event.get("id")
    event_title = event.get("title")
    event_description = event.get("description")

    # Validate description
    if not isinstance(event_description, str):
        logger.warning(f"Invalid description for event ID {event_id}: {type(event_description)}")
        event_description = ""

    # Extract entities using BERT NER
    try:
        bert_entities = list({
            ent["word"].lower() 
            for ent in bert_ner_pipeline(event_description)
            if not nlp(ent["word"].lower())[0].is_stop
            }) if event_description else []
    except Exception as e:
        logger.error(f"BERT NER error for event {event_id}: {e}")
        bert_entities = []

    # Extract tags
    tags = event.get("tags", [])
    tag_labels = list({tag.get("label", "").lower() for tag in tags if tag.get("label")})
    tag_slugs = list({tag.get("slug", "") for tag in tags if tag.get("slug")})

    # Prepare event entity dictionary
    event_entities_dict = {
        "event_title": event_title,
        "event_ents_bert": bert_entities,
        "tag_labels": tag_labels,
        "tag_slugs": tag_slugs,
        "market_ents": {},
    }

    # Process market questions
    m_ents = {}
    markets = event.get("markets", [])
    for market in markets:

        market_id = market.get("id")
        market_question = market.get("question")

        if not isinstance(market_question, str):
            logger.warning(f"Invalid market question for market ID {market_id}")
            market_question = ""


        m_ents[market_id] = market_question

    event_entities_dict["market_ents"] = m_ents
    event_entities[event_id] = event_entities_dict

except Exception as e:
    logger.error(f"Unexpected error processing event data: {e}")

# Save results to file
try:
    json.dump(event_entities, open(entities_path, "w"), indent=2)
    logger.info(f"Results saved to {entities_path}")
except Exception as e:
    logger.error(f"Failed to save results: {e}")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [None]:
ARTICLES_PATH = f'{event_path}/articles.json'
entities_path = f'{event_path}/entities.json'

MAX_SAMPLE_SIZE = 100

def get_extracted_events(path=ARTICLES_PATH):
    urls = os.listdir(path)
    result = [u.split('_')[0] for u in urls]
    return result
    
def fetch_news_for_queries(queries, cutoff_date):
    """Fetch news 
    for a list of queries and a cutoff date."""
    logger.info(f"Fetching news for {len(queries)} queries (cutoff: {cutoff_date})")
    news_data = {}
    for query in queries:
        try:
            res = fetch_google_news_rss(query, cutoff_date=cutoff_date)
            if res:
                news_data.update(res)
        except Exception as e:
            logger.error(f"Error fetching news for query '{query}': {e}")
    return news_data

async def url_extractor(event_id, articles, num_parallel_tasks=10):
    """
    Extract real URLs from articles in parallel using a specified number of tasks.
    
    Args:
        event_id (str): ID of the event.
        articles (list): List of article dicts.
        num_parallel_tasks (int): Number of parallel coroutines to use.
    """
    logger.info(f"Processing {len(articles)} articles for event ID: {event_id}")
    
    # Sample 100 articles if total exceeds 100
    if len(articles) > MAX_SAMPLE_SIZE:
        logger.warning(f"Exceeded {MAX_SAMPLE_SIZE} articles. Sampling {MAX_SAMPLE_SIZE} out of {len(articles)}.")
        articles = random.sample(articles, MAX_SAMPLE_SIZE)
        logger.info(f"Sampled {MAX_SAMPLE_SIZE} articles for event ID: {event_id}")

    # Create chunks based on number of parallel tasks
    chunk_size = max(1, len(articles) // num_parallel_tasks)
    chunks = [
        articles[i:i + chunk_size]
        for i in range(0, len(articles), chunk_size)
    ]

    logger.info(f"Created {len(chunks)} chunks for parallel processing")

    # Run all chunks in parallel using get_real_url_async
    tasks = [get_real_url_async(chunk) for chunk in chunks]
    results = await asyncio.gather(*tasks)
    combined_articles = []
    for result in results:
        combined_articles.extend(result)

    output_path = ARTICLES_PATH
    try:
        with open(output_path, 'w') as f:
            json.dump(combined_articles, f)
        logger.info(f"Saved processed articles to {output_path}")
    except Exception as e:
        logger.error(f"Failed to save articles for event ID {event_id}: {e}")

async def news_parsing(event):
    """Main function to process events and fetch related news articles."""
    event_id = event['id']
    
    with open(entities_path, 'r') as f:
        event_ents = json.load(f)
    

    logger.warning(f"Starting processing for event ID: {event_id}")
    markets = event['markets']

    last_date_str = datetime.now().strftime('%Y-%m-%d')

    tags = event_ents[event_id]['tag_labels']
    tag_combinations = list(itertools.combinations(tags, len(tags)))
    tag_queries = [' '.join(combo) for combo in tag_combinations]

    # Collect all queries from different sources
    queries = []
    queries.extend(event_ents[event_id]['event_ents_bert'])
    queries.extend(tag_queries)
    queries.extend([m.get('question') for m in markets])

    news_data = fetch_news_for_queries(queries, last_date_str)

    articles = [
        {**article, 'query': source}
        for source, articles_list in news_data.items()
        for article in articles_list
    ]

    await url_extractor(event_id, articles, num_parallel_tasks=20)

import nest_asyncio
nest_asyncio.apply()

await news_parsing(event)

Starting processing for event ID: 23935
Starting processing for event ID: 23935
Exceeded 100 articles. Sampling 100 out of 623.
Exceeded 100 articles. Sampling 100 out of 623.


🧭 Navigating to https://news.google.com/rss/articles/CBMipgFBVV95cUxOZzI1dEl4ZUN6TlNrVjRWREtGLTRJRmx4ZWZKbjA1UmExRVpFWTg0bW45YWJOeEFEb1h1dk9yTkE4T05vczV5WXhaNThrS1JDWnJKa0ZvQUwtajA0YTFOZmdwTXB2S3FpcnE2Y0ZwTHNmSk5CcnJOa2xkdnJQM1hDVldCMFdwMTczR1h6Y3NUanR3S0hmWDNza25UWWlQeFkzS01qa19B?oc=5
🧭 Navigating to https://news.google.com/rss/articles/CBMikwFBVV95cUxOc0Nqbi1NSGwxZ2E0QU1WZHdETlRPVXF5aWJpQ1JYY25RMS1GejNGdUVVSURYTTVTX3gyeGZ6WXc2ZGZtaTYweTAyTFhHSGQxQjhJYTJKSjVOQnpJanE1cTZHUVh0eUFqQWdRLUdlZ2tEaHRCMnhxMDI1d0tBdjhTRzF1VlhMUy1Kb1ZGRXdTa0VfQ1E?oc=5
🧭 Navigating to https://news.google.com/rss/articles/CBMiXEFVX3lxTE1WQTBNbHVzaVlPUDFycmo1WGptbGt3bF9YeEp6bGVGdkMyS2FBc2kwaERZbHJsU0YxRHRaa0tPQUViaDVhRzhuSEV4ajVocnRCMHFnSlFVTVN6Vzdk0gFiQVVfeXFMTkdMNHhOM1BYR3gzWV9IQ3h2RUJjVHBzM0c2YWFCX1lqRHZBRjJJbjBLT21Fc0pJZl9RZTl0akxqMWhmdFBJZHhUbE96ZFJzMzZWMk9mNEZQVXc2VWtmZ2xCb2c?oc=5
🧭 Navigating to https://news.google.com/rss/articles/CBMiekFVX3lxTFBOTVZpY0RqbEtrX09DZzY4d3lnN2dKQmF4RV80WTUyX1NrQjZpcm5aMTZMQXhl

In [None]:
import json
import re
import gc
import unicodedata
import requests
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer


# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2')
 
# Constants
EVENTS_FILE_PATH = f'{event_path}/event.json'
OUTPUT_PATH = f'{event_path}/sentence_embeddings/'

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/ ",
}


def load_events(file_path):
    """Load event data from JSON file."""
    try:
        logger.info(f"Loading events from {file_path}")
        with open(file_path, 'r') as f:
            return json.load(f)
    except Exception as e:
        logger.error(f"Error loading events from {file_path}: {e}")
        return []


def filter_url(url_info):
    """Filter URLs based on conditions."""
    url = url_info.get('real_url', '')
    if not url:
        return None

    try:
        conditions_met = (
            url.count('/') > 3 and
            len(re.findall(r'(author|topic|tag|category|youtube)', url)) < 1
        )
        return url_info if conditions_met else None
    except Exception as e:
        logger.warning(f"Error filtering URL {url}: {e}")
        return None


def normalize_unicode_text(text):
    """Normalize Unicode characters in text."""
    try:
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    except Exception as e:
        logger.warning(f"Error normalizing Unicode text: {e}")
        return text


def clean_text(text):
    """Clean and preprocess raw HTML text content."""
    try:
        text = re.sub(r'\s+', ' ', text).strip()
        text = normalize_unicode_text(text)

        text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
        text = re.sub(r'\S+@\S+', '', text)                # Remove emails
        text = re.sub(r'\d+', '', text)                    # Remove digits
        text = text.lower()

        return text
    except Exception as e:
        logger.warning(f"Error cleaning text: {e}")
        return ""


def extract_sentences(text):
    """Extract non-empty sentences from cleaned text."""
    try:
        sentences = clean_text(text).split('.')
        return [sentence.strip() for sentence in sentences if sentence.strip()]
    except Exception as e:
        logger.warning(f"Error extracting sentences: {e}")
        return []


def process_event_articles(event, articles_path):
    """Process all articles for a single event."""
    
    try:
        event_id = event['id']
        


        logger.info(f"Processing event ID: {event_id}")

        file_path = ARTICLES_PATH
        with open(file_path, 'r') as f:
            urls = json.load(f)

        filtered_urls = list(filter(None, [filter_url(url) for url in urls]))
        for url_info in filtered_urls:

            url = url_info.get('real_url', 'Unknown URL')
            article_id = url_info.get('id', 'Unknown ID')

            logger.info(f"Fetching URL: {url}")
            response = requests.get(url, headers=HEADERS)
            if response.status_code != 200:
                logger.warning(f"Failed to fetch {url} (Status {response.status_code})")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            sentences = extract_sentences(soup.text)
            
            del soup, response
            gc.collect()

            if not sentences:
                logger.warning(f"No valid sentences extracted from {url}")
                continue

            embeddings = model.encode(sentences)
            output_path = f'{OUTPUT_PATH}{article_id}.npz'
            logger.info(f"Saving embeddings to {output_path}")

            with open(output_path, 'wb') as out_file:
                np.savez_compressed(out_file, embeddings)
            
            del sentences, embeddings
            gc.collect()
        

    except Exception as e:
        logger.error(f"Error processing event articles for ID {event_id}: {e}", exc_info=True)


event_id = event['id']

process_event_articles(event, ARTICLES_PATH)


Batches: 100%|██████████| 3/3 [00:05<00:00,  1.87s/it]
Error processing event articles for ID 23935: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Traceback (most recent call last):
  File "/Users/ivanesipov/miniconda/envs/diploma_env/lib/python3.11/site-packages/urllib3/connectionpool.py", line 773, in urlopen
    self._prepare_proxy(conn)
  File "/Users/ivanesipov/miniconda/envs/diploma_env/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1042, in _prepare_proxy
    conn.connect()
  File "/Users/ivanesipov/miniconda/envs/diploma_env/lib/python3.11/site-packages/urllib3/connection.py", line 741, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ivanesipov/miniconda/envs/diploma_env/lib/python3.11/site-packages/urllib3/connection.py", line 920, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^

In [None]:
import os
import json
import logging
import numpy as np
from tqdm import tqdm
from itertools import combinations
import networkx as nx
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Local imports
import sys
sys.path.insert(0, '..')
from utils.base import load_events  # Assuming this wraps `json.load`

# --- Paths ---

EVENTS_PATH = f'{event_path}/event.json'
EMBEDDINGS_PATH = f'{event_path}/sentence_embeddings/'
ARTICLES_PATH = f'{event_path}/articles.json'
GRAPH_PATH = f'{event_path}/graph.graphml'

SHARE_THRESHOLD = 0.35

# --- NER Setup ---
try:
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    model = BertForTokenClassification.from_pretrained("dslim/bert-base-NER")
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
    logger.info("NER pipeline loaded successfully.")
except Exception as e:
    logger.error(f"Failed to load NER pipeline: {e}")
    raise


def extract_entities(title):
    """Extract unique named entities from a title using the NER pipeline."""
    try:
        ner_results = ner_pipeline(title)
        return set(entity['word'] for entity in ner_results if entity['entity'].startswith('B-'))
    except Exception as e:
        logger.warning(f"Error extracting entities from title: {e}")
        return set()


def stream_articles(file_path):
    """Stream articles from a JSON file one by one to reduce memory usage."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            articles = json.load(f)
            for article in articles:
                yield article
    except FileNotFoundError:
        logger.warning(f"File not found: {file_path}")
    except json.JSONDecodeError as e:
        logger.warning(f"JSON decode error in {file_path}: {e}")


def build_event_graph(event, articles_file_path):
    """Build a NetworkX graph based on article relationships for a single event."""
    logger.info(f"Building graph for event")

    # Get embedded article IDs related to this event
    try:
        embedded_article_ids = [x.split('.')[0] for x in os.listdir(EMBEDDINGS_PATH)]
        logger.debug(f"Found {len(embedded_article_ids)} embedded article IDs for event")
    except Exception as e:
        logger.error(f"Error reading embeddings directory for event: {e}")
        return None

    valid_articles = []
    for article in stream_articles(articles_file_path):
        aid = article.get('id')
        if aid in embedded_article_ids:
            try:
                article['ner'] = extract_entities(article['title'])
                valid_articles.append(article)
            except Exception as e:
                logger.warning(f"Error processing article {aid}: {e}")

    if not valid_articles:
        logger.warning(f"No valid articles found for event")
        return None

    logger.info(f"Processing {len(valid_articles)} valid articles for event")

    # Build graph
    G = nx.Graph()

    # Add nodes
    for item in valid_articles:
        G.add_node(item['id'], article=item['real_url'])

    # Add edges
    for a, b in combinations(valid_articles, 2):
        id_a, id_b = a['id'], b['id']
        same_date = a['date'] == b['date']
        same_query = a['query'] == b['query']
        same_publisher = a['publisher'] == b['publisher']

        common_entities = a['ner'].intersection(b['ner'])
        common_entities_share = len(common_entities) / max(len(a['ner']) or 1, len(b['ner']) or 1)

        if any([same_date, same_query, same_publisher, common_entities_share > SHARE_THRESHOLD]):
            G.add_edge(
                id_a,
                id_b,
                same_date=same_date,
                same_query=same_query,
                same_publisher=same_publisher,
                common_entities_share=common_entities_share,
            )

    logger.info(f"Built graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges for event")
    return G


def main():
    try:
        events = load_events(EVENTS_PATH)
        logger.info(f"Loaded {len(events)} events from {EVENTS_PATH}")
    except Exception as e:
        logger.error(f"Failed to load events from {EVENTS_PATH}: {e}")
        return


    for i, event in enumerate(events):

        articles_file = ARTICLES_PATH

        if not os.path.exists(articles_file):
            logger.warning(f"Articles file does not exist: {articles_file}")
            continue

        logger.info(f"Starting graph construction for e")
        G = build_event_graph(event, articles_file)

        if G is not None:
            output_file = GRAPH_PATH
            nx.write_graphml(G, output_file)
            logger.info(f"Saved graph for e to {output_file}")
        else:
            logger.warning(f"Graph for e was not saved (None returned)")


if __name__ == '__main__':
    main()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [None]:
# TRAINING_GRAPH_PATH = f'{event_path}/training_graph.pickle'
# TRAINING_GRAPH_PATH

In [None]:
# Inference p.1
import os
import json
import torch
import joblib
import networkx as nx
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from datetime import datetime
from sentence_transformers import SentenceTransformer
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from torch_geometric.utils import dropout
from torch_geometric.utils.convert import from_networkx
import sys

# Project root and path setup
sys.path.insert(0, ROOT)
TRAINING_GRAPH_PATH = f'{event_path}/training_graph.pickle'

def get_similarity_vector(article_emb, question_emb, model):
    """Calculate similarity vector between article and question embeddings."""
    similarities_list = [
        model.similarity(s_emb, question_emb) 
        for s_emb in article_emb['arr_0']
    ]
    
    n = 10
    similarities = torch.cat(similarities_list)
    idx = similarities.argsort(dim=0, descending=True)[:n]
    
    article_similarity = torch.cat([
        torch.quantile(similarities, torch.tensor([i/10.0 for i in range(10)]), 
                      dim=0, keepdim=False).flatten(),
        torch.tensor([similarities.mean(), max(0, similarities[idx].std()), len(similarities > 0.5)]),
    ])
    
    return article_similarity

def clear_attrs(graph):
    """Remove specified attributes from graph."""
    keys = [
        'same_date',
        'common_entities_share',
        'same_query',
        'same_publisher'
    ]
    
    for key in keys:
        del graph[key]
    
    return graph

def generate_graph(market, G, model):
    """Generate graph with node embeddings and labels."""
    # Label
    pos = np.argmax(eval(market['outcomePrices']))
    y = np.where(eval(market['outcomes'])[pos] == 'No', 0, 1)
    
    # Question embedding
    market_question = market['question']
    question_emb = model.encode(market_question)
    
    for node_id in G.nodes:
        article_emb = np.load(f'{EMBEDDINGS_PATH}{node_id}.npz')
        node_vector = get_similarity_vector(article_emb, question_emb, model=model)
        G.nodes[node_id]['embedding'] = node_vector
    
    graph = from_networkx(G)
    graph.y = torch.tensor(y)
    
    return graph

def add_key(graph_list, key):
    """Add specified key to graphs if missing."""
    key = 'same_date'
    for g in graph_list:
        try:
            g[key]
        except KeyError:
            g[key] = torch.zeros(g.edge_index.size()[1])
    
    graph_list_new = [
        Data(
            x=g['embedding'],
            y=g['y'],
            edge_index=g['edge_index'],
            same_date=g[key]
        ) 
        for g in graph_list if g is not None
    ]
    return graph_list_new

# Initialize sentence transformer
sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")

# Process all events

graphs = []

G = nx.read_graphml(GRAPH_PATH)
for market in event['markets']:
    try:
        graphs.append(
            generate_graph(market, G=G, model=sentence_transformer)
        )
    except Exception as e:
        print(f"Error processing market: {e}")
        
graph_data = add_key(graphs, 'same_date')

with open(TRAINING_GRAPH_PATH, 'wb') as f:
    joblib.dump(graph_data, f)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.91it/s]
Batches: 1

In [None]:
with open(TRAINING_GRAPH_PATH, 'rb') as f:
    graph_data = joblib.load(f)

In [None]:
# Inference p.2

import torch
import sys
sys.path.insert(0, '..')
from utils.models.gcn_v1 import GCNGraphClassifier_v1
from utils.models.gcn_v2 import GCNGraphClassifier_v2
from utils.models.baseline import BaselineClassifier


dataloader = DataLoader(graph_data, batch_size=1)

model = GCNGraphClassifier_v2(13, 64)
state_dict = torch.load('../models/GCNGraphClassifier_v2_20250517_164339.pth')
model.load_state_dict(state_dict)

model.eval()
out_list, predictions = [], []
for data in dataloader:
    out = model(data)
    pred = out.max(1)[1]
    predictions.append(pred)
    out_list.append(out)

In [None]:
# torch.cat(out_list).max(0)
torch.cat(out_list).argsort(0)[:, 1]



tensor([34, 10,  4, 36, 13,  1, 35, 32, 15,  9, 26,  0, 22, 11,  8,  7, 25,  6,
        33, 18, 29, 19, 21, 23,  5,  2, 20, 17, 14, 24, 31, 16, 30, 12, 27, 28,
         3])

In [None]:
from tsfresh import extract_features
import pandas as pd


with open('../models/price_predictor.pkl', 'rb') as f:
    price_clf = joblib.load(f)


def series_to_tsfresh_df(history, series_id):
    return pd.DataFrame({
        'id': [series_id for _ in history],
        'time': [point['t'] for point in history],
        'value': [point['p'] for point in history]
    })

X = series_to_tsfresh_df(event['markets'][4]['prices_history'][0]['history'], series_id=4)
X = extract_features(X, column_id='id', column_sort='time', column_value='value')

prediction = price_clf.predict_proba(X)
prediction

Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!
Feature Extraction: 100%|██████████| 1/1 [00:03<00:00,  3.20s/it]


array([[0.27728431, 0.72271569]])

In [None]:
graph_preds = torch.exp(torch.cat(out_list))[:,0]
for i, m in enumerate(event['markets']):
    graph_pred = graph_preds[i]
    try:
        X = series_to_tsfresh_df(m['prices_history'][0]['history'], series_id=0)
        X = extract_features(X, column_id='id', column_sort='time', column_value='value')
        X.replace([np.inf, -np.inf], np.nan, inplace=True)

        price_pred = price_clf.predict_proba(X)[:, 1].item()

        print(f"Market {m['question']}, Graph Prediction: {graph_pred:.4f}, Price Prediction: {price_pred:.4f}")
    except:
        continue

Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!
Feature Extraction: 100%|██████████| 1/1 [00:04<00:00,  4.34s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Sweden finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3284, Price Prediction: 0.8868


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.55s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Austria finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3301, Price Prediction: 0.8409


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.66s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will France finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3251, Price Prediction: 0.9520


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.38s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Israel finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3192, Price Prediction: 0.7710


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Netherlands finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3318, Price Prediction: 0.7227


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.55s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Finland finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3252, Price Prediction: 0.6990


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Estonia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3268, Price Prediction: 0.5716


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Ukraine finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3272, Price Prediction: 0.1267


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Albania finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3275, Price Prediction: 0.5663


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.33s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Belgium finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3291, Price Prediction: 0.2004


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.40s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Czechia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3322, Price Prediction: 0.1267


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.37s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Italy finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3279, Price Prediction: 0.2457


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will San Marino finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3223, Price Prediction: 0.3517


Feature Extraction: 100%|██████████| 1/1 [00:03<00:00,  3.49s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Germany finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3304, Price Prediction: 0.2982


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.81s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Malta finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3241, Price Prediction: 0.3434


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.63s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will United Kingdom finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3295, Price Prediction: 0.2131


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.85s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Norway finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3233, Price Prediction: 0.3405


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.93s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Switzerland finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3244, Price Prediction: 0.4451


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.82s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Australia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3265, Price Prediction: 0.2342


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.88s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Greece finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3257, Price Prediction: 0.2338


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Spain finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3250, Price Prediction: 0.4154


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.54s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Ireland finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3257, Price Prediction: 0.1842


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.64s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Poland finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3281, Price Prediction: 0.3154


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.85s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Cyprus finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3253, Price Prediction: 0.2671


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Lithuania finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3239, Price Prediction: 0.3214


Feature Extraction: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Luxembourg finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3268, Price Prediction: 0.2987


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Latvia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3290, Price Prediction: 0.4302


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Georgia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3221, Price Prediction: 0.2938


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.53s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Azerbaijan finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3216, Price Prediction: 0.1802


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Portugal finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3262, Price Prediction: 0.3260


Feature Extraction: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Iceland finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3226, Price Prediction: 0.3239


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  3.00s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Armenia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3236, Price Prediction: 0.3826


Feature Extraction: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Serbia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3297, Price Prediction: 0.3438


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Denmark finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3268, Price Prediction: 0.3517


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Slovenia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3340, Price Prediction: 0.1307


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]
Dependency not available for matrix_profile, this feature will be disabled!
Dependency not available for matrix_profile, this feature will be disabled!


Market Will Montenegro finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3299, Price Prediction: 0.4570


Feature Extraction: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]

Market Will Croatia finish in the 2025 Eurovision top 5?, Graph Prediction: 0.3318, Price Prediction: 0.2478





In [None]:
m

{'id': '541504',
 'question': 'Will Croatia finish in the 2025 Eurovision top 5?',
 'conditionId': '0x6988235ff80eb52d22a93d5110d5a9012dffa662ec512765ed1771cdd846f0b8',
 'slug': 'will-croatia-finish-in-the-2025-eurovision-top-5',
 'resolutionSource': '',
 'endDate': '2025-05-17T12:00:00Z',
 'startDate': '2025-05-05T20:28:43.839822Z',
 'image': 'https://polymarket-upload.s3.us-east-2.amazonaws.com/flag+croatia.png',
 'icon': 'https://polymarket-upload.s3.us-east-2.amazonaws.com/flag+croatia.png',
 'description': 'This market will resolve to "Yes" if the named country finishes Eurovision 2025 as one of the top 5 highest scoring candidates.\n\nIf at any point it is impossible for the listed candidate to finish as one of the top 5 highest scorers based on the rules of the competition (i.e. they are eliminated), this market may immediately resolve to "No".\n\nIf no winner is announced by July 31, 2025, 11:59 PM ET, this market will resolve to "No".\n\nThe primary resolution source for this 

In [None]:
# for i, m in enumerate(event['markets']):
#     prognosis = eval(m['outcomes'])[predictions[i]]
#     print(f"Market {m['question']}, Prediction: {prognosis}")
    
#     predictions[i]

predictions[i]

tensor([1])