In [7]:
!pip install requests beautifulsoup4 pandas




In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import difflib
import logging

# Configure the root logger once for the whole notebook: show INFO and above,
# and prefix each record with its timestamp and severity level.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")



In [11]:

class DataCollector:
    """Collect raw market listings from each prediction-market source.

    Only Polymarket is actually scraped; the Kalshi and Prediction-Market
    entries are hard-coded sample data. Every source yields a list of
    ``{"name": str, "price": float | None}`` dictionaries.
    """

    # Public listing page that is scraped for market questions.
    POLYMARKET_URL = "https://polymarket.com/markets"
    # CSS class looked up on the listing page.
    # NOTE(review): this looks like a generated styled-components class name
    # and may be stale or carry a hash suffix - confirm against the live page.
    POLYMARKET_CARD_CLASS = "MarketCardstyles__Question-sc"
    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Maximum number of scraped markets to keep.
    MAX_MARKETS = 5

    def run(self):
        """Return a dict mapping each source name to its list of markets.

        A failure to reach Polymarket is logged and results in an empty list
        for that source; it does not abort collection of the other sources.
        """
        data = {
            "polymarket": self._scrape_polymarket(),
            "kalshi": self._mock_kalshi(),
            "prediction-market": self._mock_prediction_market(),
        }
        logging.info("Collected data from %d sources (scraped + mocked).", len(data))
        return data

    def _scrape_polymarket(self):
        """Scrape the first ``MAX_MARKETS`` market questions from Polymarket."""
        try:
            response = requests.get(
                self.POLYMARKET_URL,
                headers={"User-Agent": "Mozilla/5.0"},
                # Without a timeout requests may block forever on a stalled server.
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            logging.warning("Polymarket request failed; continuing without it: %s", exc)
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all("div", class_=self.POLYMARKET_CARD_CLASS)
        if not cards:
            logging.warning("No Polymarket market cards matched the selector.")

        # Prices are rendered client-side via JS, so only the question text is
        # available here; strip it so stray whitespace does not hurt matching.
        return [
            {"name": card.get_text(strip=True), "price": None}
            for card in cards[: self.MAX_MARKETS]
        ]

    def _mock_kalshi(self):
        """Return sample Kalshi markets (mocked: the site often needs a login)."""
        return [
            {"name": "Trump wins 2024", "price": 0.62},
            {"name": "Bitcoin > 100k in 2025", "price": 0.42},
        ]

    def _mock_prediction_market(self):
        """Return sample Prediction-Market markets (mocked)."""
        return [
            {"name": "Trump presidency 2024", "price": 0.64},
        ]


In [12]:
class ProductIdentifier:
    """Merge markets from different sites that describe the same product."""

    @staticmethod
    def _normalize(name):
        """Return ``name`` case-folded with whitespace collapsed, for comparison only."""
        return " ".join(name.casefold().split())

    def run(self, data, threshold=0.75):
        """Group similarly named markets across sites into unified products.

        Args:
            data: Mapping of site name to a list of ``{"name", "price"}`` dicts.
            threshold: Similarity ratio (0-1) a name must strictly exceed to be
                merged into an existing product. Defaults to 0.75.

        Returns:
            A list of dicts with keys ``"name"`` (the first name seen for the
            product), ``"prices"`` (site -> price) and ``"confidence"``.
        """
        unified = []
        # Normalized names, index-aligned with ``unified``, so that case and
        # stray whitespace differences do not prevent a match.
        keys = []

        for site, items in data.items():
            for product in items:
                name, price = product["name"], product["price"]
                key = self._normalize(name)

                # Pick the most similar existing product rather than the first
                # one that happens to clear the threshold.
                best_idx, best_score = None, threshold
                for idx, existing_key in enumerate(keys):
                    score = difflib.SequenceMatcher(None, existing_key, key).ratio()
                    if score > best_score:
                        best_idx, best_score = idx, score

                if best_idx is not None:
                    unified[best_idx]["prices"][site] = price
                    continue

                unified.append({
                    "name": name,
                    "prices": {site: price},
                    # NOTE(review): confidence is a keyword placeholder and is
                    # unrelated to match quality - confirm the intended rule.
                    "confidence": "High" if "Trump" in name else "Medium",
                })
                keys.append(key)

        logging.info("Unified products identified.")
        return unified


In [13]:
class DataOrganizer:
    """Flatten unified products into a table and persist it as CSV."""

    # Columns that are always present, even when there are no products.
    BASE_COLUMNS = ("Product", "Confidence")

    def run(self, unified, csv_path="unified_markets.csv"):
        """Build a DataFrame with one row per product and write it to CSV.

        Args:
            unified: List of dicts with ``"name"``, ``"confidence"`` and
                ``"prices"`` (site -> price) keys.
            csv_path: Destination of the CSV file. Defaults to
                ``"unified_markets.csv"`` in the working directory.

        Returns:
            The DataFrame that was written: ``Product``, ``Confidence`` and one
            column per site; a site with no price for a product is left empty.
        """
        rows = [
            {"Product": u["name"], "Confidence": u["confidence"], **u["prices"]}
            for u in unified
        ]

        if rows:
            df = pd.DataFrame(rows)
        else:
            # Keep a well-formed header so downstream readers of the CSV do not
            # choke on a completely empty file.
            df = pd.DataFrame(columns=list(self.BASE_COLUMNS))

        df.to_csv(csv_path, index=False)
        logging.info("CSV file generated: %s", csv_path)
        return df


In [14]:
# Instantiate the three pipeline stages.
collector, identifier, organizer = DataCollector(), ProductIdentifier(), DataOrganizer()

# Collect -> unify -> tabulate; each stage consumes the previous stage's output.
data = collector.run()              # raw listings keyed by source
unified = identifier.run(data)      # similar names merged across sources
df = organizer.run(unified)         # also writes the CSV file

# Last expression of the cell: rendered by Jupyter as a rich table.
df


2025-08-22 15:26:44,870 - INFO - Collected data from 3 sources (scraped + mocked).
2025-08-22 15:26:44,882 - INFO - Unified products identified.
2025-08-22 15:26:44,888 - INFO - CSV file generated: unified_markets.csv


Unnamed: 0,Product,Confidence,kalshi,prediction-market
0,Trump wins 2024,High,0.62,
1,Bitcoin > 100k in 2025,Medium,0.42,
2,Trump presidency 2024,High,,0.64
