In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
chrome_options.add_argument("--headless") 

# Set up the Chrome WebDriver
service = Service("/opt/homebrew/bin/chromedriver")
driver = webdriver.Chrome(service=service, options=chrome_options)

# Navigate to the specific oddschecker page
url = "https://www.oddschecker.com/politics/us-politics/us-presidential-election/winner"
driver.get(url)

# Wait for the page to load
time.sleep(10)

# Get the page source
page_source = driver.page_source

# Close the browser
driver.quit()

In [None]:
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Find the table or relevant parts
odds_table = soup.find('tbody', id='t1')

# Extract each row and the data within
odds_data = []
bookmakers_set = set()

for row in odds_table.find_all('tr'):
    market_name = row.find('a', class_='popup').text.strip()  # Extract the party name
    odds_dict = {'Market': market_name}
    
    # Find all td elements with odds information
    for td in row.find_all('td', class_=lambda x: x and ('o' in x.split() or 'bs' in x.split())):
        bookmaker = td.get('data-bk')  # Extract the bookmaker name
        decimal_odds = td.get('data-odig')  # Extract the decimal odds value
        if bookmaker and decimal_odds:  # Only add if both are present
            odds_dict[bookmaker] = float(decimal_odds)  # Convert odds to float
            bookmakers_set.add(bookmaker)
    
    odds_data.append(odds_dict)

# Create a DataFrame with all bookmakers as columns
df = pd.DataFrame(odds_data).set_index('Market')

# Ensure all bookmakers are columns, even if some are missing in certain rows
df = df.reindex(columns=sorted(bookmakers_set))

print(df)

In [12]:
# Extract oddschecker politics market urls from sitemap

chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

# Set up the Chrome WebDriver (update the path to where you have your ChromeDriver)
service = Service("/opt/homebrew/bin/chromedriver")  # Update this path
driver = webdriver.Chrome(service=service, options=chrome_options)

# URL of the sitemap
sitemap_url = "https://www.oddschecker.com/sport/politics/sitemap.xml"

try:
    # Load the sitemap page
    driver.get(sitemap_url)
    
    # Wait for the page to fully load (adjust time as needed)
    time.sleep(5)
    
    # Get the page source (XML content)
    xml_content = driver.page_source

    # Parse the XML content with BeautifulSoup
    soup = BeautifulSoup(xml_content, 'xml')

    # Find all <loc> tags which contain the URLs
    url_tags = soup.find_all('loc')

    # Extract URLs and add them to the list
    urls = [url_tag.text for url_tag in url_tags]

    print(f"Found {len(urls)} URLs.")
    print(urls)  # Print the URLs or add further processing

finally:
    # Close the browser
    driver.quit()

Found 17 URLs.
['https://www.oddschecker.com/politics/british-politics', 'https://www.oddschecker.com/politics/british-politics/next-uk-general-election/most-seats', 'https://www.oddschecker.com/politics/british-politics/next-conservative-leader', 'https://www.oddschecker.com/politics/us-politics', 'https://www.oddschecker.com/politics/us-politics/us-presidential-election/winner', 'https://www.oddschecker.com/politics/us-politics/us-state-betting/arizona', 'https://www.oddschecker.com/politics/us-politics/us-senate-elections/arizona', 'https://www.oddschecker.com/politics/us-politics/house-and-senate-elections-overall-control/senate-control', 'https://www.oddschecker.com/politics/european-politics', 'https://www.oddschecker.com/politics/european-politics/irish-politics/next-president', 'https://www.oddschecker.com/politics/european-politics/northern-irish-politics/next-united-ireland-referendum-result', 'https://www.oddschecker.com/politics/european-politics/scottish-politics/independe

In [13]:
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to extract odds data from a given URL
def extract_odds(url, user_agent):
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_argument("--headless")

    # Set up the Chrome WebDriver
    service = Service("/opt/homebrew/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Navigate to the specific oddschecker page
        driver.get(url)

        # Wait for the page to load
        time.sleep(10)  # Adjust if necessary

        # Get the page source
        page_source = driver.page_source

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find the table or relevant parts
        odds_table = soup.find('tbody', id='t1')

        if not odds_table:
            print(f"No odds table found for URL: {url}")
            return None  # Skip this URL if the table isn't found

        # Extract each row and the data within
        odds_data = []
        bookmakers_set = set()

        for row in odds_table.find_all('tr'):
            party_name = row.find('a', class_='popup').text.strip()  # Extract the party name
            odds_dict = {'Party': party_name}
            
            # Find all td elements with odds information
            for td in row.find_all('td', class_=lambda x: x and ('o' in x.split() or 'bs' in x.split())):
                bookmaker = td.get('data-bk')  # Extract the bookmaker name
                decimal_odds = td.get('data-odig')  # Extract the decimal odds value
                if bookmaker and decimal_odds:  # Only add if both are present
                    odds_dict[bookmaker] = float(decimal_odds)  # Convert odds to float
                    bookmakers_set.add(bookmaker)
            
            odds_data.append(odds_dict)

        # Create a DataFrame with all bookmakers as columns
        df = pd.DataFrame(odds_data).set_index('Party')

        # Ensure all bookmakers are columns, even if some are missing in certain rows
        df = df.reindex(columns=sorted(bookmakers_set))

        # Add the URL as a column in the DataFrame
        df['URL'] = url

        return df
    finally:
        # Close the browser
        driver.quit()

# List of URLs to scrape
urls = [
    "https://www.oddschecker.com/politics/us-politics/us-presidential-election/winner",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/arizona",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/georgia",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/michigan",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/pennsylvania",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/wisconsin"
]

# List of user agents to rotate
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
]

# List to store DataFrames
dataframes_list = []

# Use ThreadPoolExecutor to process URLs in parallel in batches of 5
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = []
    for i, url in enumerate(urls):
        user_agent = user_agents[i % len(user_agents)]  # Rotate user agents
        futures.append(executor.submit(extract_odds, url, user_agent))

        # Wait for each batch of 5 to complete before starting the next batch
        if (i + 1) % 5 == 0 or i == len(urls) - 1:
            for future in concurrent.futures.as_completed(futures):
                df = future.result()
                if df is not None:
                    dataframes_list.append(df)
            futures = []  # Clear futures list for the next batch


No odds table found for URL: https://www.oddschecker.com/politics/us-politics
No odds table found for URL: https://www.oddschecker.com/politics/british-politics
No odds table found for URL: https://www.oddschecker.com/politics/european-politics
No odds table found for URL: https://www.oddschecker.com/politics/australian-politics
No odds table found for URL: https://www.oddschecker.com/politics/world-politics


In [14]:
print(dataframes_list)

[                   AKB     B3     BF   BY      CE   DP   EE      FB      FR  \
Party                                                                         
Labour             0.0   1.53   1.73  0.0    1.44  0.0  0.0    1.62    1.44   
Conservatives      0.0   3.00   3.20  0.0    3.75  0.0  0.0    2.88    3.50   
Reform             0.0   9.00   8.25  0.0    9.00  0.0  0.0    7.50    8.00   
Liberal Democrats  0.0  81.00  29.00  0.0   67.00  0.0  0.0  101.00  101.00   
Green              0.0   0.00   0.00  0.0  501.00  0.0  0.0    0.00  501.00   

                    G5  ...      S6   SI   SK   SX   UN      VC   VT   WA  \
Party                   ...                                                 
Labour             0.0  ...    1.44  0.0  0.0  0.0  0.0    1.67  0.0  0.0   
Conservatives      0.0  ...    3.25  0.0  0.0  0.0  0.0    3.00  0.0  0.0   
Reform             0.0  ...    8.00  0.0  0.0  0.0  0.0    7.00  0.0  0.0   
Liberal Democrats  0.0  ...   67.00  0.0  0.0  0.0  0.0   51

In [4]:
from dotenv import load_dotenv
import os
import queue

load_dotenv()

bf_usr = os.getenv("BF_LOGIN")
bf_pass = os.getenv("BF_PASS")
bf_api = os.getenv("BF_API_KEY")
bf_session = os.getenv("BF_SESSION")


In [5]:
import betfairlightweight
from betfairlightweight import filters
from betfairlightweight import APIClient

client = APIClient(bf_usr, bf_pass, app_key=bf_api)
client.session_token = bf_session

market_filter = betfairlightweight.filters.market_filter(
    event_type_ids=['2378961'],  # Politics event type
)

market_catalogues = client.betting.list_market_catalogue(
    filter=market_filter,
    max_results=100
)

# Process the market catalogues
for market in market_catalogues:
    print(market.market_name, market.market_id)

Next Labour Leader 1.170273835
Next Lib Dem Leader 1.179233218
Year Rishi Sunak replaced as Conservative Leader 1.205534173
Next Conservative Leader 1.205526560
Democratic Nominee 1.178163685
Democratic Vice President Nominee 1.190716127
Nominee Forecast 1.216990085
Northern Territory Election 2024 1.223219482
Queensland State Election 2024 1.218937278
Will Joe Biden be impeached before 2024 Election? 1.218257169
Senate Control after 2024 Election 1.225479090
Election Winner 1.176878927
Winning Party 1.178176964
Popular Vote Winner 1.178165812
Party of Popular Vote Winner 1.178176967
Gender of Election Winner 1.178176193
Will Election Winner lose Popular Vote? 1.226054697
Joe Manchin to be re-elected to the senate in 2024 1.213966025
Mississippi 1.230000329
Arizona 1.229996509
Massachusetts 1.230000327
Oklahoma 1.230123427
Pennsylvania 1.230123429
South Dakota 1.230123632
Michigan 1.229999165
Oregon 1.230123428
Tennessee 1.230123636
Minnesota 1.229999638
Hawaii 1.229997182
Alabama 1.22

In [None]:
import threading

# create queue
output_queue = queue.Queue()

# create stream listener
listener = betfairlightweight.StreamListener(output_queue=output_queue)

# create stream
stream = client.streaming.create_stream(listener=listener)

# create filters (GB WIN racing)
market_filter = filters.streaming_market_filter(
    event_type_ids=['2378961']
)
market_data_filter = filters.streaming_market_data_filter(
    fields=["EX_BEST_OFFERS", "EX_MARKET_DEF"], ladder_levels=3
)

# subscribe
streaming_unique_id = stream.subscribe_to_markets(
    market_filter=market_filter,
    market_data_filter=market_data_filter,
    conflate_ms=5000,  # send update every 1000ms
)

# start stream in a new thread (in production would need err handling)
t = threading.Thread(target=stream.start, daemon=True)
t.start()

In [6]:
def process_runner_books(runner_books):
    '''
    This function processes the runner books and returns a DataFrame with the best back/lay prices + vol for each runner
    :param runner_books:
    :return:
    '''
    best_back_prices = [runner_book.ex.available_to_back[0]['price']
        if runner_book.ex.available_to_back
        else 1.01
        for runner_book
        in runner_books]
    best_back_sizes = [runner_book.ex.available_to_back[0]['size']
        if runner_book.ex.available_to_back
        else 1.01
        for runner_book
        in runner_books]

    best_lay_prices = [runner_book.ex.available_to_lay[0]['price']
        if runner_book.ex.available_to_lay
        else 1000.0
        for runner_book
        in runner_books]
    best_lay_sizes = [runner_book.ex.available_to_lay[0]['size']
        if runner_book.ex.available_to_lay
        else 1.01
        for runner_book
        in runner_books]

    selection_ids = [runner_book.selection_id for runner_book in runner_books]
    last_prices_traded = [runner_book.last_price_traded for runner_book in runner_books]
    total_matched = [runner_book.total_matched for runner_book in runner_books]
    statuses = [runner_book.status for runner_book in runner_books]
    scratching_datetimes = [runner_book.removal_date for runner_book in runner_books]
    adjustment_factors = [runner_book.adjustment_factor for runner_book in runner_books]

    df = pd.DataFrame({
        'Selection ID': selection_ids,
        'Best Back Price': best_back_prices,
        'Best Back Size': best_back_sizes,
        'Best Lay Price': best_lay_prices,
        'Best Lay Size': best_lay_sizes,
        'Last Price Traded': last_prices_traded,
        'Total Matched': total_matched,
        'Status': statuses,
        'Removal Date': scratching_datetimes,
        'Adjustment Factor': adjustment_factors
    })
    return df

# Create a price filter. Get all traded and offer data
price_filter = betfairlightweight.filters.price_projection(
    price_data=['EX_BEST_OFFERS']
)

# Request market books
market_books = client.betting.list_market_book(
    market_ids=['1.176878927'],
    price_projection=price_filter
)

# Grab the first market book from the returned list as we only requested one market 
market_book = market_books[0]

runners_df = process_runner_books(market_book.runners)

runners_df

Unnamed: 0,Selection ID,Best Back Price,Best Back Size,Best Lay Price,Best Lay Size,Last Price Traded,Total Matched,Status,Removal Date,Adjustment Factor
0,10874213,1.99,2587.42,2.00,5748.15,1.99,0.0,ACTIVE,,
1,12126964,2.10,3495.50,2.12,3422.30,2.12,0.0,ACTIVE,,
2,45008858,210.00,46.86,220.00,54.90,220.00,0.0,ACTIVE,,
3,6196629,270.00,11.68,310.00,12.71,280.00,0.0,ACTIVE,,
4,53349422,310.00,19.41,350.00,399.72,320.00,0.0,ACTIVE,,
...,...,...,...,...,...,...,...,...,...,...
128,27933522,1000.00,1456.77,1000.00,1.01,1000.00,0.0,ACTIVE,,
129,71182698,1000.00,1696.75,1000.00,1.01,1000.00,0.0,ACTIVE,,
130,71112637,1000.00,1727.28,1000.00,1.01,1000.00,0.0,ACTIVE,,
131,71438581,1000.00,1847.37,1000.00,1.01,1000.00,0.0,ACTIVE,,
