In [42]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

import concurrent.futures
from selenium import webdriver

from dotenv import load_dotenv
import os
import queue
import threading

import betfairlightweight
from betfairlightweight import filters
from betfairlightweight import APIClient

In [17]:
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
chrome_options.add_argument("--headless") 

# Set up the Chrome WebDriver
service = Service("/opt/homebrew/bin/chromedriver")
driver = webdriver.Chrome(service=service, options=chrome_options)

# Navigate to the specific oddschecker page
url = "https://www.oddschecker.com/politics/us-politics/us-presidential-election/winner"
driver.get(url)

# Wait for the page to load
time.sleep(10)

# Get the page source
page_source = driver.page_source

# Close the browser
driver.quit()

In [None]:
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Find the table or relevant parts
odds_table = soup.find('tbody', id='t1')

# Extract each row and the data within
odds_data = []
bookmakers_set = set()

for row in odds_table.find_all('tr'):
    market_name = row.find('a', class_='popup').text.strip()  # Extract the party name
    odds_dict = {'Market': market_name}
    
    # Find all td elements with odds information
    for td in row.find_all('td', class_=lambda x: x and ('o' in x.split() or 'bs' in x.split())):
        bookmaker = td.get('data-bk')  # Extract the bookmaker name
        decimal_odds = td.get('data-odig')  # Extract the decimal odds value
        if bookmaker and decimal_odds:  # Only add if both are present
            odds_dict[bookmaker] = float(decimal_odds)  # Convert odds to float
            bookmakers_set.add(bookmaker)
    
    odds_data.append(odds_dict)

# Create a DataFrame with all bookmakers as columns
df = pd.DataFrame(odds_data).set_index('Market')

# Ensure all bookmakers are columns, even if some are missing in certain rows
df = df.reindex(columns=sorted(bookmakers_set))

print(df)

In [43]:
# Extract oddschecker politics market urls from sitemap

chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

# Set up the Chrome WebDriver (update the path to where you have your ChromeDriver)
service = Service("/opt/homebrew/bin/chromedriver")  # Update this path
driver = webdriver.Chrome(service=service, options=chrome_options)

# URL of the sitemap
sitemap_url = "https://www.oddschecker.com/sport/politics/sitemap.xml"

try:
    # Load the sitemap page
    driver.get(sitemap_url)
    
    # Wait for the page to fully load (adjust time as needed)
    time.sleep(5)
    
    # Get the page source (XML content)
    xml_content = driver.page_source

    # Parse the XML content with BeautifulSoup
    soup = BeautifulSoup(xml_content, 'xml')

    # Find all <loc> tags which contain the URLs
    url_tags = soup.find_all('loc')

    # Extract URLs and add them to the list
    urls = [url_tag.text for url_tag in url_tags]

    print(f"Found {len(urls)} URLs.")
    print(urls)  # Print the URLs or add further processing

finally:
    # Close the browser
    driver.quit()

Found 17 URLs.
['https://www.oddschecker.com/politics/british-politics', 'https://www.oddschecker.com/politics/british-politics/next-uk-general-election/most-seats', 'https://www.oddschecker.com/politics/british-politics/next-conservative-leader', 'https://www.oddschecker.com/politics/us-politics', 'https://www.oddschecker.com/politics/us-politics/us-presidential-election/winner', 'https://www.oddschecker.com/politics/us-politics/us-state-betting/arizona', 'https://www.oddschecker.com/politics/us-politics/us-senate-elections/arizona', 'https://www.oddschecker.com/politics/us-politics/house-and-senate-elections-overall-control/senate-control', 'https://www.oddschecker.com/politics/european-politics', 'https://www.oddschecker.com/politics/european-politics/irish-politics/next-president', 'https://www.oddschecker.com/politics/european-politics/northern-irish-politics/next-united-ireland-referendum-result', 'https://www.oddschecker.com/politics/european-politics/scottish-politics/independe

In [None]:
# Function to extract odds data from a given URL
def extract_odds(url, user_agent):
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_argument("--headless")

    # Set up the Chrome WebDriver
    service = Service("/opt/homebrew/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Navigate to the specific oddschecker page
        driver.get(url)

        # Wait for the page to load
        time.sleep(10)  # Adjust if necessary

        # Get the page source
        page_source = driver.page_source

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find the table or relevant parts
        odds_table = soup.find('tbody', id='t1')

        if not odds_table:
            print(f"No odds table found for URL: {url}")
            return None  # Skip this URL if the table isn't found

        # Extract each row and the data within
        odds_data = []
        bookmakers_set = set()

        for row in odds_table.find_all('tr'):
            party_name = row.find('a', class_='popup').text.strip()  # Extract the party name
            odds_dict = {'Party': party_name}
            
            # Find all td elements with odds information
            for td in row.find_all('td', class_=lambda x: x and ('o' in x.split() or 'bs' in x.split())):
                bookmaker = td.get('data-bk')  # Extract the bookmaker name
                decimal_odds = td.get('data-odig')  # Extract the decimal odds value
                if bookmaker and decimal_odds:  # Only add if both are present
                    odds_dict[bookmaker] = float(decimal_odds)  # Convert odds to float
                    bookmakers_set.add(bookmaker)
            
            odds_data.append(odds_dict)

        # Create a DataFrame with all bookmakers as columns
        df = pd.DataFrame(odds_data).set_index('Party')

        # Ensure all bookmakers are columns, even if some are missing in certain rows
        df = df.reindex(columns=sorted(bookmakers_set))

        # Add the URL as a column in the DataFrame
        df['URL'] = url

        return df
    finally:
        # Close the browser
        driver.quit()

# List of URLs to scrape
urls = [
    "https://www.oddschecker.com/politics/british-politics/next-labour-leader",
    "https://www.oddschecker.com/politics/british-politics/next-conservative-leader",
    "https://www.oddschecker.com/politics/australian-politics/state-elections/queensland-state-election",
    "https://www.oddschecker.com/politics/us-politics/us-presidential-election/winner",
    "https://www.oddschecker.com/politics/us-politics/us-presidential-election/winning-party",
    "https://www.oddschecker.com/politics/us-politics/us-presidential-election/party-of-popular-vote-winner",
    "https://www.oddschecker.com/politics/us-politics/us-presidential-election/gender-of-election-winner",
    "https://www.oddschecker.com/politics/us-politics/us-presidential-election/election-winner-to-lose-popular-vote",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/mississippi",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/arizona",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/massachusetts",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/oklahoma",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/pennsylvania",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/oregon",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/minnesota",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/hawaii",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/alabama",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/texas",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/rhode-island",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/florida",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/delaware",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/connecticut",
    "https://www.oddschecker.com/politics/us-politics/us-state-betting/colorado"
]

# List of user agents to rotate
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
]

# List to store DataFrames
dataframes_list = []

# Use ThreadPoolExecutor to process URLs in parallel in batches of 5
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = []
    for i, url in enumerate(urls):
        user_agent = user_agents[i % len(user_agents)]  # Rotate user agents
        futures.append(executor.submit(extract_odds, url, user_agent))

        # Wait for each batch of 5 to complete before starting the next batch
        if (i + 1) % 5 == 0 or i == len(urls) - 1:
            for future in concurrent.futures.as_completed(futures):
                df = future.result()
                if df is not None:
                    dataframes_list.append(df)
            futures = []  # Clear futures list for the next batch


In [None]:
print(dataframes_list)

In [13]:
load_dotenv()

bf_usr = os.getenv("BF_LOGIN")
bf_pass = os.getenv("BF_PASS")
bf_api = os.getenv("BF_API_KEY")
#bf_session = os.getenv("BF_SESSION")
bf_certs_path = '../certs/'


In [38]:
client = APIClient(bf_usr, bf_pass, app_key=bf_api, certs=bf_certs_path)
#client.session_token = bf_session
client.login()

market_filter = betfairlightweight.filters.market_filter(
    event_type_ids=['2378961'],  # Politics event type
)

market_catalogues = client.betting.list_market_catalogue(
    filter=market_filter,
    max_results=100
)

# Process the market catalogues
for market in market_catalogues:
    print(market.market_name, market.market_id)

Next Labour Leader 1.170273835
Next Lib Dem Leader 1.179233218
Year Rishi Sunak replaced as Conservative Leader 1.205534173
Next Conservative Leader 1.205526560
Northern Territory Election 2024 1.223219482
Queensland State Election 2024 1.218937278
Will Joe Biden be impeached before 2024 Election? 1.218257169
Senate Control after 2024 Election 1.225479090
Election Winner 1.176878927
Winning Party 1.178176964
Popular Vote Winner 1.178165812
Party of Popular Vote Winner 1.178176967
Gender of Election Winner 1.178176193
Will Election Winner lose Popular Vote? 1.226054697
Joe Manchin to be re-elected to the senate in 2024 1.213966025
Mississippi 1.230000329
Arizona 1.229996509
Massachusetts 1.230000327
Oklahoma 1.230123427
Pennsylvania 1.230123429
Oregon 1.230123428
Minnesota 1.229999638
Hawaii 1.229997182
Alabama 1.229996495
Texas 1.230123858
Rhode Island 1.230123454
Florida 1.229997015
Delaware 1.229997012
Connecticut 1.229997011
Colorado 1.229997010
California 1.229997007
Utah 1.2301238

In [50]:
def process_runner_books(runner_books):
    best_back_prices = [runner_book.ex.available_to_back[0]['price']
        if runner_book.ex.available_to_back
        else 1.01
        for runner_book
        in runner_books]
    best_back_sizes = [runner_book.ex.available_to_back[0]['size']
        if runner_book.ex.available_to_back
        else 1.01
        for runner_book
        in runner_books]

    best_lay_prices = [runner_book.ex.available_to_lay[0]['price']
        if runner_book.ex.available_to_lay
        else 1000.0
        for runner_book
        in runner_books]
    best_lay_sizes = [runner_book.ex.available_to_lay[0]['size']
        if runner_book.ex.available_to_lay
        else 1.01
        for runner_book
        in runner_books]

    selection_ids = [runner_book.selection_id for runner_book in runner_books]
    last_prices_traded = [runner_book.last_price_traded for runner_book in runner_books]
    total_matched = [runner_book.total_matched for runner_book in runner_books]
    statuses = [runner_book.status for runner_book in runner_books]
    scratching_datetimes = [runner_book.removal_date for runner_book in runner_books]
    adjustment_factors = [runner_book.adjustment_factor for runner_book in runner_books]

    df = pd.DataFrame({
        'Selection ID': selection_ids,
        'Best Back Price': best_back_prices,
        'Best Back Size': best_back_sizes,
        'Best Lay Price': best_lay_prices,
        'Best Lay Size': best_lay_sizes,
        'Last Price Traded': last_prices_traded,
        'Total Matched': total_matched,
        'Status': statuses,
        'Removal Date': scratching_datetimes,
        'Adjustment Factor': adjustment_factors
    })
    return df

# Create a price filter. Get all traded and offer data
price_filter = betfairlightweight.filters.price_projection(
    price_data=['EX_BEST_OFFERS']
)

# Request market books
market_books = client.betting.list_market_book(
    market_ids=['1.176878927'],
    price_projection=price_filter
)

# Grab the first market book from the returned list as we only requested one market 
market_book = market_books[0]

runners_df = process_runner_books(market_book.runners)

runners_df

Unnamed: 0,Selection ID,Best Back Price,Best Back Size,Best Lay Price,Best Lay Size,Last Price Traded,Total Matched,Status,Removal Date,Adjustment Factor
0,10874213,2.06,3243.85,2.08,6126.09,2.06,0.0,ACTIVE,,
1,12126964,2.02,3617.35,2.04,1938.97,2.04,0.0,ACTIVE,,
2,45008858,160.00,23.30,190.00,58.03,170.00,0.0,ACTIVE,,
3,6196629,240.00,28.67,260.00,21.69,260.00,0.0,ACTIVE,,
4,53349422,700.00,11.60,950.00,15.37,840.00,0.0,ACTIVE,,
...,...,...,...,...,...,...,...,...,...,...
128,27933522,1000.00,1926.57,1000.00,1.01,1000.00,0.0,ACTIVE,,
129,71182698,1000.00,2166.42,1000.00,1.01,1000.00,0.0,ACTIVE,,
130,71112637,1000.00,2197.39,1000.00,1.01,1000.00,0.0,ACTIVE,,
131,71438581,1000.00,2317.00,1000.00,1.01,1000.00,0.0,ACTIVE,,


In [49]:
market_filter = betfairlightweight.filters.market_filter(
    event_type_ids=['2378961'],  # Politics event type
)

# Get market catalogues, including runners
market_catalogues = client.betting.list_market_catalogue(
    filter=market_filter,
    max_results=100,  # Adjust this as needed
    market_projection=['RUNNER_DESCRIPTION']  # Include runner descriptions to get selection names
)

# Extract market IDs
market_ids = [market.market_id for market in market_catalogues]

# Define function to process runner books and return DataFrame
def process_runner_books(runner_books):
    best_back_prices = [runner_book.ex.available_to_back[0]['price']
        if runner_book.ex.available_to_back
        else 1.01
        for runner_book
        in runner_books]
    best_back_sizes = [runner_book.ex.available_to_back[0]['size']
        if runner_book.ex.available_to_back
        else 1.01
        for runner_book
        in runner_books]

    best_lay_prices = [runner_book.ex.available_to_lay[0]['price']
        if runner_book.ex.available_to_lay
        else 1000.0
        for runner_book
        in runner_books]
    best_lay_sizes = [runner_book.ex.available_to_lay[0]['size']
        if runner_book.ex.available_to_lay
        else 1.01
        for runner_book
        in runner_books]

    selection_ids = [runner_book.selection_id for runner_book in runner_books]
    last_prices_traded = [runner_book.last_price_traded for runner_book in runner_books]
    total_matched = [runner_book.total_matched for runner_book in runner_books]
    statuses = [runner_book.status for runner_book in runner_books]
    scratching_datetimes = [runner_book.removal_date for runner_book in runner_books]
    adjustment_factors = [runner_book.adjustment_factor for runner_book in runner_books]

    df = pd.DataFrame({
        'Selection ID': selection_ids,
        'Best Back Price': best_back_prices,
        'Best Back Size': best_back_sizes,
        'Best Lay Price': best_lay_prices,
        'Best Lay Size': best_lay_sizes,
        'Last Price Traded': last_prices_traded,
        'Total Matched': total_matched,
        'Status': statuses,
        'Removal Date': scratching_datetimes,
        'Adjustment Factor': adjustment_factors
    })
    return df

# Create a price filter for market data
price_filter = betfairlightweight.filters.price_projection(
    price_data=['EX_BEST_OFFERS']
)

# List to store DataFrames
dataframes_list = []

# Loop through each market ID and fetch market book data
for market_id in market_ids:
    # Request market book for each market ID
    market_books = client.betting.list_market_book(
        market_ids=[market_id],
        price_projection=price_filter
    )
    
    # Ensure that market books were returned
    if market_books:
        # Process the first market book (only one is requested)
        market_book = market_books[0]
        
        # Process runner books and store in DataFrame
        runners_df = process_runner_books(market_book.runners)
        
        # Add the market ID to the DataFrame for reference
        runners_df['Market ID'] = market_id
        
        # Append the DataFrame to the list
        dataframes_list.append(runners_df)

# Optionally, you can concatenate all dataframes into a single dataframe
all_data_df = pd.concat(dataframes_list, ignore_index=True)

# Display or process the combined DataFrame as needed
print(all_data_df)

ValueError: No objects to concatenate

In [32]:
# Extract market IDs and selection information
market_info = []

for market in market_catalogues:
    market_id = market.market_id
    market_name = market.market_name
    for runner in market.runners:
        selection_id = runner.selection_id
        selection_name = runner.runner_name
        market_info.append({
            'Market ID': market_id,
            'Market Name': market_name,
            'Selection ID': selection_id,
            'Selection Name': selection_name
        })

# Convert to DataFrame for easier handling
market_info_df = pd.DataFrame(market_info)

In [33]:
# Merge odds data with selection names using 'Selection ID'
combined_df = pd.merge(runners_df, market_info_df, on='Selection ID', how='left')

print(combined_df)

   Selection ID  Best Back Price  Best Back Size  Best Lay Price  \
0       1111884             1.74          107.00            1.76   
1       1111884             1.74          107.00            1.76   
2       1111889             3.25           11.43            3.75   
3      38700528             8.40           13.03            9.80   
4       5191378            55.00           10.48          270.00   

   Best Lay Size  Last Price Traded  Total Matched  Status Removal Date  \
0         180.25               1.74            0.0  ACTIVE         None   
1         180.25               1.74            0.0  ACTIVE         None   
2          10.00               3.30            0.0  ACTIVE         None   
3          13.00               9.20            0.0  ACTIVE         None   
4          12.80              95.00            0.0  ACTIVE         None   

  Adjustment Factor  Market ID_x  Market ID_y Market Name     Selection Name  
0              None  1.230583324  1.219627935  Most Seats    

In [34]:
print(market_ids)

['1.170273835', '1.179233218', '1.205534173', '1.205526560', '1.223219482', '1.218937278', '1.218257169', '1.225479090', '1.176878927', '1.178176964', '1.178165812', '1.178176967', '1.178176193', '1.226054697', '1.213966025', '1.230000329', '1.229996509', '1.230000327', '1.230123427', '1.230123429', '1.230123428', '1.229999638', '1.229997182', '1.229996495', '1.230123858', '1.230123454', '1.229997015', '1.229997012', '1.229997011', '1.229997010', '1.229997007', '1.230123883', '1.229997003', '1.230123893', '1.230123892', '1.230123895', '1.230123894', '1.230123898', '1.229997534', '1.230123899', '1.230123632', '1.229999165', '1.230123636', '1.230123511', '1.229997102', '1.229997511', '1.229975960', '1.229997509', '1.229997508', '1.229997507', '1.230000097', '1.230123388', '1.230000502', '1.229997224', '1.230000498', '1.230123393', '1.229997223', '1.230123391', '1.229997222', '1.230123395', '1.230123396', '1.230000342', '1.230000473', '1.230000335', '1.230000332', '1.223963569', '1.166577

In [52]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Extract market names from Betfair data
betfair_market_names = list(set(market_info_df['Market Name'].tolist()))

# Combine all market names for vectorization
all_market_names = betfair_market_names + urls

# Vectorize the market names using TF-IDF
vectorizer = TfidfVectorizer().fit_transform(all_market_names)
vectors = vectorizer.toarray()

# Calculate cosine similarity between Betfair and Oddschecker markets
cosine_sim_matrix = cosine_similarity(vectors[:len(betfair_market_names)], vectors[len(betfair_market_names):])

# Find the best matches for each Betfair market
matches = []
for i, betfair_name in enumerate(betfair_market_names):
    similarity_scores = cosine_sim_matrix[i]
    best_match_idx = np.argmax(similarity_scores)
    best_match_score = similarity_scores[best_match_idx]
    best_match_name = urls[best_match_idx]
    matches.append({
        'Betfair Market Name': betfair_name,
        'Oddschecker Market Name': best_match_name,
        'Similarity Score': best_match_score
    })

# Convert matches to DataFrame for easier review
matches_df = pd.DataFrame(matches)

# Display or save the matches DataFrame
print(matches_df)

# You can now use this matches_df to manually review or further process the matched data

   Betfair Market Name                            Oddschecker Market Name  \
0        West Virginia  https://www.oddschecker.com/politics/british-p...   
1             Illinois  https://www.oddschecker.com/politics/british-p...   
2              Georgia  https://www.oddschecker.com/politics/british-p...   
3      Election Winner  https://www.oddschecker.com/politics/us-politi...   
4                 Ohio  https://www.oddschecker.com/politics/british-p...   
..                 ...                                                ...   
70            Michigan  https://www.oddschecker.com/politics/british-p...   
71       Winning Party  https://www.oddschecker.com/politics/us-politi...   
72                Utah  https://www.oddschecker.com/politics/british-p...   
73             Wyoming  https://www.oddschecker.com/politics/british-p...   
74        Pennsylvania  https://www.oddschecker.com/politics/us-politi...   

    Similarity Score  
0           0.000000  
1           0.000000  
2     

In [55]:
# Filter out rows with a similarity score of 0
matches_df_filtered = matches_df[matches_df['Similarity Score'] > 0]

# Sort the DataFrame by similarity score in descending order
matches_df_sorted = matches_df_filtered.sort_values(by='Similarity Score', ascending=False)

# Display the sorted DataFrame
print(matches_df_sorted)

                                  Betfair Market Name  \
9                           Gender of Election Winner   
30                       Party of Popular Vote Winner   
33                                 Next Labour Leader   
61                           Next Conservative Leader   
66            Will Election Winner lose Popular Vote?   
65                                       Rhode Island   
71                                      Winning Party   
37                                Popular Vote Winner   
52                     Queensland State Election 2024   
3                                     Election Winner   
50                                        Mississippi   
49                                            Arizona   
43                                           Colorado   
39                                        Connecticut   
36                                             Hawaii   
57                                           Delaware   
53                             