In [3]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Load the four scraped price feeds, one DataFrame per source.
# The paths are relative to the notebook's working directory.
polymarket, betfair, oddschecker, predictit = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../output/polymarket_data.csv",
        "../output/betfair_data.csv",
        "../output/oddschecker_data.csv",
        "../output/predictit_data.csv",
    )
)

Matplotlib is building the font cache; this may take a moment.


#### Process datasets

In [6]:
# Build one long-format table of quotes with the columns
# (timestamp, value, market, source). `value` is what the arbitrage step later
# inverts into an implied probability, so the two probability-priced feeds
# (polymarket, predictit) are converted with 1 / price here.

# --- Oddschecker ------------------------------------------------------------
# Keep only the two candidates. `.copy()` makes the filtered frame independent,
# so assigning columns to it (here, or on a re-run of this cell) does not raise
# SettingWithCopyWarning or write to a temporary.
oddschecker = oddschecker[oddschecker['bet_name'].isin(['Donald Trump', 'Kamala Harris'])].copy()
oddschecker['timestamp'] = pd.to_datetime(oddschecker['timestamp'])

# Melt the DataFrame to long format: every column other than the two id
# columns becomes a (name, value) row.
oddschecker_melted = oddschecker.melt(
    id_vars=['bet_name', 'timestamp'],
    var_name='name',
    value_name='value'
)

# Filter for specific bookmakers (again copied before new columns are added)
oddschecker_melted = oddschecker_melted[oddschecker_melted['name'].isin(['B3', 'SK', 'BF'])].copy()

# Add 'market' and 'source' columns
oddschecker_melted['market'] = oddschecker_melted['bet_name']
oddschecker_melted['source'] = oddschecker_melted['name']

# Select relevant columns
prices = oddschecker_melted[['timestamp', 'value', 'market', 'source']]

# --- Betfair ----------------------------------------------------------------
betfair['timestamp'] = pd.to_datetime(betfair['timestamp'])
betfair['market'] = betfair['bet_name']
betfair['source'] = 'betfairx'

# Select and rename columns: the back price is used as the quote value
betfair_prices = betfair[['timestamp', 'back_price', 'market', 'source']].rename(columns={'back_price': 'value'})

# Append to prices DataFrame
prices = pd.concat([prices, betfair_prices], ignore_index=True)

# --- Polymarket -------------------------------------------------------------
polymarket['timestamp'] = pd.to_datetime(polymarket['timestamp'])

# Map 'bet_id' to 'market' names.
# NOTE(review): every bet_id other than 253591 is labelled 'Kamala Harris';
# confirm the input file contains only these two contracts.
polymarket['market'] = polymarket['bet_id'].eq(253591).map(
    {True: 'Donald Trump', False: 'Kamala Harris'}
)

polymarket['source'] = 'polymarket'
# A yes_price of 0 yields an infinite value here.
polymarket['value'] = 1 / polymarket['yes_price']

# Select relevant columns
polymarket_prices = polymarket[['timestamp', 'value', 'market', 'source']]

# Append to prices DataFrame
prices = pd.concat([prices, polymarket_prices], ignore_index=True)

# --- PredictIt --------------------------------------------------------------
predictit['timestamp'] = pd.to_datetime(predictit['timestamp'])
predictit['market'] = predictit['bet_name']
predictit['source'] = 'predictit'
# A buy_yes_price of 0 yields an infinite value here.
predictit['value'] = 1 / predictit['buy_yes_price']

# Select relevant columns (appended to `prices` in the next cell)
predictit_prices = predictit[['timestamp', 'value', 'market', 'source']]

In [7]:
# Append predictit quotes to the prices DataFrame.
# Any predictit rows already present are dropped first, so re-running this
# cell does not add the same rows a second time.
prices = pd.concat(
    [prices[prices['source'] != 'predictit'], predictit_prices],
    ignore_index=True,  # already produces a fresh 0..n-1 index
)

# Display the resulting DataFrame
print(prices)

                             timestamp      value         market     source
0     2024-10-30 19:01:37.317865+00:00        1.5   Donald Trump         B3
1     2024-10-30 19:01:37.317865+00:00        2.7  Kamala Harris         B3
2     2024-10-30 19:02:51.730743+00:00        1.5   Donald Trump         B3
3     2024-10-30 19:02:51.730743+00:00        2.7  Kamala Harris         B3
4     2024-10-30 19:04:04.982139+00:00        1.5   Donald Trump         B3
...                                ...        ...            ...        ...
59441 2024-11-06 09:37:30.261213+00:00  33.333333  Kamala Harris  predictit
59442 2024-11-06 09:38:30.611898+00:00   1.020408   Donald Trump  predictit
59443 2024-11-06 09:38:30.611898+00:00  33.333333  Kamala Harris  predictit
59444 2024-11-06 09:39:30.967674+00:00   1.020408   Donald Trump  predictit
59445 2024-11-06 09:39:30.967674+00:00  33.333333  Kamala Harris  predictit

[59446 rows x 4 columns]


#### Arbitrages

In [10]:
def calculate_arbitrage(prices_df, freq=None):
    """Find timestamps at which the best available odds sum to an overround below 1.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Long-format quotes with at least the columns ``timestamp``, ``market``
        and ``value``. ``value`` is inverted to obtain an implied probability.
        Entries that cannot be parsed as numbers are ignored. The frame passed
        in is left unmodified.
    freq : str, optional
        Pandas offset alias such as ``"1min"``. When given, timestamps are
        floored to that bucket before grouping, so quotes recorded at slightly
        different instants are compared with each other. With the default
        ``None`` only quotes sharing the exact same timestamp are compared.
        Bucketing takes the highest value seen in the bucket, which may pair
        quotes that were never available simultaneously.

    Returns
    -------
    pandas.DataFrame
        Indexed by timestamp, with one implied-probability column per market
        plus ``overround`` (sum of the implied probabilities) and
        ``profit_margin`` (``(1 - overround) * 100``). Only rows whose
        overround is below 1 are returned.
    """
    # `assign` returns a new frame, so the caller's 'value' column keeps its
    # original contents and dtype.
    quotes = prices_df.assign(value=pd.to_numeric(prices_df['value'], errors='coerce'))
    quotes = quotes.dropna(subset=['value'])

    # Infinite or non-positive values (e.g. produced by 1 / 0 prices) would give
    # zero or negative implied probabilities and thus spurious arbitrage rows.
    usable = (quotes['value'] > 0) & (quotes['value'] != float('inf'))
    quotes = quotes[usable].copy()

    if freq is not None:
        quotes['timestamp'] = pd.to_datetime(quotes['timestamp']).dt.floor(freq)

    # Get best odds from any source for each market at each timestamp
    best_odds = quotes.groupby(['timestamp', 'market']).agg({'value': 'max'}).reset_index()

    # Calculate implied probabilities
    best_odds['implied_prob'] = 1 / best_odds['value']

    # Pivot to have markets as columns
    odds_pivot = best_odds.pivot(index='timestamp', columns='market', values='implied_prob')

    # Drop rows with missing values (when odds for all outcomes are not available)
    odds_pivot = odds_pivot.dropna()

    # Calculate overround
    odds_pivot['overround'] = odds_pivot.sum(axis=1)

    # Identify arbitrage opportunities
    arbitrage_opportunities = odds_pivot[odds_pivot['overround'] < 1].copy()

    # Calculate profit margin
    arbitrage_opportunities['profit_margin'] = (1 - arbitrage_opportunities['overround']) * 100

    return arbitrage_opportunities

In [11]:
# Run the arbitrage scan over the combined quotes and show only the two
# summary columns (the per-market probability columns are omitted).
arbitrage_opportunities = calculate_arbitrage(prices)

summary_columns = ['overround', 'profit_margin']
print(
    "\nArbitrage Opportunities Over Time:",
    arbitrage_opportunities[summary_columns],
    sep="\n",
)


Arbitrage Opportunities Over Time:
market                            overround  profit_margin
timestamp                                                 
2024-10-30 18:14:26.054038+00:00      0.999            0.1
2024-10-30 18:15:26.228410+00:00      0.999            0.1
2024-10-30 18:16:26.406672+00:00      0.999            0.1
2024-10-30 18:17:26.535326+00:00      0.999            0.1
2024-10-30 18:40:31.701970+00:00      0.999            0.1
...                                     ...            ...
2024-11-06 09:35:49.650731+00:00      0.998            0.2
2024-11-06 09:36:49.755536+00:00      0.998            0.2
2024-11-06 09:37:49.837351+00:00      0.998            0.2
2024-11-06 09:38:49.952954+00:00      0.998            0.2
2024-11-06 09:39:50.052930+00:00      0.998            0.2

[2283 rows x 2 columns]


In [12]:
# Visualize profit margin over time.
# `plt` comes from the import cell at the top of the notebook; that cell must
# have been run in the current kernel before this one.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(arbitrage_opportunities.index, arbitrage_opportunities['profit_margin'], marker='o')
ax.set_title('Arbitrage Profit Margin Over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Profit Margin (%)')
ax.grid(True)
plt.show()

NameError: name 'plt' is not defined