In [119]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

# Read CSV files
polymarket = pd.read_csv("../output/polymarket_data.csv")
betfair = pd.read_csv("../output/betfair_data.csv")
oddschecker = pd.read_csv("../output/oddschecker_data.csv")
predictit = pd.read_csv("../output/predictit_data.csv")

#### Process datasets

In [147]:
# Convert 'timestamp' to datetime
for df in [polymarket, betfair, oddschecker, predictit]:
    df['timestamp'] = pd.to_datetime(df['timestamp'])

# Process each DataFrame to have consistent columns
# [Process oddschecker]
oddschecker = oddschecker[oddschecker['bet_name'].isin(['Donald Trump', 'Kamala Harris'])]
oddschecker_melted = oddschecker.melt(
    id_vars=['bet_name', 'timestamp'],
    var_name='source',
    value_name='value'
)
oddschecker_melted['value'] = pd.to_numeric(oddschecker_melted['value'], errors='coerce')
oddschecker_melted = oddschecker_melted[oddschecker_melted['source'].isin(['B3', 'SK', 'BF'])]
oddschecker_melted.rename(columns={'bet_name': 'market'}, inplace=True)
oddschecker_prices = oddschecker_melted[['timestamp', 'market', 'source', 'value']]

# [Process betfair]
betfair['market'] = betfair['bet_name']
betfair['source'] = 'betfairx'
betfair_prices = betfair[['timestamp', 'market', 'source', 'back_price']].rename(columns={'back_price': 'value'})

# [Process polymarket]
polymarket['market'] = polymarket['bet_id'].apply(
    lambda x: 'Donald Trump' if x == 253591 else 'Kamala Harris'
)
polymarket['source'] = 'polymarket'
polymarket['value'] = 1/polymarket['yes_price']
polymarket_prices = polymarket[['timestamp', 'market', 'source', 'value']]

# [Process predictit]
predictit['market'] = predictit['bet_name']
predictit['source'] = 'predictit'
predictit['value'] = 1/predictit['buy_yes_price']
predictit_prices = predictit[['timestamp', 'market', 'source', 'value']]

# Combine all prices
prices = pd.concat([oddschecker_prices, betfair_prices, polymarket_prices, predictit_prices], ignore_index=True)

# Ensure 'value' is numeric
prices['value'] = pd.to_numeric(prices['value'], errors='coerce')
prices = prices.dropna(subset=['value'])

In [148]:
# Append to prices DataFrame
prices = pd.concat([prices, predictit_prices], ignore_index=True)

# Reset index if necessary
prices.reset_index(inplace=True, drop=True)

# Display the resulting DataFrame
print(prices)

                             timestamp         market     source      value
0     2024-10-30 19:01:37.317865+00:00   Donald Trump         B3   1.500000
1     2024-10-30 19:01:37.317865+00:00  Kamala Harris         B3   2.700000
2     2024-10-30 19:02:51.730743+00:00   Donald Trump         B3   1.500000
3     2024-10-30 19:02:51.730743+00:00  Kamala Harris         B3   2.700000
4     2024-10-30 19:04:04.982139+00:00   Donald Trump         B3   1.500000
...                                ...            ...        ...        ...
75703 2024-11-06 09:37:30.261213+00:00  Kamala Harris  predictit  33.333333
75704 2024-11-06 09:38:30.611898+00:00   Donald Trump  predictit   1.020408
75705 2024-11-06 09:38:30.611898+00:00  Kamala Harris  predictit  33.333333
75706 2024-11-06 09:39:30.967674+00:00   Donald Trump  predictit   1.020408
75707 2024-11-06 09:39:30.967674+00:00  Kamala Harris  predictit  33.333333

[75708 rows x 4 columns]


In [149]:
all_timestamps = pd.date_range(start=prices['timestamp'].min(), end=prices['timestamp'].max(), freq='1 min')

In [150]:
# Convert 'timestamp' to datetime if not already
prices['timestamp'] = pd.to_datetime(prices['timestamp'])

# Set 'timestamp' as the index for resampling
prices.set_index('timestamp', inplace=True)

# Resample the data to fixed intervals (e.g., every minute), aggregating only the 'value' column
resampled_prices = prices.groupby(['market', 'source'])['value'].resample('1T').last()

# Reset index to turn 'timestamp' back into a column
resampled_prices = resampled_prices.reset_index()

  resampled_prices = prices.groupby(['market', 'source'])['value'].resample('1T').last()


In [151]:
# Step 3: Forward-Fill with Maximum Time Delta
max_delta = pd.Timedelta(minutes=15)

def forward_fill_with_time_limit(group):
    group = group.copy()
    group['value_ffill'] = group['value'].ffill()
    group['original_timestamp'] = group['timestamp'].where(group['value'].notnull())
    group['original_timestamp'] = group['original_timestamp'].ffill()
    group['time_diff'] = group['timestamp'] - group['original_timestamp']
    group.loc[group['time_diff'] > max_delta, 'value_ffill'] = np.nan
    return group

resampled_prices = resampled_prices.groupby(['market', 'source']).apply(forward_fill_with_time_limit).reset_index(drop=True)

  resampled_prices = resampled_prices.groupby(['market', 'source']).apply(forward_fill_with_time_limit).reset_index(drop=True)


In [152]:
pivoted_prices = resampled_prices.pivot_table(values='value_ffill', index='timestamp', columns=['market', 'source'])
pivoted_prices.columns = ['_'.join(col).strip() for col in pivoted_prices.columns.values]
pivoted_prices.reset_index(inplace=True)

In [153]:
markets = ['Donald Trump', 'Kamala Harris']
best_odds_list = []

for market in markets:
    market_cols = [col for col in pivoted_prices.columns if col.startswith(market)]
    market_df = pivoted_prices[['timestamp'] + market_cols].copy()

    def get_best_odds(row):
        odds = row[1:]
        if odds.isnull().all():
            return pd.Series([np.nan, np.nan])
        best_value = odds.max()
        best_source = odds.idxmax().split('_')[1]
        return pd.Series([best_value, best_source])

    market_df[['best_value', 'best_source']] = market_df.apply(get_best_odds, axis=1)
    market_df['market'] = market
    best_odds_list.append(market_df[['timestamp', 'market', 'best_value', 'best_source']])

best_odds_df = pd.concat(best_odds_list, ignore_index=True)

In [154]:
best_odds_pivot = best_odds_df.pivot(index='timestamp', columns='market', values=['best_value', 'best_source'])
best_odds_pivot.columns = ['_'.join(col).strip() for col in best_odds_pivot.columns.values]
best_odds_pivot.dropna(inplace=True)
best_odds_pivot.reset_index(inplace=True)

In [156]:
for market in markets:
    best_odds_pivot[f'implied_prob_{market}'] = 1 / best_odds_pivot[f'best_value_{market}']

implied_prob_cols = [f'implied_prob_{market}' for market in markets]
best_odds_pivot['overround'] = best_odds_pivot[implied_prob_cols].sum(axis=1)

arbitrage_opportunities = best_odds_pivot[best_odds_pivot['overround'] < 1].copy()
arbitrage_opportunities['profit_margin'] = (1 - arbitrage_opportunities['overround']) * 100

# Step 8: Display Results
print("\nArbitrage Opportunities with Best Providers (Aligned Timestamps):")
print(arbitrage_opportunities[['timestamp',
                               'best_value_Donald Trump', 'best_source_Donald Trump',
                               'best_value_Kamala Harris', 'best_source_Kamala Harris',
                               'overround', 'profit_margin']].head())


Arbitrage Opportunities with Best Providers (Aligned Timestamps):
                  timestamp best_value_Donald Trump best_source_Donald Trump  \
0 2024-10-30 17:49:00+00:00                1.694915                predictit   
1 2024-10-30 17:50:00+00:00                1.694915                predictit   
2 2024-10-30 17:51:00+00:00                1.694915                predictit   
3 2024-10-30 17:52:00+00:00                1.694915                predictit   
4 2024-10-30 17:53:00+00:00                1.694915                predictit   

  best_value_Kamala Harris best_source_Kamala Harris overround profit_margin  
0                  2.95421                polymarket    0.9285          7.15  
1                  2.95421                polymarket    0.9285          7.15  
2                  2.95421                polymarket    0.9285          7.15  
3                 2.945508                polymarket    0.9295          7.05  
4                 2.945508                polymarket    0

In [157]:
arbitrage_opportunities[arbitrage_opportunities['profit_margin']>5].head()

Unnamed: 0,timestamp,best_value_Donald Trump,best_value_Kamala Harris,best_source_Donald Trump,best_source_Kamala Harris,implied_prob_Donald Trump,implied_prob_Kamala Harris,overround,profit_margin
0,2024-10-30 17:49:00+00:00,1.694915,2.95421,predictit,polymarket,0.59,0.3385,0.9285,7.15
1,2024-10-30 17:50:00+00:00,1.694915,2.95421,predictit,polymarket,0.59,0.3385,0.9285,7.15
2,2024-10-30 17:51:00+00:00,1.694915,2.95421,predictit,polymarket,0.59,0.3385,0.9285,7.15
3,2024-10-30 17:52:00+00:00,1.694915,2.945508,predictit,polymarket,0.59,0.3395,0.9295,7.05
4,2024-10-30 17:53:00+00:00,1.694915,2.945508,predictit,polymarket,0.59,0.3395,0.9295,7.05


In [143]:
arbitrage_opportunities.to_csv('../output/arbs.csv',index=False)