# Data Processing (Ramirez)
Covered in this notebook:
1. Downloading final.dta from Ramirez's Google Drive.
2. Adding columns for all variables described in the paper:
    a. RankDist.
    b. WikiBuzz.
    c. Implied Probability / Inverse Odds.
3. Producing a cleaned `Ramirez_matches_cleaned.csv` file ready for further use.

In [1]:
# Imports, Remove Warnings for notebook readability.
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=UnicodeWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
# Google Drive share link; the file id is the second-to-last '/'-separated segment.
url = 'https://drive.google.com/file/d/1ltOks54u7V6tvK22EmT5n_XL8D2V4o8R/view?usp=share_link'
# Rebuild the share link as a direct-download URL by appending the file id.
path = ''.join(['https://drive.google.com/uc?export=download&id=', url.rsplit('/', 2)[-2]])

# Read the Stata file from the download URL into the working DataFrame.
df = pd.read_stata(path)

## RankDist Variable

In [3]:
def _inverse_rank(rank):
    """Return 1 / rank, or 0 when the rank is missing (None/NaN) or not positive."""
    # A rank of 0 (including the default argument) would otherwise raise
    # ZeroDivisionError; treat it like a missing rank instead.
    if pd.isna(rank) or rank <= 0:
        return 0
    return 1 / rank


def get_rankdist(winner_rank: float = 0, loser_rank: float = 0) -> float:
    """Compute the rank distance -(1 / winner_rank - 1 / loser_rank).

    Despite the parameter names, the first argument is simply the player the
    value is computed for and the second is the opponent; callers swap the
    arguments to obtain the other player's value.

    Args:
        winner_rank: Rank of the first player. Missing (None/NaN) or
            non-positive values contribute an inverse rank of 0.
        loser_rank: Rank of the second player, handled the same way.

    Returns:
        The difference of inverse ranks, negated. It is 0 when neither rank
        is usable.
    """
    inverse_wr = _inverse_rank(winner_rank)
    inverse_lr = _inverse_rank(loser_rank)
    # Keep the negated-difference form so results match the original expression exactly.
    return -(inverse_wr - inverse_lr)

# RankDist from each side of the match: the winner's value takes (wrank, lrank),
# the loser's value passes the same two columns in swapped order.
df['WRankDist'] = list(map(get_rankdist, df['wrank'], df['lrank']))
df['LRankDist'] = list(map(get_rankdist, df['lrank'], df['wrank']))

## WikiBuzz Variable

In [4]:
def get_wikibuzz(winner_pageviews, winner_median, loser_pageviews, loser_median):
    """Return log(winner_pageviews / winner_median) - log(loser_pageviews / loser_median).

    If either division raises ZeroDivisionError, the string
    'ZeroDivisionError' is returned instead of a number, so callers must
    filter that sentinel out.
    """
    try:
        first_ratio = winner_pageviews / winner_median
        second_ratio = loser_pageviews / loser_median
    except ZeroDivisionError:
        return 'ZeroDivisionError'
    return np.log(first_ratio) - np.log(second_ratio)

# WikiBuzz for each side of the match: the player's own page views and median
# come first, the opponent's second.
df['W_WikiBuzz'] = [
    get_wikibuzz(own_views, own_median, opp_views, opp_median)
    for own_views, own_median, opp_views, opp_median in zip(
        df['wiki_yesterday_w'], df['wiki_med365_w'],
        df['wiki_yesterday_l'], df['wiki_med365_l'],
    )
]
df['L_WikiBuzz'] = [
    get_wikibuzz(own_views, own_median, opp_views, opp_median)
    for own_views, own_median, opp_views, opp_median in zip(
        df['wiki_yesterday_l'], df['wiki_med365_l'],
        df['wiki_yesterday_w'], df['wiki_med365_w'],
    )
]

## Inverse Odds

In [5]:
def impliedprob(player_odds):
    """Return the inverse of the given odds, i.e. 1 / player_odds."""
    inverse_odds = 1 / player_odds
    return inverse_odds

# Each pair is (new inverse-odds column, odds column it is computed from),
# listed in the order the columns are added to the frame.
for inverse_col, odds_col in (
    ('W_inverse_bestodds', 'maxw'),
    ('L_inverse_bestodds', 'maxl'),
    ('W_inverse_avgodds', 'avgw'),
    ('L_inverse_avgodds', 'avgl'),
    ('W_inverse_B365', 'b365w'),
    ('L_inverse_B365', 'b365l'),
):
    df[inverse_col] = list(map(impliedprob, df[odds_col]))

In [6]:
# Build a single datetime column from the separate year / month / day columns.
df['date'] = pd.to_datetime(df.loc[:, ['year', 'month', 'day']])
# Copy the current row index into a column so every match carries an identifier.
df['match_id'] = df.index

In [7]:
# Reshape to one row per player: every match yields a winner row (outcome 1)
# and a loser row (outcome 0) that share the same match_id and column names.
df_winners = (
    df[['match_id', 'winner', 'date', 'year', 'WRankDist', 'W_WikiBuzz',
        'W_inverse_B365', 'W_inverse_avgodds', 'W_inverse_bestodds']]
    .rename(columns={
        'winner': 'player',
        'WRankDist': 'rankdist',
        'W_WikiBuzz': 'wikibuzz',
        'W_inverse_B365': 'inverse_b365',
        'W_inverse_avgodds': 'inverse_avg',
        'W_inverse_bestodds': 'inverse_best',
    })
    .assign(outcome=1)
)
df_losers = (
    df[['match_id', 'loser', 'date', 'year', 'LRankDist', 'L_WikiBuzz',
        'L_inverse_B365', 'L_inverse_avgodds', 'L_inverse_bestodds']]
    .rename(columns={
        'loser': 'player',
        'LRankDist': 'rankdist',
        'L_WikiBuzz': 'wikibuzz',
        'L_inverse_B365': 'inverse_b365',
        'L_inverse_avgodds': 'inverse_avg',
        'L_inverse_bestodds': 'inverse_best',
    })
    .assign(outcome=0)
)

# Stack winner rows on top of loser rows; the original row index is kept as-is.
df = pd.concat([df_winners, df_losers])

In [8]:
# Drop unusable rows, then order what is left chronologically:
#   - wikibuzz that is +/-infinity or the 'ZeroDivisionError' sentinel string
#   - missing wikibuzz, rankdist or inverse_avg
df = (
    df
    .loc[lambda frame: frame['wikibuzz'] != np.inf]
    .loc[lambda frame: frame['wikibuzz'] != -np.inf]
    .loc[lambda frame: frame['wikibuzz'] != 'ZeroDivisionError']
    .dropna(subset=['wikibuzz', 'rankdist', 'inverse_avg'])
    .sort_values(by='date')
)

In [9]:
# Write the cleaned frame to disk; the row index is written too (pandas default).
df.to_csv(path_or_buf="Ramirez_matches_cleaned.csv", index=True)