# Setup

In [1]:
# Base name shared by the cache files this notebook reads and writes.
DATA_FILE = "rounds"
# Path prefix for those caches; suffixes such as '.feather' or '_cites.feather' are appended.
DATA_PATH = './data/' + DATA_FILE
# Directory containing the per-tournament Tabroom JSON files, named '{id}.json'.
TABDATA_PATH = './data/tabroom'

In [184]:
import json
import os
from io import StringIO

import pandas as pd
from sqlalchemy import create_engine
from tqdm.notebook import tqdm

# Take the connection string from the DATABASE_URL environment variable so real credentials
# never need to be committed with the notebook; the local development URL is the fallback.
db = create_engine(os.environ.get("DATABASE_URL", "postgresql://postgres:password@localhost:5432/debate-cards"))
pd.set_option('display.float_format', '{:.3f}'.format)
tqdm.pandas()
%matplotlib inline

In [3]:
# Load the caselist rounds, preferring the local feather cache; on a cache miss, query
# Postgres through `db` and write the cache for the next run.
# NOTE(review): the "File" join matches on round.id whereas the other joins match on the
# quoted "teamId"/"schoolId"/"caselistId" columns — confirm File."roundId" references Round.id.
# Being a LEFT JOIN, it presumably repeats a round once per attached file; verify.
try:
  rounds = pd.read_feather(DATA_PATH + '.feather')
except FileNotFoundError:
  query = """
    SELECT 
      round."roundId", round.side, round.tournament, round.round, round.opponent, round.judge, round.report, round."opensourcePath", round."caselistUpdatedAt", 
      team."teamId", team.name as "teamName", team."displayName" as "teamDisplayName", team.notes, team."debater1First", team."debater1Last", team."debater2First", team."debater2Last", 
      school."schoolId", school.name as "schoolName", school."displayName" as "schoolDisplayName", school.state, school."chapterId", 
      caselist."caselistId", caselist.name as "caselistName", caselist."displayName" as "caselistDisplayName", caselist.year, caselist.event, caselist.level, caselist."teamSize",
      file.id as "fileId"
      FROM "Round" round 
      JOIN "Team" team ON team."teamId" = round."teamId"
      JOIN "School" school on school."schoolId" = team."schoolId"
      JOIN "Caselist" caselist on caselist."caselistId" = school."caselistId"
      LEFT JOIN "File" file ON file."roundId" = round.id;
  """
  rounds = pd.read_sql(query, db)  
  rounds.to_feather(DATA_PATH + '.feather')

In [20]:
rounds.to_csv('./datasets/caselists/rounds.csv')

In [21]:
# Load the cites table, preferring the local feather cache; on a cache miss, read the five
# selected columns from the "Cite" table through `db` and write the cache for the next run.
try:
  cites = pd.read_feather(DATA_PATH + '_cites.feather')
except FileNotFoundError:
  query = """ SELECT id, "citeId", title, cites, "roundId" FROM "Cite" """
  cites = pd.read_sql(query, db)
  print("Loaded")
  cites.to_feather(DATA_PATH + '_cites.feather')

In [22]:
# NOTE(review): rounds.csv is written under './datasets/caselists/' (plural) while this file
# goes to './datasets/caselist/' — confirm which directory is intended.
cites.to_csv('./datasets/caselist/cites.csv')

In [4]:
import glob
import re
# Tournament ids are the numeric file stems of the downloaded '{id}.json' files. The pattern
# accepts both '/' and '\' as the separator before the stem, and files whose stem is not
# purely numeric are skipped instead of crashing on a failed match.
_TOURN_FILE_RE = re.compile(r'(?:^|[\\/])(\d+)\.json$')
tournIds = sorted(
  int(match.group(1))
  for match in map(_TOURN_FILE_RE.search, glob.glob(f'{TABDATA_PATH}/*.json'))
  if match
)

In [None]:
from collections import Counter

# Count how often each event abbreviation occurs across all downloaded tournaments, falling
# back to the category abbreviation when the event has none.
abbrs = Counter()
for i in tqdm(tournIds):
  with open(f'{TABDATA_PATH}/{i}.json') as f:
    try:
      tourn = json.load(f)
      if 'categories' not in tourn: continue
      for category in tourn['categories']:
        for event in category['events']:
          abbrs[event['abbr'] or category['abbr']] += 1
    except Exception as e:
      # Best effort: report the file and the reason, then move on to the next one.
      # KeyboardInterrupt is not an Exception subclass, so interrupting still stops the loop.
      print(f'Failed: {i} ({type(e).__name__}: {e})')
abbrs

In [None]:
print("\n".join([f"{abbr or 'None'}: {str(abbrs[abbr])}" for abbr in abbrs]))

In [203]:
from bs4 import BeautifulSoup
import requests

def loadDoc(url):
  """Download `url` and return its body parsed as a BeautifulSoup document.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status, so an error page is
      never parsed as if it were the requested document.
    requests.Timeout: if the server does not respond within 30 seconds.
  """
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  # Name the parser explicitly so the result does not depend on which optional parsers
  # happen to be installed.
  return BeautifulSoup(response.text, "html.parser")

In [337]:
def loadTeamRecord(id1: int, id2: int):
  """Scrape the Tabroom lifetime-record page of the team made up of students id1 and id2.

  Every `.main > div.nospace.padtop` element after the first is read as one block: its
  <h5> text is the heading and its first <table> holds the rows.

  Returns:
    The tables concatenated into one DataFrame, keyed by (last four characters of the
    heading, heading without its last five characters).
    # presumably (year, tournament name) — confirm against the page markup
  """
  doc = loadDoc(f"https://www.tabroom.com/index/results/team_lifetime_record?id1={id1}&id2={id2}")
  divs = doc.select(".main > div.nospace.padtop")[1:]
  # Passing literal HTML to read_html is deprecated; hand it a file-like object instead.
  tables = [pd.read_html(StringIO(str(div.find('table'))))[0] for div in divs]
  names = [div.find('h5').text.strip() for div in divs]
  return pd.concat(tables, keys=[(name[-4:], name[0:-5]) for name in names])

In [431]:
# tournNum = 22381
tournNum = None

In [432]:
# with open(f'{TABDATA_PATH}/{tournNum}.json') as f:
#   tourn = json.load(f)
tourn = None

# Load

In [4]:
policyNames = ['cx', 'pol']
def isPolicy(s: str):
  """Return True when the lower-cased `s` contains any entry of policyNames.

  None is accepted and yields False.
  """
  if s is None:
    return False
  lowered = s.lower()
  for name in policyNames:
    if name in lowered:
      return True
  return False

def findPolicyEvents(tourn):
  """Collect the events of `tourn` that isPolicy recognises.

  When a category's own abbr or name matches, all of its events are taken; otherwise only
  the events whose abbr or name matches. A tournament without 'categories' yields [].
  """
  if 'categories' not in tourn:
    return []
  matched = []
  for category in tourn['categories']:
    if isPolicy(category['abbr']) or isPolicy(category['name']):
      matched.extend(category['events'])
      continue
    matched.extend(
      event for event in category['events']
      if isPolicy(event['abbr']) or isPolicy(event['name'])
    )
  return matched

def optional(dest: dict, src: dict, field: str):
  """Copy src[field] into dest under the same key; do nothing when src lacks the key."""
  if field not in src:
    return
  dest[field] = src[field]

In [5]:
# Optional keys copied verbatim onto an output row when the source object has them.
# Tuple order is the order in which the keys are inserted into the row.
_ROUND_OPTIONAL_FIELDS = ('label', 'flights', 'runoff')
_SECTION_OPTIONAL_FIELDS = ('bye',)
_BALLOT_OPTIONAL_FIELDS = (
  'judge_started', 'started_by', 'entered_by', 'audited_by', 'side', 'speakerorder',
  'judge', 'judge_code', 'chair', 'judge_first', 'judge_last',
  'entry_code', 'entry_name',
  'forfeit',
)

def _presentFields(src: dict, fields) -> dict:
  """Return {field: src[field]} for the given fields that exist in src, in `fields` order."""
  return {field: src[field] for field in fields if field in src}

def _scoreFields(ballot: dict, entries: dict) -> dict:
  """Derive the winloss and speakerNId / speakerNScore columns from ballot['scores']."""
  scores = ballot['scores']
  fields = {}

  # A decision is recorded only when it is unambiguous (exactly one winloss score).
  winlossScores = [s for s in scores if s.get('tag') == 'winloss']
  if len(winlossScores) == 1: fields['winloss'] = winlossScores[0]['value']

  pointScores = {
    s['speaker']: s['value']
    for s in scores
    if s.get('tag') == 'point' and 'speaker' in s and 'value' in s
  }
  speakers = set(pointScores)
  entry = ballot['entry']
  known = entries.get(entry, set())
  if len(speakers) < 2 and len(known) > len(speakers):
    # Fewer than two speakers on this ballot: fall back to the roster already known for the
    # entry, but only when that roster is more complete than what this ballot shows.
    # Otherwise an empty cached roster would wipe out a speaker the ballot does have.
    speakers = known
  else:
    entries[entry] = speakers

  # Ids are numbered in ascending order so a team gets the same speaker1/speaker2 everywhere.
  for i, speaker in enumerate(sorted(speakers), start=1):
    fields[f'speaker{i}Id'] = speaker
    if speaker in pointScores: fields[f'speaker{i}Score'] = pointScores[speaker]
  return fields

def getBallots(round, json_tourn, entries: dict[int, set[int]]):
  """Flatten one round into a list of per-ballot row dicts.

  Args:
    round: round object; must have 'id', 'type', 'name', 'protocol_name' and 'start_time',
      and may have 'sections', each of which may have 'ballots'.
    json_tourn: tournament-level columns repeated on every row.
    entries: entry id -> set of speaker ids, shared across calls. It is read to fill in
      speakers for ballots that list fewer than two, and updated with each ballot's speakers.

  Returns:
    One dict per ballot holding the tournament, round, section and ballot columns, plus
    'winloss' and speakerNId / speakerNScore columns when the ballot has 'scores'.
    A round without 'sections' yields [].
  """
  json_round = json_tourn | {
    "roundId": int(round['id']),
    "type": round['type'],
    "name": round['name'],
    "protocol_name": round['protocol_name'],
    "start_time": round['start_time']
  }
  json_round.update(_presentFields(round, _ROUND_OPTIONAL_FIELDS))

  ballots = []
  for section in round.get('sections', []):
    json_section = json_round | {
      "sectionId": int(section['id']),
      "room": section['room'],
      "letter": section['letter']
    }
    json_section.update(_presentFields(section, _SECTION_OPTIONAL_FIELDS))
    for ballot in section.get('ballots', []):
      json_ballot = json_section | {
        'ballotId': int(ballot['id']),
        'panelId': ballot['panel'],
        'entry': ballot['entry']
      }
      json_ballot.update(_presentFields(ballot, _BALLOT_OPTIONAL_FIELDS))
      if 'scores' in ballot:
        json_ballot.update(_scoreFields(ballot, entries))
      ballots.append(json_ballot)
  return ballots

In [18]:
# Optional per-result keys copied verbatim onto every row derived from that result.
# Tuple order is the order in which the keys are inserted into the row.
_RESULT_OPTIONAL_FIELDS = ('rank', 'place', 'percentile', 'entry', 'student', 'school', 'round')

def getResults(resultSet, json_tourn, entries: dict[int, set[int]]):
  """Flatten one result set into a list of row dicts, one per (result, value) pair.

  Args:
    resultSet: result-set object; must have 'label', 'coach' and 'generated', and yields
      rows only when it also has both 'results' and 'result_keys'.
    json_tourn: tournament-level columns repeated on every row.
    entries: entry id -> set of student ids, shared across calls; every result that names
      both an entry and a student adds that student to the entry's set.

  Returns:
    Row dicts carrying the tournament and result-set columns, the optional result columns,
    the 'tag' of the matching result key, and 'value' (converted to float when possible,
    otherwise left as found).
  """
  json_resultSet = json_tourn | {
    "label": resultSet['label'],
    "coach": resultSet['coach'],
    "generated": resultSet['generated'],
  }
  if 'results' not in resultSet or 'result_keys' not in resultSet: return []

  resultKeys = {resultKey['id']: resultKey for resultKey in resultSet['result_keys']}
  results = []
  for result in resultSet['results']:
    json_result = json_resultSet | {
      field: result[field] for field in _RESULT_OPTIONAL_FIELDS if field in result
    }
    # If result has entry and student, add student to that entry
    if 'student' in result and 'entry' in result:
      entries.setdefault(result['entry'], set()).add(result['student'])

    for result_value in result.get('values', []):
      if 'result_key' not in result_value or 'value' not in result_value: continue
      resultKey = resultKeys.get(result_value['result_key'])
      if resultKey is None: continue
      try:
        value = float(result_value['value'])
      except (TypeError, ValueError):
        # Non-numeric text raises ValueError; None and other non-scalars raise TypeError.
        # Either way keep the raw value rather than aborting the whole tournament.
        value = result_value['value']
      results.append(json_result | {
        "tag": resultKey['tag'],
        "value": value
      })
  return results


In [8]:
# try:
#   # noinspection
#   len(policyTournIds)
# except NameError:
policyTournIds = set()

def processTourn(id):
  """Read '{TABDATA_PATH}/{id}.json' and flatten its policy events.

  Returns a (results, ballots) pair of row-dict lists gathered from the 'result_sets' and
  'rounds' of every event selected by findPolicyEvents. As a side effect, `id` is added to
  the global policyTournIds when any such event has result sets or rounds.
  """
  # Output column name -> key of the tournament object it is read from.
  tournColumns = {
    'tournamentName': 'name',
    'tournamentStart': 'start',
    'tournamentEnd': 'end',
    'city': 'city',
    'state': 'state',
    'country': 'country',
    'webname': 'webname',
    'timezone': 'timezone',
  }
  with open(f'{TABDATA_PATH}/{id}.json') as f:
    tourn = json.load(f)
    json_tourn = {'tournamentId': id}
    for column, key in tournColumns.items():
      json_tourn[column] = tourn[key]

    # `entries` is shared by both extractors so rosters learned from one feed the other.
    results, ballots, entries = [], [], {}
    for event in findPolicyEvents(tourn):
      if 'result_sets' in event:
        policyTournIds.add(id)
        for resultSet in event['result_sets']:
          results.extend(getResults(resultSet, json_tourn, entries))
      if 'rounds' in event:
        policyTournIds.add(id)
        for tournRound in event['rounds']:
          ballots.extend(getBallots(tournRound, json_tourn, entries))
    return results, ballots
    

In [19]:
resultData = []
ballotData = []
# policyTournIds is reset to an empty set by the cell that defines processTourn, so on a
# fresh kernel the name exists but is empty. Treat "missing" and "empty" alike and scan
# every downloaded tournament; processTourn then fills policyTournIds for later runs.
try:
  ids = policyTournIds
except NameError:
  ids = set()
if not ids:
  print("Using all tournament ids")
  ids = tournIds
# sorted() takes a snapshot, so processTourn may add to policyTournIds while we iterate.
for i in tqdm(sorted(ids)):
  try:
    results, ballots = processTourn(i)
  except Exception:
    # Name the offending tournament, then re-raise with the original traceback intact.
    print(f'Failed: {i}')
    raise
  resultData += results
  ballotData += ballots

resultData = pd.DataFrame(resultData)
resultData.to_feather(DATA_PATH + '_results.feather')
print("Saved results")
ballotData = pd.DataFrame(ballotData)
ballotData.to_feather(DATA_PATH + '_ballots.feather')
print("Saved ballots")

  0%|          | 0/5167 [00:00<?, ?it/s]

Saved results
Saved ballots


In [129]:
ballotData = pd.read_feather(DATA_PATH + "_ballots.feather")

In [None]:
with open('./data/policyTournIds.json', 'w') as f:
  json.dump(sorted(policyTournIds), f)
len(policyTournIds)

In [57]:
teamSet = set(ballotData.groupby(['speaker1Id', 'speaker2Id'], as_index=False).count()[['speaker1Id', 'speaker2Id']].itertuples(index=False))
with open('./data/teamSet.json', 'w') as f:
  json.dump(sorted(teamSet), f)
len(teamSet)

44217

# Analyze

## Ballots

In [14]:
teamSet = set(ballotData[ballotData.speaker2Id.notna()].groupby(['speaker1Id', 'speaker2Id']).groups)

In [364]:
len(ballotData.sectionId.unique())

353756

In [27]:
ballotData.groupby('side').winloss.describe().loc[[1, 2]]

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
side,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,307884.0,0.518,0.5,0.0,0.0,1.0,1.0,2.0
2.0,307496.0,0.483,0.5,0.0,0.0,0.0,1.0,2.0


In [27]:
ballotData.count()

tournamentId       836627
tournamentName     836627
tournamentStart    836627
tournamentEnd      836627
city               588265
state              835122
country            835122
webname            830128
timezone           836627
roundId            836627
type               836627
name               836627
protocol_name      832762
start_time         835546
label              301065
sectionId          836627
room               836627
letter             835546
ballotId           836627
panelId            836627
entry              836627
side               831425
judge              807951
judge_first        807588
judge_last         807226
entry_code         836508
entry_name         836415
flights            817940
bye                 22382
winloss            615442
speaker1Id         573385
speaker1Score      467194
speaker2Id         554057
speaker2Score      452077
judge_code         229318
chair               10412
entered_by         562401
speaker3Id            874
speaker3Scor

In [21]:
(ballotData.entry_name.str.find('&') >= 0).mean()

0.8982652962431287

In [368]:
ballotData.entry_code.iloc[550000:]

550000       Concordia Boals & Spindler
550001          UMKC Mitchell & Willett
550002         UTD Beutelspacher & Kaul
550003     Wyomin' Mitchell & Radcliffe
550004    Texas Coltzer & Marriott-Voss
                      ...              
836622         Shahzeb Khan & Eesh Pant
836623         Shahzeb Khan & Eesh Pant
836624            Jason Fan & Lesley Xu
836625        Shreyas Jain & Steve Pait
836626           Angad Hayer & Moon Liu
Name: entry_code, Length: 286627, dtype: object

In [22]:
ballotData.tournamentName.value_counts().iloc[0:10]

National Speech and Debate Tournament                   15580
Glenbrooks Speech and Debate Tournament                 11725
University of Michigan HS Debate Tournament             10276
Nationals Prep                                           8993
Greenhill Fall Classic                                   8317
CHSSA Simulation Tournament                              8192
Cal Invitational UC Berkeley                             7676
Tournament of Champions                                  6387
University of Michigan HS Tournament                     5561
John Edie Holiday Debates Hosted by The Blake School     5144
Name: tournamentName, dtype: int64

In [26]:
ballotData \
  .groupby(['speaker1Id', 'speaker2Id']) \
  .filter(lambda x: len(x) > 20) \
  .groupby(['speaker1Id', 'speaker2Id']) \
  .winloss.aggregate(['mean', 'sum', 'count']) \
  .sort_values(['sum', 'mean'], ascending=False) \
  .iloc[0:15]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,sum,count
speaker1Id,speaker2Id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
203981.0,309087.0,0.818,356.0,435
700291.0,700295.0,0.696,344.0,494
583152.0,586494.0,0.74,321.0,434
736265.0,736266.0,0.717,276.0,385
706156.0,706162.0,0.647,266.0,411
974070.0,975610.0,0.609,257.0,422
553706.0,553709.0,0.622,242.0,389
380151.0,501530.0,0.696,215.0,309
520709.0,520710.0,0.781,214.0,274
480617.0,553146.0,0.627,212.0,338


In [341]:
def getTeamBallots(id1: int, id2: int) -> pd.DataFrame:
  """Return the ballotData rows of the two-person team made up of id1 and id2.

  The ids may be given in either order: they are sorted so the smaller one is compared
  with speaker1Id and the larger one with speaker2Id.
  """
  first, second = sorted((id1, id2))
  isFirst = ballotData.speaker1Id == first
  isSecond = ballotData.speaker2Id == second
  return ballotData[isFirst & isSecond]

In [353]:
def compareDownload(id1: int, id2: int) -> pd.DataFrame:
  """Compare per-tournament ballot counts in the downloaded data with the scraped record.

  Returns a DataFrame with two columns: 'download', the number of ballotData rows the team
  has at each tournament, and 'record', the summed length of the Decision strings that
  loadTeamRecord returns for it. A missing value on either side means that source has no
  rows for that index entry.
  """
  team = getTeamBallots(id1, id2)
  # One row per tournament: its first tournamentStart value and its number of ballots.
  teamTourns = team \
    .groupby(['tournamentId', 'tournamentName']) \
    .agg({'tournamentStart': 'first', 'tournamentId': 'count'}) \
    .rename(columns={'tournamentId': 'ballotCount'}) \
    .sort_values('tournamentStart', ascending=False)
  # First four characters of tournamentStart — presumably the year; requires the column to
  # still hold strings.
  years = teamTourns.tournamentStart.str[0:4]
  reset = teamTourns.reset_index()
  # Re-index by (those four characters, stripped tournament name), the same kind of key that
  # loadTeamRecord builds from the page headings.
  downloadCounts = reset.set_index([years, reset.tournamentName.str.strip()]).ballotCount.astype('Int64') # Nullable integer
  
  record = loadTeamRecord(id1, id2)
  # Sum Decision string lengths per (level 0, level 1) key, keeping the record's own order.
  # NOTE(review): assumes each character of Decision stands for one ballot — confirm.
  recordCounts = record.Decision.str.len().groupby(level=[0, 1], sort=False).sum()
  
  return pd.DataFrame({'download': downloadCounts, 'record': recordCounts })

In [370]:
id1, id2 = 203981, 309087
compareDownload(id1, id2)

Unnamed: 0,Unnamed: 1,download,record
2015,Ibis Debates at the University of Miami,4.0,4
2016,Cal Invitational at Berkeley HS Tournament,14.0,14
2016,John Edie Holiday Debates hosted by Blake,10.0,10
2016,NY Fall Face Off at Mamaroneck HS,12.0,12
2016,New York City Invitational Debate and Speech Tournament,18.0,18
2016,Policy Early Bird at Wake Forest,,12
2016,Samford University Bishop Guild,14.0,14
2016,The Tradition,9.0,9
2016,University of Michigan HS Debate Tournament,7.0,15
2017,Barkley Forum for High Schools,,9


In [372]:
(253 + 253) - (453)

53

In [28]:
teamGroups = ballotData.groupby(['speaker1Id', 'speaker2Id'])

In [29]:
teamGroups.ngroups

44217

In [56]:
%timeit (715369, 715370) in teamSet

61 ns ± 2.94 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [30]:
codes = ballotData.entry_code.dropna()
schools = codes[codes.str.match(r'.+ [A-Z]{2}$')].str.replace(r' [A-Z]{2}$', '', regex=True)

In [None]:
codes.str.match(r'.+ [A-Z]{2}$')

In [33]:
schoolCounts = schools.str.lower().str.replace(r'[^a-zA-Z]', '', regex=True).value_counts()
schoolCounts.iloc[0:20]

lexington                7066
montgomerybell           6694
glenbrooksouth           6433
glenbrooknorth           6405
                         6179
nileswest                5961
peninsula                5172
newtrier                 4890
notredame                4728
mamaroneck               4713
collegeprep              4231
woodward                 4024
liberty                  4022
greenhill                3831
nilesnorth               3665
damien                   3565
stmarksschooloftexas     3426
kansas                   3138
liberalartsandscience    3115
brooklyntechnical        3072
Name: entry_code, dtype: int64

In [49]:
caselistSchoolCounts = rounds[rounds.event == 'cx'].schoolDisplayName.str.lower().str.replace(r'[^a-zA-Z]', '', regex=True).value_counts()
caselistSchoolCounts.iloc[0:20]

kansas            3740
jesuitdallas      3001
glenbrooknorth    2913
woodward          2733
michigan          2679
emory             2585
lexington         2512
minnesota         2471
newtrier          2447
glenbrooksouth    2349
peninsula         2343
wakeforest        2199
kentucky          2139
georgemason       2028
georgetown        1959
liberty           1904
northwestern      1868
nileswest         1828
westminster       1780
stmarks           1765
Name: schoolDisplayName, dtype: int64

In [51]:
opponents = rounds[rounds.event == 'cx'].opponent.dropna()
opponentSchoolNames = opponents[opponents.str.match(r'.+ ?[A-Z]{2}$')].str.replace(r' [A-Z]{2}$', '', regex=True)
opponentSchoolCounts = opponentSchoolNames.str.lower().str.replace(r'[^a-zA-Z]', '', regex=True).value_counts()
opponentSchoolCounts

emory                 2114
kansas                2023
michigan              1928
mba                   1874
peninsula             1825
                      ... 
navymarywashington       1
webet                    1
minnesotaemory           1
indianamichigan          1
northestern              1
Name: opponent, Length: 4610, dtype: int64

In [52]:
# TODO: Add caselist opponent field to comparison
mergedCounts = pd.DataFrame({'caselist': caselistSchoolCounts, 'tabroom': schoolCounts, 'opponent': opponentSchoolCounts})
mergedCounts['max'] = mergedCounts.max(axis=1)
mergedCounts = mergedCounts.sort_values('max', ascending=False).drop(columns='max')
mergedCounts

Unnamed: 0,caselist,tabroom,opponent
lexington,2512.000,7066.000,1298.000
montgomerybell,1477.000,6694.000,612.000
glenbrooksouth,2349.000,6433.000,758.000
glenbrooknorth,2913.000,6405.000,690.000
,,6179.000,3.000
...,...,...,...
southmainecc,,,1.000
medinavalley,,,1.000
georgetownnight,,,1.000
southlakecarrol,,,1.000


In [145]:
mergedCounts.to_csv(DATA_PATH + "_schools.csv")

In [143]:
print(mergedCounts.to_string())

                                                                                                              caselist  tabroom  opponent
Lexington                                                                                                     2512.000 7066.000  1293.000
Montgomery Bell                                                                                               1477.000 6694.000   611.000
Glenbrook South                                                                                               2349.000 6433.000   750.000
Glenbrook North                                                                                               2913.000 6405.000   674.000
Niles West                                                                                                    1828.000 5961.000  1589.000
Peninsula                                                                                                     2343.000 5172.000  1817.000
New Trier                         

In [11]:
tabTourns = ballotData.tournamentName.value_counts()
tabTourns.iloc[0:20]

National Speech and Debate Tournament                          15580
Glenbrooks Speech and Debate Tournament                        11725
University of Michigan HS Debate Tournament                    10276
Nationals Prep                                                  8993
Greenhill Fall Classic                                          8317
CHSSA Simulation Tournament                                     8192
Cal Invitational UC Berkeley                                    7676
Tournament of Champions                                         6387
University of Michigan HS Tournament                            5561
John Edie Holiday Debates Hosted by The Blake School            5144
Michigan Summer Institutes                                      5068
1st and 2nd Year National Championships at Woodward Academy     4926
the biggest baddest test tournament                             4880
National Speech and Debate Season Opener hosted by UK           4816
New York City Invitational Debate 

In [12]:
wikiTourns = rounds.tournament.str.lower().value_counts()
wikiTourns.iloc[0:20]

glenbrooks      8018
harvard         6151
greenhill       5664
kentucky        4658
gsu             4378
blake           4274
berkeley        4171
any             3982
toc             3916
wake            3806
emory           3737
grapevine       3658
stanford        3408
st marks        3110
bronx           2878
all             2810
yale            2768
loyola          2757
contact info    2755
tfa state       2743
Name: tournament, dtype: int64

In [13]:
rounds.columns

Index(['roundId', 'side', 'tournament', 'round', 'opponent', 'judge', 'report',
       'opensourcePath', 'caselistUpdatedAt', 'teamId', 'teamName',
       'teamDisplayName', 'notes', 'debater1First', 'debater1Last',
       'debater2First', 'debater2Last', 'schoolId', 'schoolName',
       'schoolDisplayName', 'state', 'chapterId', 'caselistId', 'caselistName',
       'caselistDisplayName', 'year', 'event', 'level', 'teamSize'],
      dtype='object')

In [403]:
len(ballotData[ballotData.entry_code.str.len() < 5]) / len(ballotData)

0.10710507789014698

In [41]:
ballotData.columns

Index(['tournamentId', 'tournamentName', 'tournamentStart', 'tournamentEnd',
       'city', 'state', 'country', 'webname', 'timezone', 'roundId', 'type',
       'name', 'protocol_name', 'start_time', 'label', 'sectionId', 'room',
       'letter', 'ballotId', 'panelId', 'entry', 'side', 'judge',
       'judge_first', 'judge_last', 'entry_code', 'entry_name', 'flights',
       'bye', 'winloss', 'speaker1Id', 'speaker1Score', 'speaker2Id',
       'speaker2Score', 'judge_code', 'chair', 'entered_by', 'speaker3Id',
       'speaker3Score', 'forfeit', 'judge_started', 'speaker4Id',
       'speaker4Score', 'speaker5Id', 'speaker5Score', 'speakerorder',
       'runoff', 'audited_by', 'speaker6Id', 'speaker6Score', 'speaker7Id',
       'speaker7Score', 'speaker8Id', 'speaker8Score', 'speaker9Id',
       'speaker9Score', 'started_by'],
      dtype='object')

In [None]:
# `&` binds tighter than `!=`, so the comparison must be parenthesised; without the
# parentheses the expression is read as (notna() & speaker4Score) != 0.
ballotData[ballotData.speaker4Score.notna() & (ballotData.speaker4Score != 0)]

In [24]:
with open('./data/example.json') as f:
  tmp = json.load(f) ['categories'][3]['events'][0]['rounds']
  with open('./data/exampleRounds.json', 'w') as f2:
    json.dump(tmp, f2, indent=2)

In [None]:
len(ballotData.tournamentStart[ballotData.tournamentStart.str.startswith("2")]), len(ballotData)

## Results

In [38]:
resultData.nunique()

tournamentId         3836
tournamentName       2680
tournamentStart      2833
tournamentEnd        2806
city                  748
state                  48
country                 3
webname              2111
timezone               21
label                  26
bracket                 1
published               1
coach                   2
generated            8412
rank                  506
place                1036
percentile          20440
entry              116507
tag                   896
value              284604
student             41438
round                5236
school               1134
dtype: int64

In [58]:
resultData.school.value_counts()

315254.000    198
416758.000    166
422238.000    140
315289.000    135
316290.000    108
             ... 
434848.000      1
434859.000      1
434840.000      1
439038.000      1
438543.000      1
Name: school, Length: 1134, dtype: int64

## Cites

In [19]:
oldTagCounts = cites.cites.str.count(r'====') / 2
newTagCounts = cites.cites.str.count(r'####')
cites['tagCount'] = newTagCounts + oldTagCounts

In [21]:
cites.tagCount.sum()

1387929.0

In [23]:
nonZeroTagCounts = cites.tagCount[cites.tagCount != 0]
zeroTagCites = cites[cites.tagCount == 0]
len(nonZeroTagCounts), nonZeroTagCounts.mean(), len(zeroTagCites)

(178991, 7.754183171220899, 168328)

In [37]:
# NOTE(review): the pattern below is an empty string, which str.contains matches in every
# non-null value — the intended link pattern (e.g. a URL prefix) appears to be missing; confirm.
zeroTagLink = zeroTagCites.cites.str.contains('')
zeroTagLinkCites = zeroTagCites[zeroTagLink]

: 

## Matching

Ideas

Find wiki teams' student IDs on Tabroom
- Need to match school name on wiki to schools on tabroom
- Can use entry names to find specific teams
- Team names and attended tournaments can help with matching

Use tournament name + round num to match rounds
- Might be hard to match tournament name

Use opponent + judge + round num + side to match rounds
- Each individually is usually easier to match


In [91]:
ballotData = pd.read_feather(DATA_PATH + "_ballots.feather")

In [None]:
(ballotData.judge_first + ballotData.judge_last).value_counts().sort_values(ascending=False)

In [92]:
ballotData['tournamentStart'] = pd.to_datetime(ballotData.tournamentStart, errors='coerce')
ballotData['entry_initials'] = ballotData.entry_code.str.slice(-2).str.upper()

In [93]:
extraSpeakers = [f"speaker{num}{t}" for num in range(3, 10) for t in ("Id", "Score")]
ballotData = ballotData.dropna(subset='tournamentStart').drop(columns=extraSpeakers, errors='ignore')

In [94]:
def checkBothSides(round: pd.DataFrame):
    """Return True only when the frame's `side` column contains both 1 and 2."""
    sides = round.side.values
    return all(side in sides for side in (1, 2))

In [187]:
seasons = ballotData.groupby(pd.Grouper(key="tournamentStart", freq="AS-JUL"))
seasonFiltered = seasons.progress_apply(lambda season: season.groupby('sectionId').filter(checkBothSides))
seasonRounds = seasonFiltered.set_index([seasonFiltered.sectionId, seasonFiltered.side, seasonFiltered.index.get_level_values(1)])

  0%|          | 0/11 [00:00<?, ?it/s]

In [188]:
def roundTeamJudge(round: pd.DataFrame):
  """Summarise one section's ballots as a single matching record.

  Args:
    round: ballots of one section, indexed by a MultiIndex whose second level is the side
      (1 or 2). Both sides must be present.

  Returns:
    A Series with the initials of the side-1 and side-2 entries, plus the round name,
    capitalised judge names and tournament start year, all read from the first side-1
    ballot. Judge names that are not strings (None/NaN) are passed through unchanged.
  """
  def _capitalized(name):
    # Some ballots carry no judge name; str.capitalize would raise on None/NaN.
    return name.capitalize() if isinstance(name, str) else name

  # Only the first ballot of each side is used.
  aff = round.xs(1, level=1).iloc[0]
  neg = round.xs(2, level=1).iloc[0]
  return pd.Series({
    "aff_initials": aff.entry_initials,
    "neg_initials": neg.entry_initials,
    "roundName": aff['name'],
    "judge_first": _capitalized(aff.judge_first),
    "judge_last": _capitalized(aff.judge_last),
    # NOTE(review): this is the calendar year of the tournament start, while the ballots
    # were grouped into July-start seasons upstream — confirm it lines up with the caselist
    # `year` used on the wiki side for tournaments held in January-June.
    "season": aff.tournamentStart.year
  })

In [270]:
simpleRounds = seasonRounds.groupby(level=0).progress_apply(roundTeamJudge).droplevel(1).reset_index()
simpleRounds

Unnamed: 0,sectionId,aff_initials,neg_initials,roundName,judge_first,judge_last,season
0,135627,HZ,BC,1,Mike,Hester,2012
1,135628,CV,DR,1,Logan,Chin,2012
2,135631,BC,PS,1,Patri,Waldinger,2012
3,135632,MM,AM,1,John,Katsulas,2012
4,135633,CR,CE,1,Luke,Hill,2012
...,...,...,...,...,...,...,...
334867,6164526,MG,QY,8,Kevin,Hirn,2022
334868,6164527,RV,FZ,8,Colton,Gilbert,2022
334869,6164528,LH,LM,8,David,Kilpatrick,2022
334870,6164529,WW,SE,8,Scott,Wheeler,2022


In [197]:
simpleRounds.to_feather(DATA_PATH + "_simpleRounds.feather")

In [273]:
subset=['aff_initials', 'neg_initials' , 'judge_last', 'season']
duplicates = simpleRounds[simpleRounds.duplicated(keep=False, subset=subset)]
# duplicates = duplicates.dropna(axis=0, subset=['judge_first', 'judge_last'])
duplicates = duplicates[~duplicates.aff_initials.str.startswith("0")]
print(len(duplicates), len(simpleRounds))
duplicates

1643 334872


Unnamed: 0,sectionId,aff_initials,neg_initials,roundName,judge_first,judge_last,season
863,139905,LW,MP,4,Craig,Hennigan,2012
943,139999,LW,MP,14,Craig,Hennigan,2012
946,140002,KM,CP,15,Max,Archer,2012
965,140022,GL,CP,16,,,2012
969,140026,GL,CP,16,,,2012
...,...,...,...,...,...,...,...
332769,6125455,EZ,EZ,4,Andre,Sutton,2022
332781,6125467,EZ,NE,4,Daniel,Perez,2022
334320,6156421,ON,ON,4,Jeremiah,Etcheverry,2022
334523,6160687,EP,LS,6,,,2022


#### Caselist

In [6]:
(rounds.report.str.contains("contact", case=False)).sum()

3106

In [57]:
policyRounds = rounds[~(
  rounds.tournament.str.lower().str.contains("contact", case=False) | 
  rounds.report.str.contains("contact", case=False) |
  (rounds.tournament == "All Tournaments")
) & (
  rounds.event == "cx"
)]
len(policyRounds), len(rounds[rounds.event == "cx"])

(182360, 186658)

In [78]:
# Could also Match AbCd but lazy
hasOpponent = policyRounds.opponent.str.match(r'.+ ([a-zA-Z]){2}$').fillna(False) | policyRounds.opponent.str.contains("&")
hasOpponent.sum(), len(policyRounds), hasOpponent.sum() / len(policyRounds)

(152154, 182360, 0.8343606053959202)

In [79]:
fullJudge = policyRounds.judge.str.match(r'^\w+ \w+$')
fullJudge.sum(), len(policyRounds), fullJudge.sum() / len(policyRounds)

(100270, 182360, 0.5498464575564816)

In [85]:
# Could also handle Finals, Semifinals etc.
roundNum = policyRounds['round'].isin(str(i) for i in range(10))
roundNum.sum(), len(policyRounds), roundNum.sum() / len(policyRounds)

(159715, 182360, 0.8758225488045625)

In [216]:
fullData = policyRounds[hasOpponent & fullJudge & roundNum].set_index("roundId")
len(fullData), len(policyRounds), len(fullData) / len(policyRounds)

(93489, 182360, 0.5126617679315639)

In [246]:
def wikiroundTeamSide(round: pd.DataFrame):
  """Reduce the first row of a wiki round group to the fields used for matching.

  The team's initials are the 4th-from-last and 2nd-from-last characters of
  teamDisplayName; the opponent's are the upper-cased last two characters of `opponent`.
  A `side` of "A" puts the team on the aff, anything else on the neg. `judge` must split
  into exactly two space-separated parts.
  """
  row = round.iloc[0]
  displayName = row['teamDisplayName']
  ownInitials = displayName[-4] + displayName[-2]
  theirInitials = row['opponent'][-2:].upper()
  firstName, lastName = row['judge'].split(" ")
  onAff = row['side'] == "A"
  affInitials, negInitials = (ownInitials, theirInitials) if onAff else (theirInitials, ownInitials)
  return pd.Series({
    "aff_initials": affInitials,
    "neg_initials": negInitials,
    "roundName": row['round'],
    "judge_first": firstName.capitalize(),
    "judge_last": lastName.capitalize(),
    "season": row['year']
  })

In [247]:
wikiSimpleRounds = fullData.groupby(level=0).progress_apply(wikiroundTeamSide).reset_index().astype({'roundName': 'int64'})
wikiSimpleRounds

  0%|          | 0/93489 [00:00<?, ?it/s]

Unnamed: 0,roundId,aff_initials,neg_initials,roundName,judge_first,judge_last,season
0,564515,RC,BC,4,Brian,Delong,2014
1,564516,RC,MR,2,Tess,Botkin,2014
2,564517,RC,KG,8,Edmund,Zagorin,2014
3,564518,RC,LM,1,Andy,Montee,2014
4,564519,RC,BO,6,Emily,Bosch,2014
...,...,...,...,...,...,...,...
93484,966386,GS,DM,3,Will,Katz,2022
93485,966388,CC,HH,3,Tim,Freehan,2022
93486,966390,NE,DW,3,Maria,Sanchez,2022
93487,966393,MZ,AG,2,Matt,Stannard,2022


In [248]:
# NOTE(review): the other cache paths in this notebook put '_' between DATA_PATH and the
# suffix (e.g. "_simpleRounds.feather"); this one resolves to
# './data/roundswikiSimpleRounds.feather' — confirm whether the underscore was left out by accident.
wikiSimpleRounds.to_feather(DATA_PATH + "wikiSimpleRounds.feather")

In [290]:
mergedRounds = pd.merge(wikiSimpleRounds, simpleRounds, how='left', on=["aff_initials", "neg_initials", "roundName", "judge_first",  "judge_last", "season"])
mergedRounds.sectionId.count(), len(policyRounds), mergedRounds.sectionId.count() / len(policyRounds)

(19188, 182360, 0.10522044307962272)

In [252]:
mergedRounds[mergedRounds.sectionId.notna()]

Unnamed: 0,roundId,aff_initials,neg_initials,roundName,judge_first,judge_last,season,sectionId
23,564541,RR,NS,2,Ben,Menzies,2014,507858.000
25,564544,TM,RR,4,Malcolm,Gordon,2014,508634.000
26,564545,BQ,RR,7,Ben,Crossan,2014,509167.000
27,564546,MP,RR,6,Luis,Magallon,2014,508890.000
52,564610,CM,KR,1,Scott,Elliott,2014,510759.000
...,...,...,...,...,...,...,...,...
93186,962505,LS,LH,1,Todd,Le,2022,6045613.000
93209,962665,MF,LS,2,Brooke,Erickson,2022,6048149.000
93210,962673,CS,LS,3,Tanner,Cronce,2022,6051384.000
93223,962901,CS,MT,2,Joe,Klopotek,2022,6118868.000


In [263]:
# ballotData[ballotData.sectionId == 4095849]
gTmp = ballotData[ballotData.tournamentId == 12924]

In [291]:
# policyRounds[policyRounds.roundId == 741459]
# policyRounds[policyRounds.teamId == 63181]

In [298]:
fullMerged = pd.merge(policyRounds, mergedRounds, how="outer", on=['roundId'])
fullMerged = pd.merge(fullMerged, ballotData, how='outer', on=['sectionId'], suffixes=('_wiki', '_tabroom'))

In [299]:
counts = fullMerged.count()