In [57]:
filepath = "originalData"

# Only the header row is needed here: it carries the comma-separated
# column labels of the data file.
with open(filepath, "r", encoding='utf-8') as f:
    colnames = f.readline().strip("\n").split(",")

# Debug aid: enable the print below to list every column with its position.
for idx, name in enumerate(colnames):
    # print(f"{idx} - {name}")
    pass

In [58]:
def get_mapping_colnameTOidx(data: str | list[str]):
    """
    Reads the first line of csv and intialize bidirectional mapping dictionaries.
    Accepts either filepath string or list of labels. 
    """
    if isinstance(data, str):
        with open(data, "r", encoding="utf-8") as f:
            info_line = f.readline().strip("\n").split(",")
    elif isinstance(data, list):
        info_line = data
    
    mapping_dict = {}
    for idx,colname in enumerate(info_line):
        mapping_dict[idx] = colname
        mapping_dict[colname] = idx
    return mapping_dict

def create_tournaments_dict(data_filepath: str) -> tuple[dict, dict]:
    """Read a match-level .csv and group its rows by tournament.

    Rows are split on "," only; quoted fields are not interpreted. No
    columns are removed and empty fields are kept as empty strings.

    Args:
        data_filepath: Path to the comma-separated data file. Its header
            must contain the columns "tourney_id", "winner_name",
            "loser_name" and "round".

    Returns:
        A tuple ``(tourney_dict, mapping_dict)`` where

        * ``tourney_dict`` maps each tourney_id to a list of
          ``(match_info, winner_name, loser_name, round)`` tuples, with
          ``match_info`` being the full list of raw string fields of the row;
        * ``mapping_dict`` is the two-way column-name/index lookup returned
          by ``get_mapping_colnameTOidx``.

    Raises:
        KeyError: If one of the required columns is missing from the header.
    """
    mapping_dict = get_mapping_colnameTOidx(data_filepath)
    tourney_id_idx = mapping_dict["tourney_id"]
    winner_idx = mapping_dict["winner_name"]
    loser_idx = mapping_dict["loser_name"]
    round_idx = mapping_dict["round"]

    tourney_dict = {}
    with open(data_filepath, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of loading every line at once.
        for line in f:
            # Strip the newline before any field is read, so that the last
            # column never carries a trailing "\n".
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines would otherwise fail on the index lookups.
                continue

            match_info = line.split(",")
            tourney_id = match_info[tourney_id_idx]

            # Header rows are recognised by their own label in the id column.
            if tourney_id == "tourney_id":
                continue

            processed_match = (
                match_info,
                match_info[winner_idx],
                match_info[loser_idx],
                match_info[round_idx],
            )
            tourney_dict.setdefault(tourney_id, []).append(processed_match)

    return (tourney_dict, mapping_dict)

# Parse the raw data once; later cells work on these two globals.
(tourney_dict, mapping_dict) = create_tournaments_dict(data_filepath="originalData")

In [59]:
# Keep only the tournaments that contain a final ("F") match and remember
# the (winner, loser) pair of that final for each of them.
new_tourney_dict = {}
finalists_dict = {}
for tourney_id, matches in tourney_dict.items():
    for _match_info, match_winner, match_loser, round_name in matches:
        if round_name == "F":
            finalists_dict[tourney_id] = (match_winner, match_loser)
            new_tourney_dict[tourney_id] = matches
            break  # the first final found decides; ignore any later ones
tourney_dict = new_tourney_dict

print(len(tourney_dict))
round_idx = mapping_dict["round"]

# Collect the number of matches per tournament and dump every tournament
# that has exactly 7 matches for manual inspection.
lengths = []
for tourney_id, tourney in tourney_dict.items():
    lengths.append(len(tourney))
    if len(tourney) == 7:
        for t in tourney:
            print(t)
        print("*********")

print(set(lengths))


611
(['2010-W-CHA-INA-01A-2010', 'Tournament of Champions', 'Hard', '8', 'F', '20101101', '1', '200617', '', 'WC', 'Kimiko Date Krumm', 'R', '163', 'JPN', '40.0', '201419', '1', '', 'Na Li', 'R', '172', 'CHN', '28.6', '6-4 3-6 6-4', '3', 'QF', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '53', '1130', '11', '3540'], 'Kimiko Date Krumm', 'Na Li', 'QF')
(['2010-W-CHA-INA-01A-2010', 'Tournament of Champions', 'Hard', '8', 'F', '20101101', '2', '201421', '', '', 'Ana Ivanovic', 'R', '184', 'SRB', '22.9', '201499', '3', '', 'Anastasia Pavlyuchenkova', 'R', '177', 'RUS', '19.3', '6-0 6-1', '3', 'QF', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '24', '2255', '20', '2520'], 'Ana Ivanovic', 'Anastasia Pavlyuchenkova', 'QF')
(['2010-W-CHA-INA-01A-2010', 'Tournament of Champions', 'Hard', '8', 'F', '20101101', '3', '201290', '', 'WC', 'Daniela Hantuchova', 'R', '181', 'SVK', '27.5', '201518', '4', '', 'Yanina Wickmayer', 'R', '182', '

In [60]:
# Does each tournament have the same number of matches for tournament winner and loser?
# For both finalists, count the non-final matches they won; tournaments where
# the two counts differ are collected in `interested_tourneys`.
interested_tourneys = {}

for tourney_id, tourney in tourney_dict.items():
    finalist_winner, finalist_loser = finalists_dict[tourney_id]

    finalist_winner_matches = []
    finalist_loser_matches = []

    for match in tourney:
        _match_info, match_winner, _match_loser, round_name = match
        if round_name == "F":  # skip final match
            continue
        if match_winner == finalist_winner:
            finalist_winner_matches.append(match)
        elif match_winner == finalist_loser:
            finalist_loser_matches.append(match)

    if len(finalist_winner_matches) != len(finalist_loser_matches):
        print(f"winner: {len(finalist_winner_matches)} loser: {len(finalist_loser_matches)}")
        interested_tourneys[tourney_id] = tourney


winner: 4 loser: 5
winner: 4 loser: 3
winner: 3 loser: 4
winner: 5 loser: 4
winner: 3 loser: 4
winner: 4 loser: 5
winner: 3 loser: 4
winner: 5 loser: 4
winner: 4 loser: 5
winner: 3 loser: 4
winner: 4 loser: 3
winner: 4 loser: 3
winner: 5 loser: 4
winner: 4 loser: 5
winner: 4 loser: 5
winner: 4 loser: 3
winner: 4 loser: 5
winner: 3 loser: 4
winner: 4 loser: 3
winner: 5 loser: 4
winner: 3 loser: 4
winner: 4 loser: 3
winner: 5 loser: 4
winner: 3 loser: 4
winner: 3 loser: 4
winner: 4 loser: 3
winner: 3 loser: 4
winner: 4 loser: 5
winner: 4 loser: 3
winner: 3 loser: 4
winner: 4 loser: 5
winner: 4 loser: 3
winner: 5 loser: 4
winner: 4 loser: 5
winner: 4 loser: 3
winner: 4 loser: 5
winner: 4 loser: 5
winner: 4 loser: 5
winner: 3 loser: 4
winner: 5 loser: 4
winner: 5 loser: 4
winner: 4 loser: 5
winner: 5 loser: 4
winner: 4 loser: 3
winner: 3 loser: 4
winner: 4 loser: 3
winner: 4 loser: 5
winner: 3 loser: 4
winner: 3 loser: 4
winner: 4 loser: 5
winner: 3 loser: 4
winner: 4 loser: 5
winner: 4 lo

In [61]:
# List the ids of the tournaments flagged above and show which match counts
# occur among them.
interested_t_lengths = []
for tourney_id, tourney in interested_tourneys.items():
    interested_t_lengths.append(len(tourney))
    print(tourney_id)

print(set(interested_t_lengths))

2010-W-PM-CHN-01A-2010
2010-W-PR-AUS-01A-2010
2010-W-PR-FRA-01A-2010
2010-W-PR-ITA-01A-2010
2010-W-PR-RUS-01A-2010
2010-W-PR-USA-04A-2010
2010-W-PR-USA-05A-2010
2011-W-INT-GBR-01A-2011
2011-W-PM-CHN-01A-2011
2011-W-PR-BEL-01A-2011
2011-W-PR-FRA-01A-2011
2011-W-PR-GER-01A-2011
2011-W-PR-JPN-01A-2011
2011-W-PR-UAE-01A-2011
2011-W-PR-USA-01A-2011
2011-W-PR-USA-02A-2011
2011-W-PR-USA-04A-2011
2011-W-PR-USA-05A-2011
2011-W-WT-TUR-01A-2011
2012-W-INT-GBR-01A-2012
2012-W-PR-BEL-01A-2012
2012-W-PR-FRA-01A-2012
2012-W-PR-JPN-01A-2012
2012-W-PR-USA-02A-2012
2012-W-PR-USA-05A-2012
2013-W-CHA-BUL-01A-2013
2013-W-PR-AUS-02A-2013
2013-W-PR-CAN-01A-2013
2013-W-PR-FRA-01A-2013
2013-W-PR-RUS-01A-2013
2013-W-PR-USA-01A-2013
2013-W-PR-USA-04A-2013
2014-W-PM-CHN-01A-2014
2014-W-PR-CAN-01A-2014
2014-W-PR-FRA-01A-2014
2014-W-PR-GBR-01A-2014
2014-W-PR-ITA-01A-2014
2014-W-PR-USA-03A-2014
2014-W-PR-USA-04A-2014
2015-W-P5-CAN-01A-2015
2015-W-P5-CHN-01A-2015
2015-W-P5-ITA-01A-2015
2015-W-P700-GBR-02A-2015
2015-W