# 1st function

In [1]:

import csv

def filter_by_year(statistics, year, yearid):
    filtered_stats = []
    for row in statistics:
        if year == int(row[yearid]): # Comparing every row's year to the input year value
            filtered_stats.append(row) # Appending row, belonging to the input year
    return filtered_stats
          
    
baseballdatainfo = {
    "masterfile": "Master_2016.csv",   # Name of Master CSV file
    "battingfile": "Batting_2016.csv", # Name of Batting CSV file
    "separator": ",",                  # Separator character in CSV files
    "quote": '"',                      # Quote character in CSV files
    "playerid": "playerID",            # Player ID field name
    "firstname": "nameFirst",          # First name field name
    "lastname": "nameLast",            # Last name field name
    "yearid": "yearID",                # Year field name
    "atbats": "AB",                    # At bats field name
    "hits": "H",                       # Hits field name
    "doubles": "2B",                   # Doubles field name
    "triples": "3B",                   # Triples field name
    "homeruns": "HR",                  # Home runs field name
    "walks": "BB",                     # Walks field name
    "battingfields": ["AB", "H", "2B", "3B", "HR", "BB"]
}

statistics = []  # batting_2016.csv file extracted as list_of_dictionaries
with open(baseballdatainfo['battingfile'], newline='') as csvfile:
    csvreader = csv.DictReader(csvfile, delimiter=',', quotechar="'")
    for row in csvreader:
        statistics.append(row)

print(list(statistics[0].keys()))
        

filter_by_year(statistics, 1935, "yearID") # filters list of dictionaries are 513 for year 1935



['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP']


513

## 2nd function

In [38]:
MINIMUM_AB = 500

def slugging_percentage(info, batting_stats):
    """
    Inputs:
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the slugging percentage as a float
    """
    #     hits = float(batting_stats[info["hits"]])
    #     doubles = float(batting_stats[info["doubles"]])
    #     triples = float(batting_stats[info["triples"]])
    #     home_runs = float(batting_stats[info["homeruns"]])
    #     singles = hits - doubles - triples - home_runs
    #     at_bats = float(batting_stats[info["atbats"]])
    
    stats_list = []
    
    for row in batting_stats:
        hits = float(row[info["hits"]])
        doubles = float(row[info["doubles"]])
        triples = float(row[info["triples"]])
        home_runs = float(row[info["homeruns"]])
        singles = hits - doubles - triples - home_runs
        at_bats = float(row[info["atbats"]])
        
        if (at_bats >= MINIMUM_AB):
            stats_list.append(float((singles + 2 * doubles + 3 * triples + 4 * home_runs) / (at_bats)))
    
    return stats_list   
    #     if at_bats >= MINIMUM_AB:
    #         return float((singles + 2 * doubles + 3 * triples + 4 * home_runs) / at_bats)
    #     else:
    #         return 0

def onbase_percentage(info, batting_stats):
    """
    Inputs:
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the on-base percentage as a float
    """
    hits = float(row[info["hits"]])
    at_bats = float(row[info["atbats"]])
    walks = float(row[info["walks"]])
    if at_bats >= MINIMUM_AB:
        return (hits + walks) / (at_bats + walks)
    else:
        return 0
       
def top_player_ids(info, statistics, onbase_percentage, numplayers):
    """
    Inputs:
      info       - Baseball data information dictionary
      statistics - List of batting statistics dictionaries
      formula    - function that takes an info dictionary and a
                   batting statistics dictionary as input and
                   computes a compound statistic
      numplayers - Number of top players to return
    Outputs:
      Returns a list of tuples, player ID and compound statistic
      computed by formula, of the top numplayers players sorted in
      decreasing order of the computed statistic.
    """
    # preparing list of playerid and list of stats
    playerid_list = []
    stats_list = []
    for row in statistics:
        playerid_list.append(row[info['playerid']])
        stats_list.append(onbase_percentage(info, statistics))
    
    # building the list of tuples
    player_list_of_tuple = list(map(lambda player, stats : (player, stats), playerid_list, stats_list)) 

    # reversing the obtained list of tuples based on the stats value
    player_list_of_tuple.sort(key = lambda x:x[1], reverse=True)   
    
    # extracting the top 10 players from the obtained list of tuples
    top_ids_and_stats = []
    for x in range(numplayers):
        top_ids_and_stats.append(player_list_of_tuple[x])
        
    return (top_ids_and_stats)

top_ids_and_stats = top_player_ids(baseballdatainfo, statistics, onbase_percentage,  10)

# writing ids and stats to a file
# with open('ok1.csv', 'w', newline='') as csvfile:
#     csvwriter = csv.writer(csvfile, delimiter=',', quotechar="'")
#     for row in top_ids_and_stats:
#         csvwriter.writerow(row)
        


In [39]:
print(top_ids_and_stats)

[('abercda01', 0), ('addybo01', 0), ('allisar01', 0), ('allisdo01', 0), ('ansonca01', 0), ('armstbo01', 0), ('barkeal01', 0), ('barnero01', 0), ('barrebi01', 0), ('barrofr01', 0)]


# 3rd function

In [3]:

def lookup_player_names(info, top_ids_and_stats):
    """
    Inputs:
      info              - Baseball data information dictionary
      top_ids_and_stats - list of tuples containing player IDs and
                          computed statistics
    Outputs:
      List of strings of the form "x.xxx --- FirstName LastName",
      where "x.xxx" is a string conversion of the float stat in
      the input and "FirstName LastName" is the name of the player
      corresponding to the player ID in the input.
    """
    player_names = []
    with open("Master_2016.csv", newline = '') as master_file:
        master_reader = csv.DictReader(master_file, delimiter=',', quotechar = "'")
        for row in master_reader:
            for id in top_ids_and_stats:
                if id[0] == row[info['playerid']]:
                    player_names.append(str(f'{id[1]:.3f}')+" --- " + row[info['firstname']]+ " " + row[info['lastname']])  
    
    return player_names


lookup_player_names(baseballdatainfo, top_ids_and_stats)

['0.501 --- Hugh Duffy',
 '0.516 --- Billy Hamilton',
 '0.506 --- Rogers Hornsby',
 '0.498 --- Joe Kelley',
 '0.542 --- Babe Ruth',
 '0.510 --- Babe Ruth',
 '0.509 --- Babe Ruth',
 '0.497 --- Ted Williams',
 '0.496 --- Ted Williams',
 '0.496 --- Ted Williams']

In [None]:
# Typical cutoff used for official statistics
MINIMUM_AB = 500

def batting_average(info, batting_stats):
    """
    Inputs:
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the batting average as a float
    """
    hits = float(batting_stats[info["hits"]])
    at_bats = float(batting_stats[info["atbats"]])
    if at_bats >= MINIMUM_AB:
        return hits / at_bats
    else:
        return 0

def onbase_percentage(info, batting_stats):
    """
    Inputs:
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the on-base percentage as a float
    """
    hits = float(batting_stats[info["hits"]])
    at_bats = float(batting_stats[info["atbats"]])
    walks = float(batting_stats[info["walks"]])
    if at_bats >= MINIMUM_AB:
        return (hits + walks) / (at_bats + walks)
    else:
        return 0

def slugging_percentage(info, batting_stats):
    """
    Inputs:
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the slugging percentage as a float
    """
    hits = float(batting_stats[info["hits"]])
    doubles = float(batting_stats[info["doubles"]])
    triples = float(batting_stats[info["triples"]])
    home_runs = float(batting_stats[info["homeruns"]])
    singles = hits - doubles - triples - home_runs
    at_bats = float(batting_stats[info["atbats"]])
    if at_bats >= MINIMUM_AB:
        return (singles + 2 * doubles + 3 * triples + 4 * home_runs) / at_bats
    else:
        return 0

# 4th function

In [5]:
def compute_top_stats_year(info, formula, numplayers, year):
    """
    Inputs:
      info        - Baseball data information dictionary
      formula     - function that takes an info dictionary and a
                    batting statistics dictionary as input and
                    computes a compound statistic
      numplayers  - Number of top players to return
      year        - Year to filter by
    Outputs:
      Returns a list of strings for the top numplayers in the given year
      according to the given formula.
    """
    yearbased_list_of_dictionary = []
    for row in statistics:
        if int(row['yearID']) == year:
            yearbased_list_of_dictionary.append(row)
    
    list_of_top_players_by_year = []
    
#         for row in master_reader:
#             for id in top_ids_and_stats:
#                 if id[0] == row[info['playerid']]:
#                     player_names.append(str(f'{id[1]:.3f}')+" --- " + row[info['firstname']]+ " " + row[info['lastname']])
    for row in yearbased_list_of_dictionary:
#             list_of_top_players_by_year.append(slugging_percentage(baseballdatainfo, yearbased_list_of_dictionary))
        stat = f'{slugging_percentage(baseballdatainfo, row):.5f}'
        with open("Master_2016.csv", newline = '') as master_file:
            master_reader = csv.DictReader(master_file, delimiter=',', quotechar = "'")
            for row2 in master_reader:
                if row2[info['playerid']] == row[info['playerid']]:
                    list_of_top_players_by_year.append(str(stat)+" --- " + row2[info['firstname']]+ " " + row2[info['lastname']])  

    
    return list_of_top_players_by_year

xyz = compute_top_stats_year(baseballdatainfo, slugging_percentage, 20, 1936)

# 5th function

In [35]:
def aggregate_by_player_id(statistics, playerid, fields):
    """
    Inputs:
      statistics - List of batting statistics dictionaries
      playerid   - Player ID field name
      fields     - List of fields to aggregate
    Output:
      Returns a nested dictionary whose keys are player IDs and whose values
      are dictionaries of aggregated stats.  Only the fields from the fields
      input will be aggregated in the aggregated stats dictionaries.
    """

    aggregate_fields = {playerid:{'AB':0, 'H':0, '2B':0, '3B':0, 'HR':0, 'BB':0}}

    for row in statistics:
        if row['playerID'] == playerid:            
            for field in fields:
                   aggregate_fields[playerid][baseballdatainfo[field]] = float(aggregate_fields[playerid][baseballdatainfo[field]]) + float(row[baseballdatainfo[field]])
                       
    
    return aggregate_fields

# playerid = input("Enter playerID: ")
# fields = (input("Enter the fields out of the given fields to be summed up, separated by space:atbats, hits, doubles, triples, homeruns, walks"))
# fields = fields.split()
print(aggregate_by_player_id(statistics, playerid, fields))




{'strawda01': {'AB': 5418.0, 'H': 1401.0, '2B': 256.0, '3B': 38.0, 'HR': 335.0, 'BB': 816.0}}


# 6th function

In [34]:
def compute_top_stats_career(info, formula, numplayers):
    """
    Inputs:
      info        - Baseball data information dictionary
      formula     - function that takes an info dictionary and a
                    batting statistics dictionary as input and
                    computes a compound statistic
      numplayers  - Number of top players to return
    """
    # player_ids = []
    # for row in statistics:
    #     player_ids.append(row['playerID'])
    # player_ids = list(set(player_ids))
    # fields = ["atbats","hits", "doubles", "triples", "homeruns","walks"]
    unique_statistics = statistics
    player_ids = []
    for row in statistics:
        player_ids.append(row['playerID'])
    player_ids = list(set(player_ids)) 
    # the set function eliminates the repititive values and creates unique values set (18915 unique IDs)

    fields = ['atbats','hits', 'doubles', 'triples', 'homeruns','walks']
    
    career_high_score = {}
    aggregate_fields = {} # {playerid:{'AB':0, 'H':0, '2B':0, '3B':0, 'HR':0, 'BB':0}}

    for playerid in player_ids:
        temp = aggregate_by_player_id(statistics, playerid, fields)
        aggregate_fields[playerid] = temp[playerid]
            for battle_stats in statistics:
                if formula == slugging_percentage:
                    career_high_score[playerid] =  slugging_percentage(baseballdatainfo, battle_stats)
                elif formula == onbase_percentage:
                    career_high_score[playerid] = onbase_percentage(baseballdatainfo, battle_stats)
                else:
                    career_high_score[playerid] = batting_average(baseballdatainfo, battle_stats)

    list_career_high = [list(x,y) for x,y in career_high_score.items]
    list_career_high.sort(key = lambda pair:pair[1])
    print(list_career_high[0:numplayers])
    
    # aggregate_fields = {} # {playerid:{'AB':0, 'H':0, '2B':0, '3B':0, 'HR':0, 'BB':0}}
    # for playerid in player_ids:
    #     for row in statistics:
    #         if row['playerID'] == playerid:
    #             aggregate_fields[playerid] = {'AB':0, 'H':0, '2B':0, '3B':0, 'HR':0, 'BB':0}
    #             for field in fields:
    #                    aggregate_fields[playerid][baseballdatainfo[field]] = float(aggregate_fields[playerid][baseballdatainfo[field]]) + float(row[baseballdatainfo[field]])

#     agg_player_stat = {}
#     for player in player_ids:
#             agg_player_stat[player] = (aggregate_by_player_id(statistics, playerid, fields)).values()
    
    
    return list_career_high

print(compute_top_stats_career(baseballdatainfo, slugging_percentage, 20))

KeyboardInterrupt: 

In [41]:
unique_statistics = statistics
player_ids = []
for row in statistics:
    player_ids.append(row['playerID'])
player_ids = list(set(player_ids))
print(len(player_ids))

# fields = ['atbats','hits', 'doubles', 'triples', 'homeruns','walks']
# aggregate_fields = {} #{playerid:{'AB':0, 'H':0, '2B':0, '3B':0, 'HR':0, 'BB':0}}


# for playerid in player_ids:
#     for row in statistics:
#         if row['playerID'] == playerid:
#             aggregate_fields[playerid] = {'AB':0, 'H':0, '2B':0, '3B':0, 'HR':0, 'BB':0}
#             for field in fields:
#                    aggregate_fields[playerid][baseballdatainfo[field]] = float(aggregate_fields[playerid][baseballdatainfo[field]]) + float(row[baseballdatainfo[field]])

# print(len(aggregate_fields))
        
        
    

18915


In [27]:
print(aggregate_fields)

{'breedda01': {'AB': 65.0, 'H': 10.0, '2B': 1.0, '3B': 0.0, 'HR': 0.0, 'BB': 9.0}, 'druhoca01': {'AB': 0.0, 'H': 0.0, '2B': 0.0, '3B': 0.0, 'HR': 0.0, 'BB': 0.0}, 'fryerer01': {'AB': 78.0, 'H': 17.0, '2B': 2.0, '3B': 1.0, 'HR': 0.0, 'BB': 10.0}, 'boyleja02': {'AB': 25.0, 'H': 7.0, '2B': 1.0, '3B': 0.0, 'HR': 0.0, 'BB': 1.0}, 'jerzemi01': {'AB': 0.0, 'H': 0.0, '2B': 0.0, '3B': 0.0, 'HR': 0.0, 'BB': 0.0}, 'creelja01': {'AB': 26.0, 'H': 2.0, '2B': 1.0, '3B': 0.0, 'HR': 0.0, 'BB': 1.0}, 'cruzto03': {'AB': 4.0, 'H': 0.0, '2B': 0.0, '3B': 0.0, 'HR': 0.0, 'BB': 0.0}, 'stemfr01': {'AB': 245.0, 'H': 51.0, '2B': 2.0, '3B': 3.0, 'HR': 0.0, 'BB': 12.0}, 'kinnewa01': {'AB': 6.0, 'H': 1.0, '2B': 0.0, '3B': 0.0, 'HR': 1.0, 'BB': 0.0}, 'hearnji01': {'AB': 2.0, 'H': 0.0, '2B': 0.0, '3B': 0.0, 'HR': 0.0, 'BB': 1.0}, 'bergowi01': {'AB': 38.0, 'H': 5.0, '2B': 0.0, '3B': 0.0, 'HR': 0.0, 'BB': 0.0}, 'karkoro01': {'AB': 138.0, 'H': 25.0, '2B': 3.0, '3B': 0.0, 'HR': 6.0, 'BB': 11.0}, 'lyonsba01': {'AB': 64.0,

# Testing

In [None]:
import pandas as pd 
import numpy as np
df = pd.read_csv("Batting_2016.csv")
# cols = [6,15]
# df = df[df.columns[cols]]
# print(df)
ID = np.matrix(df.iloc[:,0:2])
print(ID.shape)
c=0
for col in range(len(ID)):
    if ID[2,0]==ID[col,0]: # 
        c+=1
        print(ID[col])
    if(col>100000):
        break

print(c)




In [43]:
a = ['1','s','a2']
x,y,z,d = 10,20,30,40
a.append([x,y,z,d])
print(a)

['1', 's', 'a2', [10, 20, 30, 40]]
