# In [6]:
"""
Project for Week 4 of "Python Data Analysis".
Processing CSV files with baseball statistics.

Be sure to read the project description page for further information
about the expected behavior of the program.
"""

import csv



    #
    # Dictionary containing information needed to access baseball statistics.
    # This information is all tied to the format and contents of the CSV files.
    #
baseballdatainfo = {"masterfile": "Master_2016.csv",   # Name of Master CSV file
                    "battingfile": "Batting_2016.csv", # Name of Batting CSV file
                    "separator": ",",                  # Separator character in CSV files
                    "quote": '"',                      # Quote character in CSV files
                    "playerid": "playerID",            # Player ID field name
                    "firstname": "nameFirst",          # First name field name
                    "lastname": "nameLast",            # Last name field name
                    "yearid": "yearID",                # Year field name
                    "atbats": "AB",                    # At bats field name
                    "hits": "H",                       # Hits field name
                    "doubles": "2B",                   # Doubles field name
                    "triples": "3B",                   # Triples field name
                    "homeruns": "HR",                  # Home runs field name
                    "walks": "BB",                     # Walks field name
                    "battingfields": ["AB", "H", "2B", "3B", "HR", "BB"]}  # The six batting stat field names above

# Every row of the batting CSV, loaded once when the module runs, as a list
# of dictionaries (one per row, mapping field name -> string value).
# The separator and quote characters come from baseballdatainfo so that this
# load agrees with the CSV format declared there.
with open(baseballdatainfo["battingfile"], newline='') as csvfile:
    statistics = list(csv.DictReader(csvfile,
                                     delimiter=baseballdatainfo["separator"],
                                     quotechar=baseballdatainfo["quote"]))
##
## Provided code from Week 3 Project
##

def read_csv_as_list_dict(filename, separator, quote):
    """
    Load a CSV file as a list of row dictionaries.

    Inputs:
      filename  - name of CSV file
      separator - character that separates fields
      quote     - character used to optionally quote fields
    Output:
      Returns a list with one dictionary per data row of the CSV file,
      in file order.  Each dictionary maps the field names (taken from
      the header row) to that row's field values.
    """
    with open(filename, newline='') as handle:
        reader = csv.DictReader(handle, delimiter=separator, quotechar=quote)
        # Materialise the rows while the file is still open.
        return list(reader)


def read_csv_as_nested_dict(filename, keyfield, separator, quote):
    """
    Load a CSV file as a dictionary of row dictionaries.

    Inputs:
      filename  - name of CSV file
      keyfield  - field to use as key for rows
      separator - character that separates fields
      quote     - character used to optionally quote fields
    Output:
      Returns a dictionary whose keys are the values found in the
      keyfield column and whose values are the corresponding rows,
      each row being a dictionary mapping field names to field values.
      When several rows share a key, the last one read is kept.
    """
    with open(filename, newline='') as handle:
        reader = csv.DictReader(handle, delimiter=separator, quotechar=quote)
        return {record[keyfield]: record for record in reader}

##
## Provided formulas for common batting statistics
##

# Typical cutoff used for official statistics.  The three formulas below
# return 0 for any statistics dictionary with fewer at bats than this.
MINIMUM_AB = 500

def batting_average(info, batting_stats):
    """
    Compute hits per at bat.

    Inputs:
      info          - Baseball data information dictionary (supplies the
                      field names for hits and at bats)
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the batting average as a float, or 0 when the number of
      at bats is below MINIMUM_AB.
    """
    num_hits = float(batting_stats[info["hits"]])
    num_at_bats = float(batting_stats[info["atbats"]])
    return num_hits / num_at_bats if num_at_bats >= MINIMUM_AB else 0

def onbase_percentage(info, batting_stats):
    """
    Compute (hits + walks) / (at bats + walks).

    Inputs:
      info          - Baseball data information dictionary (supplies the
                      field names for hits, at bats and walks)
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the on-base percentage as a float, or 0 when the number of
      at bats is below MINIMUM_AB.
    """
    num_hits = float(batting_stats[info["hits"]])
    num_at_bats = float(batting_stats[info["atbats"]])
    num_walks = float(batting_stats[info["walks"]])
    times_on_base = num_hits + num_walks
    opportunities = num_at_bats + num_walks
    return times_on_base / opportunities if num_at_bats >= MINIMUM_AB else 0

def slugging_percentage(info, batting_stats):
    """
    Compute total bases per at bat.

    Inputs:
      info          - Baseball data information dictionary (supplies the
                      field names for hits, doubles, triples, home runs
                      and at bats)
      batting_stats - dictionary of batting statistics (values are strings)
    Output:
      Returns the slugging percentage as a float, or 0 when the number of
      at bats is below MINIMUM_AB.
    """
    num_hits = float(batting_stats[info["hits"]])
    num_doubles = float(batting_stats[info["doubles"]])
    num_triples = float(batting_stats[info["triples"]])
    num_home_runs = float(batting_stats[info["homeruns"]])
    # Singles are not recorded directly; they are the hits left over
    # after removing the extra-base hits.
    num_singles = num_hits - num_doubles - num_triples - num_home_runs
    num_at_bats = float(batting_stats[info["atbats"]])
    total_bases = num_singles + 2 * num_doubles + 3 * num_triples + 4 * num_home_runs
    return total_bases / num_at_bats if num_at_bats >= MINIMUM_AB else 0


##
## Part 1: Functions to compute top batting statistics by year
##

def filter_by_year(statistics, year, yearid):
    """
    Keep only the batting rows that belong to one season.

    Inputs:
      statistics - List of batting statistics dictionaries
      year       - Year to filter by
      yearid     - Year ID field in statistics
    Outputs:
      Returns a list of batting statistics dictionaries that
      are from the input year, in their original order.
    """
    # The year is stored as a string in each row, so convert before comparing.
    return [entry for entry in statistics if year == int(entry[yearid])]

# Smoke check: filter the loaded batting rows down to the 1935 season.  The
# result is discarded (author's note: 513 rows matched for 1935 - unverified).
filter_by_year(statistics, 1935, "yearID")

def top_player_ids(info, statistics, formula, numplayers):
    """
    Rank players by a computed statistic and return the best ones.

    Inputs:
      info       - Baseball data information dictionary
      statistics - List of batting statistics dictionaries
      formula    - function that takes an info dictionary and a
                   batting statistics dictionary as input and
                   computes a compound statistic
      numplayers - Number of top players to return
    Outputs:
      Returns a list of tuples, player ID and compound statistic
      computed by formula, of the top numplayers players sorted in
      decreasing order of the computed statistic.  If statistics holds
      fewer than numplayers rows, every row is returned.  Rows with equal
      statistics keep their relative order from the input.
    """
    playerid_field = info['playerid']

    # Score every row with the caller-supplied formula.
    scored = [(row[playerid_field], formula(info, row)) for row in statistics]

    # Highest statistic first; the sort is stable, so ties keep input order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Slicing (rather than indexing) tolerates numplayers > len(scored);
    # max() keeps a negative numplayers from being read as "all but the last".
    return scored[:max(numplayers, 0)]

# Rank all loaded batting rows (every year), passing onbase_percentage as the
# formula and asking for 10 entries.  The result is reused by the
# lookup_player_names call further down.
top_ids_and_stats = top_player_ids(baseballdatainfo, statistics, onbase_percentage,  10)


def lookup_player_names(info, top_ids_and_stats):
    """
    Turn (player ID, statistic) pairs into printable strings.

    Inputs:
      info              - Baseball data information dictionary
      top_ids_and_stats - list of tuples containing player IDs and
                          computed statistics
    Outputs:
      List of strings of the form "x.xxx --- FirstName LastName",
      where "x.xxx" is a string conversion of the float stat in
      the input and "FirstName LastName" is the name of the player
      corresponding to the player ID in the input.  The strings are in
      the same order as top_ids_and_stats.  IDs that do not appear in
      the master file are skipped.
    """
    playerid_field = info['playerid']
    wanted_ids = {player_id for player_id, _ in top_ids_and_stats}

    # One pass over the master file (name, separator and quote all come
    # from info), keeping only the names that were asked for.
    names = {}
    with open(info['masterfile'], newline='') as master_file:
        master_reader = csv.DictReader(master_file,
                                       delimiter=info['separator'],
                                       quotechar=info['quote'])
        for row in master_reader:
            player_id = row[playerid_field]
            if player_id in wanted_ids:
                names[player_id] = row[info['firstname']] + " " + row[info['lastname']]

    # Build the output in input order so a ranking stays a ranking.
    return [f'{stat:.3f} --- {names[player_id]}'
            for player_id, stat in top_ids_and_stats
            if player_id in names]


# Resolve the IDs computed above into display strings.  The return value is
# not stored.
lookup_player_names(baseballdatainfo, top_ids_and_stats)


def compute_top_stats_year(info, formula, numplayers, year):
    """
    Rank the players of one season by a computed statistic.

    Inputs:
      info        - Baseball data information dictionary
      formula     - function that takes an info dictionary and a
                    batting statistics dictionary as input and
                    computes a compound statistic
      numplayers  - Number of top players to return
      year        - Year to filter by
    Outputs:
      Returns a list of strings for the top numplayers in the given year
      according to the given formula, best first.  Each string has the
      form "x.xxx --- FirstName LastName".  Player IDs that are missing
      from the master file are skipped.
    """
    playerid_field = info['playerid']

    # Read the batting file named in info rather than relying on a
    # module-level table, then narrow it to the requested season.
    batting = read_csv_as_list_dict(info['battingfile'],
                                    info['separator'], info['quote'])
    season = filter_by_year(batting, year, info['yearid'])

    # Score each row with the caller's formula and keep the best numplayers.
    # sorted() is stable, so tied rows keep their file order.
    scored = sorted(((row[playerid_field], formula(info, row)) for row in season),
                    key=lambda pair: pair[1], reverse=True)
    best = scored[:max(numplayers, 0)]

    # Load the master file once, keyed by player ID, instead of rescanning
    # it for every batting row.
    master = read_csv_as_nested_dict(info['masterfile'], playerid_field,
                                     info['separator'], info['quote'])

    top_players = []
    for player_id, stat in best:
        person = master.get(player_id)
        if person is None:
            continue
        top_players.append(f'{stat:.3f}' + " --- "
                           + person[info['firstname']] + " " + person[info['lastname']])
    return top_players

# xyz = compute_top_stats_year(baseballdatainfo, slugging_percentage, 20, 1936)


##
## Part 2: Functions to compute top batting statistics by career
##

def aggregate_by_player_id(statistics, playerid, fields):
    """
    Sum the requested batting fields over all rows of each player.

    Inputs:
      statistics - List of batting statistics dictionaries
      playerid   - Player ID field name
      fields     - List of fields to aggregate
    Output:
      Returns a nested dictionary whose keys are player IDs and whose values
      are dictionaries of aggregated stats.  Only the fields from the fields
      input will be aggregated in the aggregated stats dictionaries; each
      inner dictionary additionally maps the playerid field to the player's
      ID.  Aggregated values are ints.
    """
    totals = {}
    for row in statistics:
        player = row[playerid]
        career = totals.get(player)
        if career is None:
            # First row seen for this player: start every total at zero.
            career = {playerid: player}
            for field in fields:
                career[field] = 0
            totals[player] = career
        for field in fields:
            # CSV values arrive as strings.
            career[field] += int(row[field])
    return totals


def compute_top_stats_career(info, formula, numplayers):
    """
    Rank players by a statistic computed over their whole career.

    Inputs:
      info        - Baseball data information dictionary
      formula     - function that takes an info dictionary and a
                    batting statistics dictionary as input and
                    computes a compound statistic
      numplayers  - Number of top players to return
    Outputs:
      Returns a list of strings for the top numplayers according to the
      given formula applied to each player's aggregated statistics, best
      first.  Each string has the form "x.xxx --- FirstName LastName".
      Player IDs that are missing from the master file are skipped.
    """
    playerid_field = info['playerid']

    batting = read_csv_as_list_dict(info['battingfile'],
                                    info['separator'], info['quote'])

    # A single pass over the batting rows builds every player's totals.
    careers = aggregate_by_player_id(batting, playerid_field, info['battingfields'])

    # Apply the caller's formula to the aggregated totals (this also works
    # for lambdas), then keep the best numplayers, highest first.
    scored = sorted(((player_id, formula(info, totals))
                     for player_id, totals in careers.items()),
                    key=lambda pair: pair[1], reverse=True)
    best = scored[:max(numplayers, 0)]

    master = read_csv_as_nested_dict(info['masterfile'], playerid_field,
                                     info['separator'], info['quote'])

    top_players = []
    for player_id, stat in best:
        person = master.get(player_id)
        if person is None:
            continue
        top_players.append(f'{stat:.3f}' + " --- "
                           + person[info['firstname']] + " " + person[info['lastname']])
    return top_players

# Runs the career ranking with slugging_percentage for 20 players as soon as
# the module is executed, and prints the returned value.
print(compute_top_stats_career(baseballdatainfo, slugging_percentage, 20))


##
## Provided testing code
##

def test_baseball_statistics():
    """
    Simple testing code: prints several sample rankings, each as a
    heading, one line per player, and a blank line.
    """
    # (heading, formula, number of players, year) for each yearly report.
    yearly_reports = (
        ("Top 5 batting averages in 1923", batting_average, 5, 1923),
        ("Top 10 batting averages in 2010", batting_average, 10, 2010),
        ("Top 10 on-base percentage in 2010", onbase_percentage, 10, 2010),
        ("Top 10 slugging percentage in 2010", slugging_percentage, 10, 2010),
        # You can also use lambdas for the formula.
        # This one computes onbase plus slugging percentage.
        ("Top 10 OPS in 2010",
         lambda info, stats: (onbase_percentage(info, stats) +
                              slugging_percentage(info, stats)),
         10, 2010),
    )

    for heading, formula, count, season in yearly_reports:
        print(heading)
        for line in compute_top_stats_year(baseballdatainfo, formula, count, season):
            print(line)
        print("")

    print("Top 20 career batting averages")
    for line in compute_top_stats_career(baseballdatainfo, batting_average, 20):
        print(line)
    print("")


# Make sure the following call to test_baseball_statistics is
# commented out when submitting to OwlTest/CourseraTest.
# As written it runs unconditionally whenever this module is executed.

test_baseball_statistics()

# KeyboardInterrupt: