In [1]:
# Import Dependencies
from pathlib import Path

import pandas as pd

In [2]:
# Set File Paths
# Forward slashes are used instead of backslashes: sequences such as "\H" or
# "\B" inside a normal string literal are invalid escape sequences (newer
# Python versions warn about them), and a backslash is only a path separator
# on Windows. "Data/..." resolves on Windows, macOS and Linux alike and
# matches the "Output/..." style used when the results are written.
filepath1 = "Data/HallOfFame.csv"

filepath2 = "Data/Batting.csv"

filepath3 = "Data/BattingPost.csv"

filepath4 = "Data/People.csv"

filepath5 = "Data/Fielding.csv"

In [3]:
# Load every CSV into its own DataFrame, in the same order the paths were
# declared (Hall of Fame, batting, postseason batting, people, fielding).
hof_df, batting_stats, postseason_batting_stats, people_df, fielding_df = (
    pd.read_csv(csv_path)
    for csv_path in (filepath1, filepath2, filepath3, filepath4, filepath5)
)

In [4]:
# Keep only the Hall of Fame rows whose "inducted" flag is "Y"
hof_df = hof_df[hof_df["inducted"].eq("Y")]

In [5]:
# Keep only the Hall of Fame rows whose "category" is "Player"
hof_df = hof_df[hof_df["category"].eq("Player")]

In [6]:
# Single-column DataFrame holding the playerID of every remaining Hall of Fame row
hof_player_id = hof_df["playerID"].to_frame()

In [7]:
# Inner-join the Hall of Fame playerIDs onto the regular-season and postseason
# batting tables. Only rows whose playerID appears on both sides survive, so
# players outside hof_player_id and players with no batting rows drop out.
hof_batting_stats = hof_player_id.merge(batting_stats, how="inner", on="playerID")

hof_postseason_batting_stats = hof_player_id.merge(
    postseason_batting_stats, how="inner", on="playerID"
)

In [8]:
# Restrict both batting frames to the same ordered set of columns;
# every column not listed here is dropped.
batting_columns = [
    "playerID", "yearID",
    "G", "AB", "R", "H", "2B", "3B", "HR", "RBI",
    "BB", "IBB", "HBP", "SH",
]

hof_batting_stats = hof_batting_stats[batting_columns]

hof_postseason_batting_stats = hof_postseason_batting_stats[batting_columns]

In [9]:
# Keep only the ID and name columns from the people table.
# Note: no row filter is applied here, so this frame still covers every
# person in people_df; it is narrowed when it is merged with other frames.
hof_player_names = people_df.loc[:, ["playerID", "nameFirst", "nameLast"]]

In [10]:
# Attach nameFirst/nameLast to the regular-season and postseason batting
# frames with an inner join on playerID (name columns come first in the result).
hof_batting_stats_name = hof_player_names.merge(
    hof_batting_stats, how="inner", on="playerID"
)

hof_postseason_batting_stats_name = hof_player_names.merge(
    hof_postseason_batting_stats, how="inner", on="playerID"
)

In [11]:
# Combine nameFirst and nameLast Columns into a Single Name Column
def _full_name(frame):
    """Build the "<nameFirst> <nameLast>" value for every row of `frame`.

    Missing name parts are replaced with an empty string before joining.
    Casting a missing value straight to str would otherwise put the literal
    text "nan" into the name (e.g. "nan Smith"). Leading/trailing whitespace
    is stripped so a missing part does not leave a dangling space.

    Parameters:
        frame: DataFrame containing "nameFirst" and "nameLast" columns.

    Returns:
        A string Series aligned with `frame`'s index.
    """
    first = frame["nameFirst"].fillna("").astype(str)
    last = frame["nameLast"].fillna("").astype(str)
    return (first + " " + last).str.strip()


# DataFrame.assign accepts a callable, which it invokes with the frame itself,
# so the same helper serves both the regular-season and postseason frames.
hof_batting_stats_name = hof_batting_stats_name.assign(Name=_full_name)

hof_postseason_batting_stats_name = hof_postseason_batting_stats_name.assign(Name=_full_name)

In [12]:
# Final column layout: Name sits right after playerID, and the separate
# nameFirst / nameLast columns are dropped by not being listed.
named_batting_columns = [
    "playerID", "Name", "yearID",
    "G", "AB", "R", "H", "2B", "3B", "HR", "RBI",
    "BB", "IBB", "HBP", "SH",
]

hof_batting_stats_name = hof_batting_stats_name[named_batting_columns]

hof_postseason_batting_stats_name = hof_postseason_batting_stats_name[named_batting_columns]

In [13]:
# Substitute 0 for every missing value, in every column of both frames
hof_batting_stats_name = hof_batting_stats_name.fillna(value=0)

hof_postseason_batting_stats_name = hof_postseason_batting_stats_name.fillna(value=0)

In [16]:
# Keep only seasons with yearID of 1950 or later.
# (Despite the "sorted_" prefix, no sorting happens in this cell; this is a
# row filter only.)
sorted_hof_rs_stats = hof_batting_stats_name[
    hof_batting_stats_name["yearID"].ge(1950)
]

sorted_hof_ps_stats = hof_postseason_batting_stats_name[
    hof_postseason_batting_stats_name["yearID"].ge(1950)
]

In [17]:
# Reduce the fielding table to the player ID and position columns only
fielding_df = fielding_df.loc[:, ["playerID", "POS"]]

In [19]:
# Keep All Position Players and Remove Pitchers From Position Players DataFrame
# A single isin() membership test replaces the chain of six == / | comparisons;
# it selects exactly the same rows and is easier to read and extend.
non_pitcher_positions = ["C", "1B", "2B", "SS", "3B", "OF"]

# NOTE(review): the filter works per fielding record, so a player is retained
# if they have at least one record at any of these positions, even if other
# records for the same player carry a different POS value. Confirm that this
# is the intended definition of "position player".
position_players = fielding_df.loc[fielding_df["POS"].isin(non_pitcher_positions)]

In [20]:
# One row per distinct playerID found in position_players, in order of first
# appearance, with a fresh 0..n-1 index.
position_player_id = (
    position_players[["playerID"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [21]:
# Inner-join the year-filtered regular-season and postseason stats with the
# position-player IDs; rows whose playerID is absent from position_player_id
# are discarded from the results.
sorted_hof_pp_rs_stats = sorted_hof_rs_stats.merge(
    position_player_id, how="inner", on="playerID"
)

sorted_hof_pp_ps_stats = sorted_hof_ps_stats.merge(
    position_player_id, how="inner", on="playerID"
)

In [26]:
# Order both frames chronologically: ascending yearID, earliest season first
sorted_hof_pp_rs_stats = sorted_hof_pp_rs_stats.sort_values(by="yearID", ascending=True)

sorted_hof_pp_ps_stats = sorted_hof_pp_ps_stats.sort_values(by="yearID", ascending=True)

In [27]:
# Save Final DataFrames as CSV Files in Output Directory
# to_csv does not create missing parent directories, so make sure "Output"
# exists first; otherwise a fresh checkout without that folder fails here
# after all the processing has already run.
output_dir = Path("Output")
output_dir.mkdir(parents=True, exist_ok=True)

# The index is omitted and the header row is written for both files.
sorted_hof_pp_rs_stats.to_csv(output_dir / "sorted_hof_pp_rs_stats.csv", index=False, header=True)

sorted_hof_pp_ps_stats.to_csv(output_dir / "sorted_hof_pp_ps_stats.csv", index=False, header=True)