In [6]:
# you can run the whole thing and not worry about deleting anything accidentally
# changes will only happen in the working_csv_data if the csv file doesn't already exist.
# To modify the filtering, find the right function, change it, delete the existing file and rerun the whole thing.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import read_table
import os

In [7]:
latest_acceptable_year = 1985

# Basic use case of this function is to simply pass the table_name argument and it will transfer the table, doing nothing to change it
def transfer_table_to_working_data(table_name, func=None, columns_to_keep=None):
    # you can pass a function that filters out rows based on some conditions
    # if columns_to_keep is None, all columns are KEPT
    new_file_path = "working_csv_data/" + table_name + ".csv"
    if not os.path.exists(new_file_path):
        
        if func == None: 
            df = read_table(table_name, selected_columns=columns_to_keep, read_from_archive = True)
        else:
            df = func(read_table(table_name, read_from_archive = True))
            if columns_to_keep != None: df = df[columns_to_keep]
                
        df.to_csv(new_file_path, index=False)
        

In [8]:
def filter_Master_table(df):
    # remove the few (220) players for which the bbrefID is not the same for some reason, not sure what it does anyway
    df["is_equal"] = df.apply(lambda row: row["playerID"] == row["bbrefID"], axis=1)
    df = df[(df["is_equal"])]

    # remove all players who didn't play after the "latest_acceptable_year"
    df["finalGame"] =  df["finalGame"].fillna("2015-01-01")
    df["is_after_latest_year"] = df.apply(lambda row: int(row["finalGame"][:4]) >= latest_acceptable_year, axis=1)
    df = df[(df["is_after_latest_year"])]
    
    return df

transfer_table_to_working_data("Master", func=filter_Master_table, columns_to_keep = ["playerID", "birthYear", "nameFirst", "nameLast", "weight", "height", "bats", "throws", "debut"])

In [9]:
def filter_Salaries_table(df):
    df["is_recent_enough"] = df.apply(lambda row: int(row["yearID"]) >= latest_acceptable_year, axis=1)
    df = df[df["is_recent_enough"]]
    df = df.drop(columns=["is_recent_enough"])
    return df
    
transfer_table_to_working_data("Salaries", func=filter_Salaries_table)

In [10]:
def filter_Batting_table(df):
    
    # removing stats from old ass years
    df["is_recent_enough"] = df.apply(lambda row: int(row["yearID"]) >= latest_acceptable_year, axis=1)
    df = df[df["is_recent_enough"]]
    

    # removing the players who have not gone to home base enough
    at_base_agglo_data = df[["playerID", "AB"]].groupby(['playerID']).sum()
    at_base_agglo_data["keep"] = at_base_agglo_data["AB"].apply(lambda x: np.log10(x + 1.) > 3) # cutoff can be changed up to have a bigger proportion of "elite" players in the dataset
    playerID_to_keep = at_base_agglo_data.index[at_base_agglo_data['keep']].tolist()
    df["has_played_enough"] = df.apply(lambda row: row["playerID"] in playerID_to_keep, axis=1) 
    df = df[df["has_played_enough"]]
    
    
    # we can do this because only the float columns contain nans, i checked
    df = df.fillna(0.0)
    
    # clean up
    df = df.drop(columns=["is_recent_enough"])
    df = df.drop(columns=["has_played_enough"])
    df = df.sort_values(by=['playerID', 'yearID'])

    return df
    
transfer_table_to_working_data("Batting", func=filter_Batting_table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["has_played_enough"] = df.apply(lambda row: row["playerID"] in playerID_to_keep, axis=1)
