In [1]:
from sleeper_wrapper import Stats, Players
import pandas as pd
import numpy as np


class StatsDF:
    """
    Class to manage player stats for a specific season.
    """
    def __init__(self, season_type: str = None, season_year: int = None):
        self.season_type = season_type
        self.season_year = season_year
        self.stats = Stats()
        self.players = Players()
        self.players_df = None  # This will store the DataFrame with players
        self.stats_df = None  # This will store the DataFrame with stats
        self.all_stats = None  # This will store the merged DataFrame

        self.selected_columns = [
            'search_full_name','player_id', 'team', 'fantasy_positions', 'years_exp',
            'active','age','height','weight','depth_chart_order'
        ]

        if season_year is not None and season_type is not None:
            self.refresh_stats()

    def refresh_stats(self):
        """
        Get stats for the specified season type and year
        and assign the DataFrame with players.
        """
        self.stats_df = pd.DataFrame(self.stats.get_all_stats(self.season_type, self.season_year)).T
        self.stats_df.index.name = "player_id"
        self.players_df = self.get_players_df()
        self.all_stats = self.merge_players_df()
        self.all_stats = self.make_column_first('search_full_name')
        self.convert_height_to_cm()  # Call the height conversion function here
        self.convert_to_nan() # call the method to convert '' to nan

    def convert_to_nan(self):
        """
        Replace all empty strings with NaN in the entire DataFrame.
        """
        self.all_stats.replace('', np.nan, inplace=True)

    def get_players_df(self) -> pd.DataFrame:
        """
        Retrieve all players and filter selected columns.
        """
        players_df = pd.DataFrame(self.players.get_all_players()).T
        return players_df[self.selected_columns]

    def merge_players_df(self) -> pd.DataFrame:
        """
        Merge player stats and details.
        """
        return pd.merge(self.stats_df, self.players_df, how='outer', on='player_id')

    def get_stats_df(self) -> pd.DataFrame:
        """
        Return the DataFrame with stats.
        """
        return self.stats_df

    def display_stats_df(self) -> None:
        """
        Display the first few rows of the DataFrame.
        """
        print(self.all_stats.head())

    def make_column_first(self, col_name: str) -> pd.DataFrame:
        """
        Move specified column to the first position and sort the DataFrame by column.
        """
        if col_name in self.all_stats.columns:
            col_to_move = self.all_stats.pop(col_name)
            self.all_stats.insert(0, col_name, col_to_move)
            self.all_stats.sort_values(by=col_name, axis=0, inplace=True)
        else:
            print(f'Column "{col_name}" does not exist in the DataFrame.')
        return self.all_stats
    
    def get_positions_df(self, player_position: str) -> pd.DataFrame:
        """
        Return the DataFrame with player positions.
        """
        return self.all_stats[self.all_stats['fantasy_positions'].apply(lambda x: player_position in x if hasattr(x, '__iter__') else False)]
        
    def drop_empty_columns(self) -> pd.DataFrame:
        """
        Drop empty columns from the DataFrame.
        """
        self.all_stats = self.all_stats.dropna(axis=1, how='all')
        return self.all_stats
        

    def parse_height(self, height) -> float:
        """
        Parse height string in the format 'X\'Y"' (e.g. '6\'1"') and convert to centimeters.
        If the height is already a number (presumably in centimeters), just return that number.
        If the height is not a string (e.g., it's a float or NaN), return None.
        """
        if isinstance(height, str):
            if '\'' in height and '"' in height:
                parts = height.split('\'')
                if len(parts) == 2:  # Make sure parts contains two elements
                    feet = int(parts[0])
                    inches_part = parts[1]
                    inches = int(inches_part.replace('"', '')) if inches_part.replace('"', '').isdigit() else 0
                    return self.feet_to_inches(feet, inches)
            elif height.isdigit():
                return float(height)
        return None

    def export_to_csv(self):
        df = self.get_stats_df()
        df.to_csv('stats.csv')

    @staticmethod
    def feet_to_inches(feet, inches):
        total_inches = feet * 12 + inches
        return total_inches

    def convert_height_to_cm(self):
        """
        Convert height column from feet and inches to centimeters.
        """
        self.all_stats['height'] = self.all_stats['height'].apply(self.parse_height)




In [2]:
def preprocess_position(df, position: str) -> pd.DataFrame:
    """
    Preprocess the DataFrame for a specific position.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    position (str): The position for which to preprocess the DataFrame.

    Returns:
    pd.DataFrame: The preprocessed DataFrame for the specified position.
    """
    df = df.get_positions_df(position)
    df = df.dropna(axis=1, how='all')
    df = df.dropna(subset=['pos_rank_ppr'])
    return df


In [3]:
df_stats_22 = StatsDF("regular",2022)
df_stats_21 = StatsDF("regular",2021)
df_stats_20 = StatsDF("regular",2020)
df_stats_19 = StatsDF("regular",2019)
df_stats_18 = StatsDF("regular",2018)
df_stats_17 = StatsDF("regular",2017)
df_stats_16 = StatsDF("regular",2016)
df_stats_15 = StatsDF("regular",2015)
df_stats_14 = StatsDF("regular",2014)

In [8]:
df_stats_22.refresh_stats()
df_stats_21.refresh_stats()
df_stats_20.refresh_stats()
df_stats_19.refresh_stats()
df_stats_18.refresh_stats()
df_stats_17.refresh_stats()
df_stats_16.refresh_stats()
df_stats_15.refresh_stats()
df_stats_14.refresh_stats()


<__main__.StatsDF at 0x1e545b15db0>

In [10]:


# Assuming df_stats_22, df_stats_21, ..., df_stats_14 are your dataframes
all_dataframes = [df_stats_22.all_stats, df_stats_21.all_stats, df_stats_20.all_stats,
                  df_stats_19.all_stats, df_stats_18.all_stats, df_stats_17.all_stats,
                  df_stats_16.all_stats, df_stats_15.all_stats, df_stats_14.all_stats]

all_dataframes_wo_22= [df_stats_21.all_stats, df_stats_20.all_stats,
                  df_stats_19.all_stats, df_stats_18.all_stats, df_stats_17.all_stats,
                  df_stats_16.all_stats, df_stats_15.all_stats, df_stats_14.all_stats]

all_dataframes_combined = pd.concat(all_dataframes, ignore_index=True)

all_dataframes_combined_wo_22 = pd.concat(all_dataframes_wo_22, ignore_index=True)

# Now you have all the data from different years in one combined dataframe


In [13]:
exploded_pos = all_dataframes_combined['fantasy_positions'].explode()
unique_pos = exploded_pos.unique()
print(unique_pos)

['DB' 'LB' 'DL' 'QB' 'OG' 'OL' 'LS' 'WR' 'RB' 'TE' 'OT' None 'K' 'P' 'LEO'
 'DEF' nan]


In [17]:
def get_positions_df(dataframe, player_position: str) -> pd.DataFrame:
    """
    Return the DataFrame with player positions.
    """
    return dataframe[dataframe['fantasy_positions'].apply(lambda x: player_position in x if hasattr(x, '__iter__') else False)]


def drop_empty_columns(dataframe):

    return dataframe.dropna(axis=1, how = 'all')

In [30]:
all_qb_df = get_positions_df(all_dataframes_combined_wo_22, "QB")


In [31]:
##all_rb_df = get_positions_df(all_dataframes_combined, "RB")
#all_wr_df = get_positions_df(all_dataframes_combined, "WR")
#all_K_df = get_positions_df(all_dataframes_combined, "K")
#all_def_df = get_positions_df(all_dataframes_combined, "DEF")
#all_te_df = get_positions_df(all_dataframes_combined, "TE")
drop_empty_columns(all_qb_df)

Unnamed: 0,search_full_name,player_id,pass_inc,rec_yd,pts_ppr,rec_tgt,pr_yd,rec_td_lng,pass_yd,pr_lng,...,punt_blkd,team,fantasy_positions,years_exp,active,age,height,weight,depth_chart_order,snp
2,aaronbailey,4683,,,,,,,,,...,,BAL,[QB],1,True,,73.0,230,4.0,
26,aaronmurray,2019,,,,,,,,,...,,,[QB],6,False,29,73.0,210,,
33,aaronrodgers,96,165.0,-4.0,334.30,1.0,,,4115.0,,...,,NYJ,[QB],18,True,39,74.0,223,1.0,
101,adrianmartinez,11065,,,,,,,,,...,,DET,[QB],0,True,23,74.0,220,,
117,aidanoconnell,10866,,,,,,,,,...,,LV,[QB],0,True,24,75.0,210,3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77616,zacdysert,1455,,,,,,,,,...,,,[QB],9,False,31,75.0,221,,
77624,zachconque,4607,,,,,,,,,...,,,[QB],5,True,27,78.0,237,,
77640,zachmettenberger,1935,72.0,,73.88,,,,1412.0,,...,,,[QB],6,False,29,77.0,224,,313.0
77654,zachterrell,4924,,,,,,,,,...,,BAL,[QB],1,True,24,73.0,206,,


In [32]:
all_qb_df.to_csv("2014_22-all_qb_df.csv")

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
import pandas as pd
from sklearn.impute import KNNImputer
import pandas as pd


from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

class Preprocessor:
    def __init__(self, threshold=0.01):
        self.threshold = threshold
        self.features_to_keep = None

    def fit(self, X, y):
        # Impute missing values
        self.imputer = KNNImputer(n_neighbors=5)
        X_imputed = self.imputer.fit_transform(X)

        # Fit a model to get feature importances
        rf = RandomForestRegressor()
        rf.fit(X_imputed, y)

        # Get feature importances
        importances = rf.feature_importances_

        # Select features with importances above the threshold
        self.features_to_keep = X.columns[importances > self.threshold]

    def transform(self, X):
        # Impute missing values
        X_imputed = self.imputer.transform(X)

        # Filter features based on feature importance
        X_filtered = pd.DataFrame(X_imputed, columns=X.columns)[self.features_to_keep]

        # Create a scaler object
        scaler = StandardScaler()

        # Fit and transform the features
        X_scaled = scaler.fit_transform(X_filtered)

        return X_scaled

In [28]:
def preprocess_data(df, target_col='pos_rank_ppr', threshold=0.01, inplace=False):
    """
    Preprocess the DataFrame and return the features and target arrays.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    target_col (str): The column name of the target variable in df.
    threshold (float): The minimum feature importance for a feature to be kept.

    Returns:
    tuple: A tuple containing the preprocessed features array and the target array.
    """
    cols_to_drop = ["search_full_name","rank_ppr","pos_rank_ppr","pos_rank_half_ppr",
    "rank_std","rank_half_ppr","pos_rank_std","team","fantasy_positions","player_id",]
    
    df.sort_values(by='player_id', inplace=True)
    
    target = df[target_col]
    target = target.fillna(0)
    # Exclude target column from features
    features = df.drop(columns=cols_to_drop)

    preprocessor = Preprocessor(threshold)
    preprocessor.fit(features, target)
    features_processed = preprocessor.transform(features)

    feature_names = preprocessor.features_to_keep.tolist()

    return features_processed, target, feature_names, df

features_processed, target, feature_names, clean_all_qb_df = preprocess_data(all_qb_df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(by='player_id', inplace=True)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 255 but corresponding boolean dimension is 112