In [None]:
# Load regular-season stats, one StatsDF per season. Because both arguments are
# supplied, each constructor call refreshes (fetches and merges) its data immediately.
df_stats_22 = StatsDF("regular",2022)
df_stats_21 = StatsDF("regular",2021)
# Older seasons are disabled. Cells that read df_stats_20 / df_stats_19 / df_stats_18
# will raise NameError until the matching line below is re-enabled.
#df_stats_20 = StatsDF("regular",2020)
#df_stats_19 = StatsDF("regular",2019)
#df_stats_18 = StatsDF("regular",2018)

In [None]:
from sleeper_wrapper import Stats, Players
import pandas as pd
import numpy as np


class StatsDF:
    """
    Load one season of player stats and join them with player details.

    When both ``season_type`` and ``season_year`` are given, construction
    immediately calls :meth:`refresh_stats`, which queries the ``Stats`` and
    ``Players`` clients and builds three frames:

    * ``stats_df``   - raw per-player stats, indexed by ``player_id``
    * ``players_df`` - the player columns listed in ``selected_columns``
    * ``all_stats``  - outer merge of the two, with parsed heights and
      empty strings replaced by NaN
    """

    def __init__(self, season_type: str = None, season_year: int = None):
        self.season_type = season_type
        self.season_year = season_year
        self.stats = Stats()
        self.players = Players()
        self.players_df = None  # player details restricted to selected_columns
        self.stats_df = None  # raw stats, one row per player_id
        self.all_stats = None  # merged stats + player details

        self.selected_columns = [
            'search_full_name', 'player_id', 'team', 'fantasy_positions', 'years_exp',
            'active', 'age', 'height', 'weight', 'depth_chart_order',
        ]

        # Only hit the API when the season is fully specified.
        if season_year is not None and season_type is not None:
            self.refresh_stats()

    def refresh_stats(self):
        """
        Fetch stats and players for the configured season and rebuild all frames.

        The order matters: the merge must exist before the name column is moved,
        heights are parsed, and empty strings are turned into NaN.
        """
        raw_stats = self.stats.get_all_stats(self.season_type, self.season_year)
        # The payload is keyed by player, so transpose to get one row per player.
        self.stats_df = pd.DataFrame(raw_stats).T
        self.stats_df.index.name = "player_id"
        self.players_df = self.get_players_df()
        self.all_stats = self.merge_players_df()
        self.all_stats = self.make_column_first('search_full_name')
        self.convert_height_to_cm()
        self.convert_to_nan()

    def convert_to_nan(self):
        """
        Replace every empty string in ``all_stats`` with NaN (in place).
        """
        self.all_stats.replace('', np.nan, inplace=True)

    def get_players_df(self) -> pd.DataFrame:
        """
        Retrieve all players and keep only the columns in ``selected_columns``.
        """
        players_df = pd.DataFrame(self.players.get_all_players()).T
        return players_df[self.selected_columns]

    def merge_players_df(self) -> pd.DataFrame:
        """
        Outer-merge ``stats_df`` with ``players_df`` on ``player_id``.

        An outer join keeps players that appear on only one side.
        """
        return pd.merge(self.stats_df, self.players_df, how='outer', on='player_id')

    def get_stats_df(self) -> pd.DataFrame:
        """
        Return the raw stats DataFrame (not the merged one).
        """
        return self.stats_df

    def display_stats_df(self) -> None:
        """
        Print the first few rows of the merged DataFrame.
        """
        print(self.all_stats.head())

    def make_column_first(self, col_name: str) -> pd.DataFrame:
        """
        Move ``col_name`` to the first position of ``all_stats`` and sort rows by it.

        The frame is modified in place and also returned. If the column does
        not exist, a message is printed and the frame is returned unchanged.
        """
        if col_name not in self.all_stats.columns:
            print(f'Column "{col_name}" does not exist in the DataFrame.')
            return self.all_stats

        col_to_move = self.all_stats.pop(col_name)
        self.all_stats.insert(0, col_name, col_to_move)
        self.all_stats.sort_values(by=col_name, axis=0, inplace=True)
        return self.all_stats

    def get_positions_df(self, player_position: str) -> pd.DataFrame:
        """
        Return the rows of ``all_stats`` whose ``fantasy_positions`` contains
        ``player_position``.

        Non-iterable entries (for example NaN) never match.
        """
        def has_position(positions) -> bool:
            if not hasattr(positions, '__iter__'):
                return False
            return player_position in positions

        return self.all_stats[self.all_stats['fantasy_positions'].apply(has_position)]

    def drop_empty_columns(self) -> pd.DataFrame:
        """
        Drop columns of ``all_stats`` that contain only missing values.
        """
        self.all_stats = self.all_stats.dropna(axis=1, how='all')
        return self.all_stats

    def parse_height(self, height) -> "float | None":
        """
        Parse a height value into a number of inches.

        Accepted inputs:

        * a feet/inches string such as 6'1 followed by a double quote, which
          yields ``feet * 12 + inches`` (a non-numeric inches part counts as 0);
        * a string made only of digits, which is returned unchanged as a float.
          NOTE(review): this assumes such values are already in inches, like
          the feet/inches branch - confirm against the data source.

        Anything else (non-strings such as NaN, or strings that cannot be
        parsed, including a non-numeric feet part) yields ``None`` rather than
        raising, so one malformed value cannot abort a whole-column conversion.
        """
        if not isinstance(height, str):
            return None

        text = height.strip()
        if "'" in text and '"' in text:
            parts = text.split("'")
            if len(parts) != 2:
                return None
            feet_text = parts[0].strip()
            inches_text = parts[1].replace('"', '').strip()
            if not feet_text.isdecimal():
                return None
            inches = int(inches_text) if inches_text.isdecimal() else 0
            return float(self.feet_to_inches(int(feet_text), inches))

        if text.isdecimal():
            return float(text)
        return None

    def export_to_csv(self, path: str = 'stats.csv'):
        """
        Write the raw stats DataFrame to ``path`` (default ``stats.csv``).
        """
        self.get_stats_df().to_csv(path)

    @staticmethod
    def feet_to_inches(feet, inches):
        """
        Return the total number of inches in ``feet`` feet plus ``inches`` inches.
        """
        return feet * 12 + inches

    def convert_height_to_cm(self):
        """
        Replace the ``height`` column of ``all_stats`` with parsed numeric heights.

        Despite the method name (kept for compatibility), no centimetre
        conversion takes place: values come from :meth:`parse_height`, which
        returns inches; unparseable entries become missing values.
        """
        self.all_stats['height'] = self.all_stats['height'].apply(self.parse_height)




In [None]:
class Preprocessor:
    """
    Impute, select and scale features for the rank models.

    ``fit`` learns three things from the training frame: a KNN imputer, the
    set of features whose random-forest importance exceeds ``threshold``, and
    a standard scaler. ``transform`` then applies exactly those learned
    parameters, so training and inference data are scaled identically.
    """

    def __init__(self, threshold=0.01):
        self.threshold = threshold
        self.features_to_keep = None  # pandas Index of selected feature names
        self.input_columns = None  # columns the imputer was fitted on
        self.imputer = None
        self.scaler = None

    def fit(self, X, y):
        """
        Learn imputation, feature selection and scaling from ``X`` and ``y``.

        Parameters:
        X (pd.DataFrame): Training features.
        y: Training target, aligned row-by-row with ``X``.
        """
        # Columns without a single observed value cannot be imputed; the
        # imputer would drop them and the importances would no longer line up
        # with the column names, so exclude them explicitly.
        usable = X.columns[X.notna().any(axis=0).to_numpy()]
        self.input_columns = usable

        # Impute missing values
        self.imputer = KNNImputer(n_neighbors=5)
        X_imputed = self.imputer.fit_transform(X[usable])

        # Fit a model to get feature importances
        rf = RandomForestRegressor()
        rf.fit(X_imputed, y)
        importances = rf.feature_importances_

        # Select features with importances above the threshold
        self.features_to_keep = usable[importances > self.threshold]

        # Fit the scaler once, on the imputed training data, so that transform
        # reuses the training mean/variance instead of re-deriving them from
        # whatever data it is handed.
        imputed_frame = pd.DataFrame(X_imputed, columns=usable)
        self.scaler = StandardScaler()
        self.scaler.fit(imputed_frame[self.features_to_keep])

    def transform(self, X):
        """
        Impute, filter and scale ``X`` with the parameters learned in ``fit``.

        ``X`` is first aligned to the fit-time columns: extra columns are
        ignored and absent ones are treated as missing and imputed.

        Returns:
        np.ndarray: Scaled values of the selected features.
        """
        aligned = X.reindex(columns=self.input_columns)

        # Impute missing values
        X_imputed = self.imputer.transform(aligned)

        # Filter features based on feature importance
        X_filtered = pd.DataFrame(X_imputed, columns=self.input_columns)[self.features_to_keep]

        # Scale with the statistics learned during fit
        return self.scaler.transform(X_filtered)

In [None]:
def preprocess_position(df, position: str, rank_col: str = 'pos_rank_ppr') -> pd.DataFrame:
    """
    Build the modelling frame for a single position.

    Parameters:
    df: Stats container exposing ``get_positions_df(position)`` (despite the
        parameter name this is not a plain DataFrame).
    position (str): The position to select, e.g. "QB".
    rank_col (str): Column that must be present for a row to be kept.

    Returns:
    pd.DataFrame: Rows for ``position`` that have a value in ``rank_col``,
    without any column that is entirely missing for those rows.
    """
    position_rows = df.get_positions_df(position)
    # Drop unranked rows first: a column populated only for unranked players
    # would otherwise survive as an all-NaN column and break later imputation.
    ranked_rows = position_rows.dropna(subset=[rank_col])
    return ranked_rows.dropna(axis=1, how='all')


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
import pandas as pd
from sklearn.impute import KNNImputer
import pandas as pd


from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

def preprocess_data(df, target_col='pos_rank_ppr', threshold=0.01, inplace=False):
    """
    Sort the frame by player, split off the target and preprocess the features.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data; must have a
        'player_id' column and ``target_col``.
    target_col (str): The column name of the target variable in df.
    threshold (float): The minimum feature importance for a feature to be kept.
    inplace (bool): If True, sort the caller's DataFrame in place. If False
        (default), the caller's frame is left untouched and a sorted copy is
        used and returned.

    Returns:
    tuple: ``(features_processed, target, feature_names, df)`` - the
    preprocessed feature array, the target Series, the names of the kept
    features, and the sorted DataFrame whose rows align with the array.
    """
    cols_to_drop = ["search_full_name", "rank_ppr", "pos_rank_ppr", "pos_rank_half_ppr",
                    "rank_std", "rank_half_ppr", "pos_rank_std", "team", "fantasy_positions",
                    "player_id"]

    if inplace:
        df.sort_values(by='player_id', inplace=True)
    else:
        df = df.sort_values(by='player_id')

    target = df[target_col]

    # Exclude identifiers, every rank column and the target itself from the
    # features. Columns already absent (e.g. dropped earlier for being empty)
    # are skipped instead of raising.
    non_feature_cols = list(dict.fromkeys([*cols_to_drop, target_col]))
    features = df.drop(columns=non_feature_cols, errors='ignore')

    preprocessor = Preprocessor(threshold)
    preprocessor.fit(features, target)
    features_processed = preprocessor.transform(features)

    feature_names = preprocessor.features_to_keep.tolist()

    return features_processed, target, feature_names, df




def yearly_pos_rank_model(features_imputed_knn, target, feature_names):
    """
    Train a random-forest regressor on an 80/20 split and report its error.

    Prints the features ordered by decreasing importance.

    Returns:
    tuple: ``(model, mse)`` - the fitted model and its mean squared error on
    the held-out 20%.
    """
    split = train_test_split(features_imputed_knn, target, test_size=0.2, random_state=42)
    features_train, features_test, target_train, target_test = split

    model = RandomForestRegressor()
    model.fit(features_train, target_train)

    mse = mean_squared_error(target_test, model.predict(features_test))

    importances = model.feature_importances_
    # Positions of the features, most important first.
    order = np.argsort(importances)[::-1]
    ranked_names = [feature_names[pos] for pos in order]

    print("Feature ranking:")
    for rank, (name, pos) in enumerate(zip(ranked_names, order), start=1):
        print(f"{rank}. Feature {name} ({importances[pos]})")

    return model, mse


In [None]:
import numpy as np

def drop_features_not_in_other_dataset(arr1, arr2):
    """
    Keep only the columns of arr1 whose first-row value also occurs in the
    first row of arr2.

    The first row of each array is treated as the column labels.

    Parameters:
    arr1 (np.ndarray): 2-D array to filter.
    arr2 (np.ndarray): 2-D array supplying the labels to keep.

    Returns:
    np.ndarray: arr1 restricted to the matching columns (all rows kept).
    """
    labels = arr1[0]
    only_in_first = set(labels) - set(arr2[0])
    kept_positions = []
    for position, label in enumerate(labels):
        if label in only_in_first:
            continue
        kept_positions.append(position)
    return arr1[:, kept_positions]


In [None]:

# Select the 2022 quarterbacks, then derive model inputs from that frame.
qb_stats_22 = preprocess_position(df_stats_22, "QB")
# NOTE(review): unpacking four names matches the preprocess_data variant that returns
# (features, target, feature_names, df). The later redefinition in this file returns five
# values, so this cell depends on which definition was executed last.
qb_features_22 , qb_target_22 ,features_22, proc_qb_stats_22= preprocess_data(qb_stats_22)



In [None]:

# Score the 2022 QB features with the model trained on the 2021 season.
qb_22_pred = qb_model_21.predict(qb_features_22)

# Work on a copy of the processed 2022 frame, with the predictions attached
# as a new column.
player_prediction_22 = proc_qb_stats_22.assign(predicted_rank=qb_22_pred)

# Fill missing predictions with 0
player_prediction_22['predicted_rank'] = player_prediction_22['predicted_rank'].fillna(0)


In [None]:
# Write the 2022 predictions (including the DataFrame index) to the working directory.
player_prediction_22.to_csv('player_prediction_22.csv')

In [None]:

# Select the 2021 quarterbacks, then derive model inputs from that frame.
qb_stats_21 = preprocess_position(df_stats_21, "QB")
# NOTE(review): unpacking four names matches the preprocess_data variant that returns
# (features, target, feature_names, df). The later redefinition in this file returns five
# values, so this cell depends on which definition was executed last.
qb_features_21, qb_target_21,features_21, proc_qb_stats_21 = preprocess_data(qb_stats_21)





In [None]:
# Train the 2021 QB model. NOTE(review): two-name unpacking matches the yearly_pos_rank_model
# variant returning (model, mse); the later redefinition also returns the importances.
qb_model_21 , qb_mse_21= yearly_pos_rank_model(qb_features_21, qb_target_21, features_21)


In [None]:
# Held-out mean squared error of the 2021 QB model.
print(qb_mse_21)

In [None]:

# NOTE: df_stats_20 must exist first; its StatsDF(...) line in the loading cell is commented out.
qb_stats_20 = preprocess_position(df_stats_20, "QB")
# preprocess_data returns more than two values (features, target, feature names, ...);
# keep the first two and discard the rest so the unpacking works with either definition.
qb_features_20, qb_target_20, *_ = preprocess_data(qb_stats_20)

In [None]:

# NOTE: df_stats_19 must exist first; its StatsDF(...) line in the loading cell is commented out.
qb_stats_19 = preprocess_position(df_stats_19, "QB")
# The second positional argument of preprocess_data is the target column, not the position,
# so "QB" must not be passed here. Trailing return values beyond the first three are
# discarded so the unpacking works with either definition of preprocess_data.
qb_features_19, qb_target_19, qb_feature_names_19, *_ = preprocess_data(qb_stats_19)

In [None]:

# NOTE: df_stats_18 must exist first; its StatsDF(...) line in the loading cell is commented out.
qb_stats_18 = preprocess_position(df_stats_18, "QB")

# The second positional argument of preprocess_data is the target column, not the position,
# so "QB" must not be passed here. Trailing return values beyond the first three are
# discarded so the unpacking works with either definition of preprocess_data.
qb_features_18, qb_target_18, qb_feature_names_18, *_ = preprocess_data(qb_stats_18)

In [None]:
# Train the 2018 QB model. NOTE(review): two-name unpacking matches the yearly_pos_rank_model
# variant returning (model, mse); the later redefinition also returns the importances.
qb_model_18 , qb_mse_18 = yearly_pos_rank_model(qb_features_18, qb_target_18, qb_feature_names_18)



In [None]:
# preprocess_data orders rows by player_id, so put the raw frame in the same
# order before pairing it with the predictions.
qb_stats_19.sort_values(by='player_id', inplace=True)
qb_19_pred = qb_model_18.predict(qb_features_19)

# Work on a copy of the 2019 frame, with the predictions attached as a new column.
player_predictions = qb_stats_19.assign(predicted_rank=qb_19_pred)

# Fill missing predictions with 0
player_predictions['predicted_rank'] = player_predictions['predicted_rank'].fillna(0)


In [None]:
# NOTE(review): drop_features_not_in_other_dataset compares the VALUES in the first row of
# each array, not column names. qb_features_20 / qb_features_21 appear to be feature
# matrices from preprocess_data, so confirm this filter does what is intended here.
qb_features_20 = drop_features_not_in_other_dataset(qb_features_20, qb_features_21)

In [None]:
# preprocess_data orders rows by player_id, so put the raw frame in the same
# order before pairing it with the predictions.
qb_stats_19.sort_values(by='player_id', inplace=True)
qb_19_pred = qb_model_18.predict(qb_features_19)

# Work on a copy of the 2019 frame, with the predictions attached as a new column.
player_predictions = qb_stats_19.assign(predicted_rank=qb_19_pred)

# Fill missing predictions with 0
player_predictions['predicted_rank'] = player_predictions['predicted_rank'].fillna(0)


In [None]:
# Write the predictions (including the DataFrame index) to the working directory.
player_predictions.to_csv("player_prediction.csv")

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer





In [None]:
# For the averaged data, you would first need to calculate the averages over 5-year periods
# NOTE(review): rows are grouped per player and per integer bucket `year // 5`, then averaged.
# This assumes qb_stats_21 has 'year' and 'rank' columns, which no cell shown here creates,
# and that every remaining column can be averaged - confirm before running.
df_average = qb_stats_21.groupby(['player_id', qb_stats_21['year'] // 5]).mean().reset_index()

# Split into features and target
features_avg = df_average.drop('rank', axis=1)
target_avg = df_average['rank']

# Split into training and test datasets (80/20, fixed seed for a reproducible split)
features_train_avg, features_test_avg, target_train_avg, target_test_avg = train_test_split(features_avg, target_avg, test_size=0.2, random_state=42)

# Define and fit the model on 5-year averaged data
model_average_years = RandomForestRegressor()
model_average_years.fit(features_train_avg, target_train_avg)

# Predict and evaluate on the held-out split
predictions_average_years = model_average_years.predict(features_test_avg)
mse_average_years = mean_squared_error(target_test_avg, predictions_average_years)
print(f"5-year average MSE: {mse_average_years}")

In [None]:
# Weighted predictions: each model's weight is the OTHER model's share of the
# combined MSE, so the model with the smaller error contributes more.
_mse_sum = mse_individual_years + mse_average_years
weights = [mse_average_years / _mse_sum, mse_individual_years / _mse_sum]
predictions_weighted = (
    weights[0] * predictions_individual_years
    + weights[1] * predictions_average_years
)


In [None]:
# Assume last_year_data is a DataFrame holding all your players' data from the last year.

# Don't forget to apply the same preprocessing to this new data as you did to the training data.
# This might involve cleaning the data, dealing with missing values, encoding categorical variables, etc.

# Predict the player ranks for next year
next_year_pred = model.predict(last_year_data)

# Pair each player name with its predicted rank, ordered by predicted rank.
player_predictions = pd.DataFrame(
    {
        'player_name': last_year_data['player_name'],
        'predicted_rank': next_year_pred,
    }
).sort_values('predicted_rank')


In [None]:
class Preprocessor:
    """
    Impute, select and scale features for the rank models.

    ``fit`` learns a KNN imputer, the features whose random-forest importance
    exceeds ``threshold``, and a standard scaler. ``transform`` applies those
    learned parameters to any frame, aligning its columns to the ones seen
    during ``fit`` first.
    """

    def __init__(self, threshold=0.001):
        self.threshold = threshold
        self.features_to_keep = None  # pandas Index of selected feature names
        self.input_columns = None  # columns the imputer was fitted on
        self.imputer = None
        self.scaler = None

    def fit(self, X, y):
        """
        Learn imputation, feature selection and scaling from ``X`` and ``y``.

        Parameters:
        X (pd.DataFrame): Training features.
        y: Training target, aligned row-by-row with ``X``.
        """
        # Columns without a single observed value cannot be imputed; the
        # imputer would drop them and the importances would no longer line up
        # with the column names, so exclude them explicitly.
        usable = X.columns[X.notna().any(axis=0).to_numpy()]
        self.input_columns = usable

        # Impute missing values
        self.imputer = KNNImputer(n_neighbors=5)
        X_imputed = self.imputer.fit_transform(X[usable])

        # Fit a model to get feature importances
        rf = RandomForestRegressor()
        rf.fit(X_imputed, y)
        importances = rf.feature_importances_

        # Select features with importances above the threshold
        self.features_to_keep = usable[importances > self.threshold]

        # Fit the scaler on the IMPUTED values, because that is what transform
        # feeds it; fitting on the raw frame would derive the statistics from a
        # different distribution than the one being scaled.
        imputed_frame = pd.DataFrame(X_imputed, columns=usable)
        self.scaler = StandardScaler()
        self.scaler.fit(imputed_frame[self.features_to_keep])

    def transform(self, X):
        """
        Impute, filter and scale ``X`` with the parameters learned in ``fit``.

        ``X`` is first aligned to the fit-time columns: extra columns are
        ignored and absent ones are treated as missing and imputed. This lets
        callers pass a frame that holds only a subset of the training columns.

        Returns:
        np.ndarray: Scaled values of the selected features.
        """
        aligned = X.reindex(columns=self.input_columns)

        # Impute missing values
        X_imputed = self.imputer.transform(aligned)

        # Filter features based on feature importance
        X_filtered = pd.DataFrame(X_imputed, columns=self.input_columns)[self.features_to_keep]

        # Scale the features
        return self.scaler.transform(X_filtered)


def preprocess_data(df, target_col='pos_rank_ppr', threshold=0.001, preprocessor=None, training_features=None):
    """
    Sort the frame by player, split off the target and preprocess the features.

    Two modes:

    * Training (``training_features`` is None): a new Preprocessor is created
      and fitted on this data.
    * Inference (``training_features`` given): the supplied, already fitted
      ``preprocessor`` is reused and the features are aligned to the columns
      it was fitted on.

    Parameters:
    df (pd.DataFrame): Data with a 'player_id' column and ``target_col``.
        The caller's frame is not modified.
    target_col (str): The column name of the target variable in df.
    threshold (float): Minimum feature importance for a feature to be kept
        (training mode only).
    preprocessor: Fitted Preprocessor; required in inference mode.
    training_features (list): Feature names from training; its presence
        selects inference mode.

    Returns:
    tuple: ``(features_processed, target, feature_names, preprocessor, df)``
    where ``df`` is the sorted copy whose rows align with the feature array.

    Raises:
    ValueError: If ``training_features`` is given without a ``preprocessor``.
    """
    if training_features is not None and preprocessor is None:
        raise ValueError("A fitted preprocessor is required when training_features is given.")

    cols_to_drop = ["search_full_name", "rank_ppr", "pos_rank_ppr", "pos_rank_half_ppr",
                    "rank_std", "rank_half_ppr", "pos_rank_std", "team", "fantasy_positions",
                    "player_id"]

    df = df.sort_values(by='player_id').copy()

    target = df[target_col]
    # Exclude identifiers, every rank column and the target itself. Columns
    # already absent (e.g. dropped earlier for being empty) are skipped.
    non_feature_cols = list(dict.fromkeys([*cols_to_drop, target_col]))
    features = df.drop(columns=non_feature_cols, errors='ignore')

    if training_features is not None:
        # The imputer expects every column it was fitted on, not only the
        # features that survived selection, so align to its fit-time columns
        # when they are recorded; otherwise fall back to training_features.
        imputer = getattr(preprocessor, 'imputer', None)
        expected = getattr(imputer, 'feature_names_in_', None)
        if expected is None:
            expected = training_features
        # Missing columns are added with 0, extra columns are removed, and the
        # training column order is restored.
        features = features.reindex(columns=list(expected), fill_value=0)
    else:
        # Initial training data: create the preprocessor and fit it.
        preprocessor = Preprocessor(threshold)
        preprocessor.fit(features, target)

    features_processed = preprocessor.transform(features)
    feature_names = preprocessor.features_to_keep.tolist()

    return features_processed, target, feature_names, preprocessor, df






def yearly_pos_rank_model(features_imputed_knn, target, feature_names):
    """
    Train a random-forest regressor on an 80/20 split and report its error.

    Prints the features ordered by decreasing importance.

    Returns:
    tuple: ``(model, mse, importances)`` - the fitted model, its mean squared
    error on the held-out 20%, and the per-feature importances in input order.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features_imputed_knn,
        target,
        test_size=0.2,
        random_state=42,
    )

    model = RandomForestRegressor()
    model.fit(features_train, target_train)

    held_out_predictions = model.predict(features_test)
    mse = mean_squared_error(target_test, held_out_predictions)

    importances = model.feature_importances_
    # Feature positions, most important first.
    descending = np.argsort(importances)[::-1]
    names = [feature_names[pos] for pos in descending]

    print("Feature ranking:")
    for rank, (name, pos) in enumerate(zip(names, descending), start=1):
        print(f"{rank}. Feature {name} ({importances[pos]})")

    return model, mse, importances



In [None]:
from sklearn.metrics import mean_absolute_error

# Position to model; used for both the training and the evaluation season.
position = "QB"

# Training data: 2021 players at this position. The five-name unpacking matches the
# preprocess_data variant that also returns the fitted preprocessor.
df_train = preprocess_position(df_stats_21, position)
features_train, target_train, feature_names_train, preprocessor, df_train = preprocess_data(df_train)
# Fit the model; the reported MSE comes from the function's internal held-out split.
model, mse, importances = yearly_pos_rank_model(features_train, target_train, feature_names_train)
print(f"Training completed. MSE: {mse}")

# Evaluation data: 2022 players at the same position (df_stats_22 is a StatsDF object).
df_test = preprocess_position(df_stats_22, position)

# Reuse the fitted preprocessor so the 2022 data is imputed, filtered and scaled the same way.
# NOTE(review): feature_names_train holds only the features kept after selection, while the
# preprocessor's imputer was fitted on the full training feature set - confirm that transform
# accepts the reduced frame.
features_test, target_test, feature_names_test, _, df_test = preprocess_data(df_test, preprocessor=preprocessor, training_features=feature_names_train)

# Predict 2022 position ranks with the 2021 model
predictions = model.predict(features_test)

# Mean absolute error between actual and predicted 2022 ranks
mae = mean_absolute_error(target_test, predictions)
print(f"MAE on test set: {mae}")

