In [2]:
import pandas as pd
import numpy as np

In [1]:
# Mount Google Drive so the CSV files under /content/drive can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the multi-season history and the separate 2024 gameweek export,
# then stack them into one frame with a fresh 0..n-1 index.
players_history = pd.read_csv('/content/drive/MyDrive/full2016-2024list.csv')
players_2024 = pd.read_csv('/content/drive/MyDrive/merged_gw2024.csv')

players_fpl = pd.concat([players_history, players_2024], ignore_index=True)

# Rows without a season label are tagged as 2024-25 (presumably these are the
# rows coming from merged_gw2024.csv -- verify against the file).
# The result is assigned back to the column: calling fillna(inplace=True) on
# `players_fpl['season_x']` is chained assignment, which warns in pandas 2.x
# and does not modify the frame once Copy-on-Write is enabled.
players_fpl['season_x'] = players_fpl['season_x'].fillna('2024-25')
players_fpl.tail()

In [None]:

# Show the last rows of the combined frame.
players_fpl.tail()

In [None]:
# Number of rows recorded under each player name.
players_fpl["name"].value_counts()

In [None]:

# Replace was_home by its integer category codes.
players_fpl["was_home"] = players_fpl["was_home"].astype("category").cat.codes
# Keep only the text before the first '-' of the season label (the starting
# year, e.g. "2023-24" -> "2023") and parse it as a date. Every season is
# therefore stored as 1 January of its starting year; date cut-offs used to
# split train/test must be chosen with that encoding in mind.
players_fpl["season_x"] = players_fpl["season_x"].astype(str).str.split('-').str[0]
players_fpl["season_x"] = pd.to_datetime(players_fpl["season_x"], format='%Y')
# A short preview is enough; asking for 129000 rows only bloats the output.
players_fpl.head()

In [None]:
# Group the rows by player name; head() on a groupby returns the first rows
# of every group.
grouped_players = players_fpl.groupby("name")
grouped_players.head()

In [None]:
# Pull out one player's rows to experiment with.
group = grouped_players.get_group("Joško Gvardiol")
group

In [9]:
def rolling_averages(group, cols, new_cols, window=5):
    """Add trailing rolling means of ``cols`` to one player's rows.

    Rows are ordered by ``kickoff_time``. For every row the mean of the
    ``window`` preceding rows is computed; ``closed='left'`` excludes the
    row itself, so the current match never contributes to its own average.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain ``kickoff_time`` and every name in ``cols``.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the averaged columns, in the same order as ``cols``.
    window : int, default 5
        Number of preceding rows in each average.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``group`` with ``new_cols`` added. Rows that do not
        yet have ``window`` preceding rows (their averages are NaN) are dropped.
        The input frame is not modified.
    """
    ordered = group.sort_values("kickoff_time")
    trailing_means = ordered[cols].rolling(window, closed='left').mean()
    ordered[new_cols] = trailing_means
    return ordered.dropna(subset=new_cols)

In [None]:
# Per-match columns to average, and the names given to their rolling means
# (each source name with a "_rolling" suffix).
cols = ["assists","bps","clean_sheets","goals_conceded","goals_scored","minutes","own_goals","penalties_missed","red_cards","transfers_in","transfers_out","yellow_cards","GW","total_points"]
new_cols = [f"{c}_rolling" for c in cols]
new_cols

In [None]:
# Try the helper on the single-player frame held in `group`.
rolling_averages(group,cols,new_cols)

In [None]:
# Rolling averages computed separately for every player name; apply() stacks
# the per-player results into one frame.
players_rolling = players_fpl.groupby("name").apply(
    lambda player_rows: rolling_averages(player_rows, cols, new_cols)
)

# Minutes played per goal and per assist over the rolling window. A zero
# denominator produces +/-inf (or NaN for 0/0); both are mapped to 0.
for event_col, frequency_col in (
    ('goals_scored_rolling', 'frequency_goals_rolling'),
    ('assists_rolling', 'frequency_assists_rolling'),
):
    minutes_per_event = players_rolling['minutes_rolling'] / players_rolling[event_col]
    players_rolling[frequency_col] = minutes_per_event.replace([np.inf, -np.inf], np.nan).fillna(0)

# Interaction feature: rolling bonus-point-system score times rolling goals.
players_rolling['bonus_chance_rolling'] = players_rolling['bps_rolling'] * players_rolling['goals_scored_rolling']
players_rolling

In [None]:
# Column dtypes and non-null counts of the feature frame.
players_rolling.info()

In [None]:
# Remove the "name" index level so that "name" is unambiguous when the frame
# is grouped by that column again. The guard makes the cell safe to re-run:
# an unconditional droplevel raises KeyError once the level is already gone.
if "name" in players_rolling.index.names:
    players_rolling = players_rolling.droplevel("name")
players_rolling

In [None]:
# Spot-check one player's rolling points.
# NOTE(review): rolling_averages is applied here to rows that already carry the
# rolling columns, so those columns are recomputed from the remaining rows and
# the rows without a full window are dropped a second time. Only the local
# `group` is affected, not players_rolling.
group = players_rolling.groupby("name").get_group("Mohamed Salah")
group = rolling_averages(group,cols,new_cols)
group[['GW','total_points_rolling']].tail()

In [None]:
# Renumber the rows 0..n-1 so every row has a unique integer label.
players_rolling = players_rolling.reset_index(drop=True)
players_rolling.tail()

In [18]:
# Drop the xP and starts columns; errors='ignore' skips whichever is absent.
players_rolling = players_rolling.drop(['xP', 'starts'], axis=1, errors='ignore')

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from IPython.display import display
import numpy as np
import pandas as pd

def make_predictions_with_gb_regularized(data, predictors,
                                         player_name="João Pedro Junqueira de Jesus",
                                         n_matches=20, top_n=20):
    """Fit a shallow gradient-boosting regressor and print an evaluation report.

    Rows with ``season_x`` before 2023-08-01 train the model, the remaining
    rows are the test set. The target is ``total_points_rolling``.

    NOTE(review): elsewhere in this notebook ``season_x`` is parsed as
    1 January of the season's starting year, so this cut-off places the
    2023-24 season in the training set -- confirm that is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing ``season_x``, ``name``, ``GW``, ``position``,
        ``value``, ``total_points_rolling`` and every column in ``predictors``.
    predictors : list of str
        Feature columns; ``total_points_rolling`` is removed if present.
    player_name : str
        Player whose most recent test rows are printed.
    n_matches : int, default 20
        Number of that player's most recent test rows to print.
    top_n : int, default 20
        Number of unique players listed per position.

    Returns
    -------
    None
        All results are printed.
    """
    cutoff = '2023-08-01'
    train = data[data["season_x"] < cutoff]
    # Copy the test rows: a prediction column is added below, and writing to a
    # filtered view of `data` triggers SettingWithCopyWarning.
    test = data[data["season_x"] >= cutoff].copy()
    predictors = [col for col in predictors if col != 'total_points_rolling']

    X_train = train[predictors]
    X_test = test[predictors]
    y_train = train['total_points_rolling']
    y_test = test['total_points_rolling']

    model = GradientBoostingRegressor(
        n_estimators=500,       # Number of boosting stages
        learning_rate=0.01,     # Shrinkage parameter
        max_depth=2,            # Maximum depth of each tree
        min_samples_split=12,   # Minimum number of samples required to split a node
        min_samples_leaf=8,     # Minimum number of samples required in a leaf node
        subsample=0.8,          # Fraction of samples used to fit each base learner
        random_state=42,        # Ensuring reproducibility
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test['predicted_points'] = y_pred

    # Evaluation metrics on the test rows.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('Regularized Model Results:')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'R² Score: {r2}')

    # Per-player report. A missing player only skips this section; the
    # feature-importance and top-player reports below are still produced.
    player_rows = test[test["name"] == player_name]
    if player_rows.empty:
        print("No data available for the specified player.")
    else:
        # Order by season first so "most recent" stays correct if the test
        # rows ever span more than one season.
        recent = player_rows.sort_values(by=["season_x", "GW"], ascending=False).head(n_matches)
        results_df = pd.DataFrame({
            "Gameweek": recent["GW"].values,
            "Actual Points": recent["total_points_rolling"].values,
            "Predicted Points": recent["predicted_points"].values,
        })
        print(f"\n{player_name}'s Last {len(recent)} Matches:")
        print(results_df)

    # Feature importances, largest first.
    feature_importances = pd.DataFrame({
        'Feature': predictors,
        'Importance': model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    print(feature_importances)

    # Top unique players per position: sorting by prediction and keeping the
    # first row per name ranks each player by their highest predicted row.
    top_players_by_position = {}
    for position in test['position'].unique():
        ranked = test[test['position'] == position].sort_values(by='predicted_points', ascending=False)
        unique_players = ranked.drop_duplicates(subset=['name'], keep='first')
        top_players_by_position[position] = unique_players.head(top_n)[['name', 'predicted_points', 'value']]

    for position, df in top_players_by_position.items():
        print(f"\nTop {top_n} Players for Position: {position}")
        print(df)

# Call func
# Columns kept out of the feature set: identifiers, text columns, the target,
# and the other columns listed here. Everything else in players_rolling is
# used as a predictor.
exclude_columns = ['season_x', 'name', 'position', 'team_x', 'kickoff_time', 'opp_team_name', 'total_points_rolling','penalties_missed_rolling','yellow_cards','was_home','assists','threat','bonus','minutes','bps','clean_sheets','elements','goals_conceded','goals_scored','ict_index','opponent_team','team_h_score','own_goals','penalties_missed','penalties_saved','red_cards','round','team_a_score','GW_rolling','GW','element','influence','transfers_out_rolling','creativity','fixture','transfers_out','total_points','transfers_balance','selected','own_goals_rolling']
predictors = [col for col in players_rolling.columns if col not in exclude_columns]
make_predictions_with_gb_regularized(players_rolling, predictors)


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
from scipy.optimize import linprog

def make_predictions_with_gb_pca(data, predictors, n_components=0.95,
                                 player_name="Mohamed Salah", n_matches=10, top_n=10):
    """Fit gradient boosting on PCA components and print an evaluation report.

    Rows with ``season_x`` before 2023-01-01 train the model, the remaining
    rows are the test set. Features are mean-imputed, standardised and
    projected with PCA before the regressor is fitted on
    ``total_points_rolling``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing ``season_x``, ``name``, ``GW``, ``position``,
        ``value``, ``total_points_rolling`` and every column in ``predictors``.
    predictors : list of str
        Feature columns; ``total_points_rolling`` is removed if present.
    n_components : int or float, default 0.95
        Passed to ``PCA(n_components=...)``.
    player_name : str, default "Mohamed Salah"
        Player whose most recent test rows are printed.
    n_matches : int, default 10
        Number of that player's most recent test rows to print.
    top_n : int, default 10
        Number of unique players listed per position.

    Returns
    -------
    None
        All results are printed.
    """
    cutoff = '2023-01-01'
    train = data[data["season_x"] < cutoff]
    # Copy the test rows: a prediction column is added below, and writing to a
    # filtered view of `data` triggers SettingWithCopyWarning.
    test = data[data["season_x"] >= cutoff].copy()
    predictors = [col for col in predictors if col != 'total_points_rolling']

    X_train = train[predictors]
    X_test = test[predictors]
    y_train = train['total_points_rolling']
    y_test = test['total_points_rolling']

    # Impute -> scale -> PCA, each fitted on the training rows only.
    # The imputer was previously created but never applied; PCA rejects NaN,
    # so it is applied first. It leaves the data unchanged when nothing is missing.
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test_scaled = scaler.transform(imputer.transform(X_test))

    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Explained variance per component and cumulatively.
    print("\nExplained Variance Ratio by Principal Component:")
    for number, ratio in enumerate(pca.explained_variance_ratio_, start=1):
        print(f"PC{number}: {ratio:.4f}")

    print("\nCumulative Explained Variance Ratio by Principal Component:")
    for number, cumulative in enumerate(pca.explained_variance_ratio_.cumsum(), start=1):
        print(f"PC{number}: {cumulative:.4f}")

    model = GradientBoostingRegressor(
        n_estimators=500,       # Number of boosting stages
        learning_rate=0.01,     # Shrinkage parameter
        max_depth=2,            # Maximum depth of each tree
        min_samples_split=10,   # Minimum number of samples required to split a node
        min_samples_leaf=5,     # Minimum number of samples required in a leaf node
        subsample=0.8,          # Fraction of samples used to fit each base learner
        random_state=42,        # Ensuring reproducibility
    )

    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    test['predicted_points'] = y_pred

    # Evaluation metrics on the test rows.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('Regularized Model with PCA Results:')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'R² Score: {r2}')

    # Per-player report. A missing player only skips this section; the
    # remaining reports are still produced.
    player_rows = test[test["name"] == player_name]
    if player_rows.empty:
        print("No data available for the specified player.")
    else:
        # The test rows span more than one season, so GW alone does not give
        # chronological order; sort by season first.
        recent = player_rows.sort_values(by=["season_x", "GW"], ascending=False).head(n_matches)
        results_df = pd.DataFrame({
            "Gameweek": recent["GW"].values,
            "Actual Points": recent["total_points_rolling"].values,
            "Predicted Points": recent["predicted_points"].values,
        })
        print(f"\n{player_name}'s Last {len(recent)} Matches:")
        print(results_df)

    # Importance of each principal component, largest first.
    component_names = [f"PC{i+1}" for i in range(X_train_pca.shape[1])]
    feature_importances = pd.DataFrame({
        'Feature': component_names,
        'Importance': model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    print(feature_importances)

    # Top unique players per position: sorting by prediction and keeping the
    # first row per name ranks each player by their highest predicted row.
    top_players_by_position = {}
    for position in test['position'].unique():
        ranked = test[test['position'] == position].sort_values(by='predicted_points', ascending=False)
        unique_players = ranked.drop_duplicates(subset=['name'], keep='first')
        top_players_by_position[position] = unique_players.head(top_n)[['name', 'predicted_points', 'value']]

    for position, df in top_players_by_position.items():
        print(f"\nTop {top_n} Players for Position: {position}")
        print(df)

# main func
# Columns kept out of the feature set for the PCA model; every other column
# of players_rolling is used as a predictor.
exclude_columns = ['season_x', 'name', 'position', 'team_x','team', 'kickoff_time', 'opp_team_name', 'total_points_rolling', 'penalties_missed_rolling', 'yellow_cards', 'was_home', 'assists', 'threat', 'bonus', 'minutes', 'bps', 'clean_sheets', 'elements', 'goals_conceded', 'goals_scored', 'ict_index', 'opponent_team', 'team_h_score', 'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'team_a_score', 'GW_rolling', 'GW', 'element', 'influence', 'transfers_out_rolling', 'creativity', 'fixture', 'transfers_out', 'total_points', 'transfers_balance', 'selected', 'own_goals_rolling','goals_conceded_rolling','red_cards_rolling','yellow_cards_rolling']
predictors = [col for col in players_rolling.columns if col not in exclude_columns]
make_predictions_with_gb_pca(players_rolling, predictors, n_components=0.95)

In [None]:
#GPT FTW(best team)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
from scipy.optimize import linprog

def make_predictions_with_gb_pca(data, predictors, n_components=0.95):
    """Fit gradient boosting on PCA components and return the scored test rows.

    Rows with ``season_x`` before 2023-08-01 train the model, the remaining
    rows are the test set. Features are standardised and projected with PCA
    before the regressor is fitted on ``total_points_rolling``.

    NOTE(review): elsewhere in this notebook ``season_x`` is parsed as
    1 January of the season's starting year, so this cut-off places the
    2023-24 season in the training set -- confirm that is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``season_x``, ``total_points_rolling`` and ``predictors``.
    predictors : list of str
        Feature columns; ``total_points_rolling`` is removed if present.
    n_components : int or float, default 0.95
        Passed to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the test rows with a ``predicted_points`` column.
    """
    cutoff = '2023-08-01'

    # Step 1: split into train and test. The test rows are copied because a
    # column is added to them below and callers add more columns afterwards;
    # writing to a filtered view of `data` triggers SettingWithCopyWarning.
    train = data[data["season_x"] < cutoff]
    test = data[data["season_x"] >= cutoff].copy()

    # The target must never be used as a feature.
    predictors = [col for col in predictors if col != 'total_points_rolling']

    X_train = train[predictors]
    X_test = test[predictors]
    y_train = train['total_points_rolling']
    y_test = test['total_points_rolling']

    # Step 2: standardise the features (fitted on the training rows only).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Step 3: reduce dimensionality with PCA.
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Step 4: shallow, slowly-learning gradient boosting.
    model = GradientBoostingRegressor(
        n_estimators=500,       # Number of boosting stages
        learning_rate=0.01,     # Shrinkage parameter
        max_depth=2,            # Maximum depth of each tree
        min_samples_split=12,   # Minimum number of samples required to split a node
        min_samples_leaf=8,     # Minimum number of samples required in a leaf node
        subsample=0.8,          # Fraction of samples used to fit each base learner
        random_state=42,        # Ensuring reproducibility
    )

    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)

    # Step 5: evaluate on the test rows.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('Regularized Model with PCA Results:')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'R² Score: {r2}')

    # Step 6: attach the predictions and hand the scored rows back.
    test['predicted_points'] = y_pred

    return test

def select_best_team_with_key_players(data, unavailable_players, key_players, budget=100):
    """Pick the 15-player squad with the highest total predicted points.

    The squad (2 GK, 5 DEF, 5 MID, 3 FWD) is chosen by one integer programme
    over all candidates, so that:

    * the position quotas are met exactly, key players included;
    * the total cost, key players included, stays within ``budget``;
    * every key player present in the candidate pool is selected;
    * each player can be picked at most once.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``name``, ``position``, ``value`` and ``predicted_points``.
        A player may appear on several rows; if a ``kickoff_time`` column is
        present the row with the latest value is used, otherwise the last row.
        The frame is not modified.
    unavailable_players : list-like of str
        Names that must not be selected (this also removes key players).
    key_players : list-like of str
        Names that must be part of the squad.
    budget : float, default 100
        Spending limit in units of ``value / 10``.

    Returns
    -------
    pandas.DataFrame
        One row per selected player, ordered GK, DEF, MID, FWD, with the extra
        columns ``adjusted_value`` and ``points_per_value``. Empty when no
        squad satisfies all constraints.
    """
    squad_quota = {'GK': 2, 'DEF': 5, 'MID': 5, 'FWD': 3}

    # Work on a private copy so the caller's frame gains no columns.
    pool = data.dropna(subset=['predicted_points', 'value']).copy()
    pool['adjusted_value'] = pool['value'] / 10
    pool['points_per_value'] = pool['predicted_points'] / pool['adjusted_value']
    pool = pool[~pool['name'].isin(unavailable_players)]
    pool = pool[pool['position'].isin(list(squad_quota))]

    # One candidate row per player. Without this the optimiser can pick the
    # same player several times (once per gameweek row).
    if 'kickoff_time' in pool.columns:
        pool = pool.sort_values('kickoff_time', kind='mergesort')
    pool = pool.drop_duplicates(subset='name', keep='last')

    candidate_names = set(pool['name'])
    missing_keys = [player for player in key_players if player not in candidate_names]
    if missing_keys:
        print(f"Warning: key players not among the available candidates: {missing_keys}")

    no_squad = pool.iloc[0:0]
    if pool.empty:
        print("No feasible solution found. Consider increasing budget or relaxing constraints.")
        return no_squad

    costs = pool['adjusted_value'].to_numpy(dtype=float)
    points = pool['predicted_points'].to_numpy(dtype=float)
    forced = pool['name'].isin(key_players).to_numpy()

    # One equality row per position: the selected players of that position
    # must add up to its quota.
    quota_rows = [(pool['position'] == pos).to_numpy(dtype=float) for pos in squad_quota]
    quota_sizes = list(squad_quota.values())
    # Key players are fixed to 1; everyone else is a free 0/1 decision.
    bounds = [(1, 1) if is_forced else (0, 1) for is_forced in forced]

    # integrality=1 makes every variable an integer, so the solution is a true
    # 0/1 selection rather than a fractional relaxation that has to be rounded.
    result = linprog(
        -points,                      # linprog minimises, so negate the points
        A_ub=[costs], b_ub=[budget],
        A_eq=quota_rows, b_eq=quota_sizes,
        bounds=bounds,
        integrality=[1] * len(pool),
        method='highs',
    )

    if not result.success:
        print("No feasible solution found. Consider increasing budget or relaxing constraints.")
        return no_squad

    chosen = pool[result.x > 0.5]
    all_selected_players = pd.concat([chosen[chosen['position'] == pos] for pos in squad_quota])

    total_value = all_selected_players['adjusted_value'].sum()
    total_points = all_selected_players['predicted_points'].sum()

    print(f"Selected Squad:\n{all_selected_players[['name', 'position', 'adjusted_value', 'predicted_points']]}")
    print(f"\nTotal Value: {total_value}")
    print(f"Total Predicted Points: {total_points}")

    return all_selected_players

# Call the function with regularization and PCA FIRST
# Build the predictor list (every players_rolling column not excluded here)
# and score the test rows with the PCA + gradient-boosting model.
exclude_columns = ['season_x', 'name', 'position', 'team_x','team', 'kickoff_time', 'opp_team_name', 'total_points_rolling', 'penalties_missed_rolling', 'yellow_cards', 'was_home', 'assists', 'threat', 'bonus', 'minutes', 'bps', 'clean_sheets', 'elements', 'goals_conceded', 'goals_scored', 'ict_index', 'opponent_team', 'team_h_score', 'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'team_a_score', 'GW_rolling', 'GW', 'element', 'influence', 'transfers_out_rolling', 'creativity', 'fixture', 'transfers_out', 'total_points', 'transfers_balance', 'selected', 'own_goals_rolling','goals_conceded_rolling','red_cards_rolling','yellow_cards_rolling']
predictors = [col for col in players_rolling.columns if col not in exclude_columns]
predicted_data = make_predictions_with_gb_pca(players_rolling, predictors, n_components=0.95)
# Hand-maintained list of player names to keep out of the squad.
unavailable_players = [
        "Kieran Tierney", "Takehiro Tomiyasu", "Fabio Ferreira Vieira", "Enes Unal",
        "Tyler Adams", "David Brooks", "Boubacar Kamara", "Tyrone Mings",
        "Joshua Dasilva", "Aaron Hickey", "Rico Henry", "Igor Thiago Nascimento Rodrigues",
        "Solomon March", "Bart Verbruggen", "Reece James", "Matheus Franca de Oliveira",
        "Youssef Ramalho Chermiti", "Nathan Patterson", "Jarrad Branthwaite",
        "James Garner", "Seamus Coleman", "Ashley Young", "James Tarkowski",
        "Bamidele Alli", "Harry Clarke", "George Hirst", "Nathan Broadhead",
        "Wesley Burns", "Kalvin Phillips", "Janoi Donacien", "Patson Daka",
        "Conor Coady", "Oscar Bobb", "Leny Yoro", "Tyrell Malacia", "Will Fish",
        "Victor Lindelof", "Luke Shaw", "Rasmus Winther Hojlund", "Sven Botman",
        "Jamaal Lascelles", "Lewis Miley", "Fabian Schar", "Sandro Tonali",
        "Callum Wilson", "Danilo dos Santos de Oliveira", "Gavin Bazunu",
        "Juan Larios", "Dominic Solanke", "Rodrigo Bentancur", "Nelson Semedo",
        "Enso Gonzalez Medina", "Leon Chiwone", "Sasa Kalajdzic","Jarell Quansah"
]

# Usage example
key_players = ['Erling Haaland']  # Players that must be part of the squad
best_team = select_best_team_with_key_players(predicted_data, unavailable_players, key_players)
best_team


In [None]:
#GPT ftw again but with pca(best team)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
from scipy.optimize import linprog

def make_predictions_with_gb_pca(data, predictors, n_components=0.95):
    """Score the test rows with a PCA + gradient-boosting model.

    The model is trained on rows whose ``season_x`` is before 2023-08-01 and
    applied to the remaining rows. Features are standardised, projected with
    PCA, and used to regress ``total_points_rolling``.

    NOTE(review): elsewhere in this notebook ``season_x`` is parsed as
    1 January of the season's starting year, so this cut-off places the
    2023-24 season in the training set -- confirm that is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``season_x``, ``total_points_rolling`` and ``predictors``.
    predictors : list of str
        Feature columns; ``total_points_rolling`` is removed if present.
    n_components : int or float, default 0.95
        Passed to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the test rows with a ``predicted_points`` column.
    """
    is_test = data["season_x"] >= '2023-08-01'

    # Step 1: split. The test rows are copied because columns are added to
    # them here and by callers; writing to a filtered view of `data` triggers
    # SettingWithCopyWarning.
    train = data[data["season_x"] < '2023-08-01']
    test = data[is_test].copy()

    # The target must never be used as a feature.
    feature_cols = [col for col in predictors if col != 'total_points_rolling']

    X_train, y_train = train[feature_cols], train['total_points_rolling']
    X_test, y_test = test[feature_cols], test['total_points_rolling']

    # Step 2: standardise the features (fitted on the training rows only).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Step 3: reduce dimensionality with PCA.
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Step 4: shallow, slowly-learning gradient boosting.
    model = GradientBoostingRegressor(
        n_estimators=500,       # Number of boosting stages
        learning_rate=0.01,     # Shrinkage parameter
        max_depth=2,            # Maximum depth of each tree
        min_samples_split=12,   # Minimum number of samples required to split a node
        min_samples_leaf=8,     # Minimum number of samples required in a leaf node
        subsample=0.8,          # Fraction of samples used to fit each base learner
        random_state=42,        # Ensuring reproducibility
    )

    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)

    # Step 5: evaluate on the test rows.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('Regularized Model with PCA Results:')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'R² Score: {r2}')

    # Step 6: attach the predictions and hand the scored rows back.
    test['predicted_points'] = y_pred

    return test

def select_best_team(data, unavailable_players, budget=100, min_predicted_points=4):
    """Pick a 15-player squad (2 GK, 5 DEF, 5 MID, 3 FWD) within one shared budget.

    Goalkeepers and defenders are ranked by ``predicted_points``. Midfielders
    and forwards are ranked by a weighted score::

        0.5 * predicted_points + 0.25 * goals_per_90 + 0.25 * assists_per_90

    ``frequency_goals_rolling`` / ``frequency_assists_rolling`` hold minutes
    per goal / assist (0 meaning none), so a larger value is worse. They are
    converted to per-90 rates (``90 / frequency``, 0 when the frequency is 0)
    before being rewarded.

    The whole squad is chosen by one integer programme, so the position quotas
    and the total budget are satisfied together and no player is picked twice.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``name``, ``position``, ``value``, ``predicted_points``,
        ``frequency_goals_rolling`` and ``frequency_assists_rolling``.
        A player may appear on several rows; if a ``kickoff_time`` column is
        present the row with the latest value is used, otherwise the last row.
        The frame is not modified.
    unavailable_players : list-like of str
        Names that must not be selected.
    budget : float, default 100
        Spending limit for the whole squad, in units of ``value / 10``.
    min_predicted_points : float, default 4
        Players predicted below this are not considered.

    Returns
    -------
    pandas.DataFrame
        One row per selected player, ordered GK, DEF, MID, FWD, with the extra
        columns ``adjusted_value`` and ``weighted_score``. Empty when no squad
        satisfies all constraints.
    """
    squad_quota = {'GK': 2, 'DEF': 5, 'MID': 5, 'FWD': 3}
    attacking_positions = ['MID', 'FWD']

    def per_90(minutes_per_event):
        # 90 / minutes-per-event = events per 90 minutes; 0 stays 0.
        rate = 90 / minutes_per_event.where(minutes_per_event > 0)
        return rate.fillna(0)

    # Work on a private copy so the caller's frame gains no columns.
    pool = data.dropna(subset=['predicted_points', 'value']).copy()
    pool['adjusted_value'] = pool['value'] / 10
    pool = pool[~pool['name'].isin(unavailable_players)]
    pool = pool[pool['position'].isin(list(squad_quota))]

    # One candidate row per player, otherwise the optimiser can select the
    # same player several times (once per gameweek row).
    if 'kickoff_time' in pool.columns:
        pool = pool.sort_values('kickoff_time', kind='mergesort')
    pool = pool.drop_duplicates(subset='name', keep='last')
    pool = pool[pool['predicted_points'] >= min_predicted_points].copy()

    pool['weighted_score'] = (
        0.5 * pool['predicted_points']
        + 0.25 * per_90(pool['frequency_goals_rolling'])
        + 0.25 * per_90(pool['frequency_assists_rolling'])
    )

    no_squad = pool.iloc[0:0]
    if pool.empty:
        print("No feasible solution found. Consider increasing budget or relaxing constraints.")
        return no_squad

    # Objective per player. The weighted score is divided by its 0.5 weight so
    # that one predicted point is worth the same in every position when the
    # positions compete for the shared budget; the ranking inside MID/FWD is
    # still the weighted-score ranking.
    is_attacker = pool['position'].isin(attacking_positions)
    objective = pool['predicted_points'].where(~is_attacker, pool['weighted_score'] / 0.5)

    costs = pool['adjusted_value'].to_numpy(dtype=float)
    quota_rows = [(pool['position'] == pos).to_numpy(dtype=float) for pos in squad_quota]
    quota_sizes = list(squad_quota.values())

    # integrality=1 makes every variable an integer, so the solution is a true
    # 0/1 selection rather than a fractional relaxation that has to be rounded.
    result = linprog(
        -objective.to_numpy(dtype=float),   # linprog minimises, so negate
        A_ub=[costs], b_ub=[budget],
        A_eq=quota_rows, b_eq=quota_sizes,
        bounds=[(0, 1)] * len(pool),
        integrality=[1] * len(pool),
        method='highs',
    )

    if not result.success:
        print("No feasible solution found. Consider increasing budget or relaxing constraints.")
        return no_squad

    chosen = pool[result.x > 0.5]
    all_selected_players = pd.concat([chosen[chosen['position'] == pos] for pos in squad_quota])

    total_points = all_selected_players['predicted_points'].sum()

    print(f"\nTotal Value: {all_selected_players['adjusted_value'].sum()}")
    print(f"Total Predicted Points: {total_points}")
    print(f"Final selected players count: {len(all_selected_players)}")

    return all_selected_players

# Call the function with regularization and PCA FIRST
# Build the predictor list (every players_rolling column not excluded here)
# and score the test rows with the PCA + gradient-boosting model.
exclude_columns = ['season_x', 'name', 'position', 'team_x','team', 'kickoff_time', 'opp_team_name', 'total_points_rolling', 'penalties_missed_rolling', 'yellow_cards', 'was_home', 'assists', 'threat', 'bonus', 'minutes', 'bps', 'clean_sheets', 'elements', 'goals_conceded', 'goals_scored', 'ict_index', 'opponent_team', 'team_h_score', 'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'team_a_score', 'GW_rolling', 'GW', 'element', 'influence', 'transfers_out_rolling', 'creativity', 'fixture', 'transfers_out', 'total_points', 'transfers_balance', 'selected', 'own_goals_rolling','goals_conceded_rolling','red_cards_rolling','yellow_cards_rolling']
predictors = [col for col in players_rolling.columns if col not in exclude_columns]
predicted_data = make_predictions_with_gb_pca(players_rolling, predictors, n_components=0.95)
# Hand-maintained list of player names to keep out of the squad.
unavailable_players = [
        "Kieran Tierney", "Takehiro Tomiyasu", "Fabio Ferreira Vieira", "Enes Unal",
        "Tyler Adams", "David Brooks", "Boubacar Kamara", "Tyrone Mings",
        "Joshua Dasilva", "Aaron Hickey", "Rico Henry", "Igor Thiago Nascimento Rodrigues",
        "Solomon March", "Bart Verbruggen", "Reece James", "Matheus Franca de Oliveira",
        "Youssef Ramalho Chermiti", "Nathan Patterson", "Jarrad Branthwaite",
        "James Garner", "Seamus Coleman", "Ashley Young", "James Tarkowski",
        "Bamidele Alli", "Harry Clarke", "George Hirst", "Nathan Broadhead",
        "Wesley Burns", "Kalvin Phillips", "Janoi Donacien", "Patson Daka",
        "Conor Coady", "Oscar Bobb", "Leny Yoro", "Tyrell Malacia", "Will Fish",
        "Victor Lindelof", "Luke Shaw", "Rasmus Winther Hojlund", "Sven Botman",
        "Jamaal Lascelles", "Lewis Miley", "Fabian Schar", "Sandro Tonali",
        "Callum Wilson", "Danilo dos Santos de Oliveira", "Gavin Bazunu",
        "Juan Larios", "Dominic Solanke", "Rodrigo Bentancur", "Nelson Semedo",
        "Enso Gonzalez Medina", "Leon Chiwone", "Sasa Kalajdzic","Jarell Quansah"
]

# Usage example: build the squad with select_best_team, the selector defined
# in this cell. The previous call used select_best_team_with_key_players,
# which this cell does not define; it only worked when an earlier cell had
# already been run, and it left select_best_team unused.
best_team = select_best_team(predicted_data, unavailable_players)
best_team


In [None]:
# Gradient boosting on PCA-reduced features, plus a greedy squad picker.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

def make_predictions_with_gb_pca(data, predictors, n_components=0.95, split_date='2023-08-01'):
    """Fit a gradient-boosting regressor on PCA-reduced features and predict the hold-out rows.

    The frame is split on ``season_x``: rows strictly before ``split_date``
    form the training set, rows on or after it form the test set. Features are
    standardised, projected with PCA, and used to predict
    ``total_points_rolling``. MSE, MAE and R² on the test set are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``season_x``, ``total_points_rolling`` and every column
        named in ``predictors``.
    predictors : list of str
        Candidate feature columns. ``total_points_rolling`` is removed if
        present so the target never leaks into the features.
    n_components : int or float, default 0.95
        Passed straight to ``sklearn.decomposition.PCA``.
    split_date : default '2023-08-01'
        Boundary compared against ``season_x``. Earlier in this notebook
        ``season_x`` is converted to datetimes; anything comparable with that
        column works here.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the test rows with an extra
        ``predicted_points`` column.

    Raises
    ------
    ValueError
        If either side of the split is empty.
    """
    target = 'total_points_rolling'

    train = data[data["season_x"] < split_date]
    # Explicit copy: a column is added to this frame below, and writing into a
    # boolean-mask slice of `data` triggers pandas' SettingWithCopyWarning.
    test = data[data["season_x"] >= split_date].copy()

    if train.empty or test.empty:
        raise ValueError(
            f"Splitting on season_x at {split_date!r} left "
            f"{len(train)} training rows and {len(test)} test rows; both must be non-empty."
        )

    # Never let the target column sit among the features.
    predictors = [col for col in predictors if col != target]

    X_train = train[predictors]
    X_test = test[predictors]
    y_train = train[target]
    y_test = test[target]

    # PCA is scale-sensitive, so standardise first. The scaler is fitted on
    # the training rows only and then applied to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Dimensionality reduction, again fitted on the training rows only.
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Many shallow trees with a small learning rate and row subsampling.
    model = GradientBoostingRegressor(
        n_estimators=500,       # Number of boosting stages
        learning_rate=0.01,     # Shrinkage parameter
        max_depth=2,            # Maximum depth of each tree
        min_samples_split=12,   # Minimum number of samples required to split a node
        min_samples_leaf=8,     # Minimum number of samples required in a leaf node
        subsample=0.8,          # Fraction of samples used to fit each base learner
        random_state=42,        # Reproducible subsampling
    )

    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)

    # Evaluation metrics on the hold-out rows.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('Regularized Model with PCA Results:')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'R² Score: {r2}')

    # y_pred is positional, in the same row order as `test`.
    test['predicted_points'] = y_pred
    return test

def greedy_select_team(data, unavailable_players, budget=100):
    """Greedily pick a squad of 2 GK, 5 DEF, 5 MID and 3 FWD within a budget.

    Candidate rows are ranked by ``score_to_cost`` (weighted score divided by
    ``value / 10``) and positions are filled in the order GK, DEF, MID, FWD,
    so earlier positions get first call on the budget. A row is taken when its
    position still has a free slot, its cost fits the remaining budget and the
    player has not been picked already. The squad may end up smaller than 15
    if the budget or the candidate pool runs out.

    ``data`` may hold several rows per player (for example one per gameweek);
    each player is selected at most once, using their highest-ranked row.
    Pre-filter ``data`` if a specific gameweek should be used instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``name``, ``position``, ``value``,
        ``predicted_points``, ``frequency_goals_rolling`` and
        ``frequency_assists_rolling``. It is not modified.
    unavailable_players : iterable of str or None
        Names to exclude from selection.
    budget : float, default 100
        Total spend allowed, in the same units as ``value / 10``.

    Returns
    -------
    pandas.DataFrame
        The selected rows in pick order, with the added columns
        ``adjusted_value``, ``weighted_score`` and ``score_to_cost``.
    """
    position_quotas = {'GK': 2, 'DEF': 5, 'MID': 5, 'FWD': 3}

    excluded = list(unavailable_players) if unavailable_players is not None else []

    # Work on a private copy so the caller's frame is left untouched and the
    # column assignments below never write into a slice.
    candidates = (
        data[~data['name'].isin(excluded)]
        .dropna(subset=['predicted_points'])
        .copy()
    )
    candidates['adjusted_value'] = candidates['value'] / 10

    # NOTE(review): earlier in the notebook frequency_goals_rolling is built as
    # minutes_rolling / goals_scored_rolling, i.e. minutes per goal, so a larger
    # value means slower scoring yet raises the score here. Presumably the
    # assists column is analogous. Confirm this weighting is intended.
    candidates['weighted_score'] = (
        0.5 * candidates['predicted_points'] +
        0.25 * candidates['frequency_goals_rolling'] +
        0.25 * candidates['frequency_assists_rolling']
    )
    candidates['score_to_cost'] = candidates['weighted_score'] / candidates['adjusted_value']
    candidates = candidates.sort_values(by='score_to_cost', ascending=False)

    names = candidates['name'].tolist()
    positions = candidates['position'].tolist()
    costs = candidates['adjusted_value'].tolist()

    chosen_rows = []        # positional row numbers, in pick order
    chosen_names = set()    # guards against picking the same player twice
    remaining_budget = budget

    for position, quota in position_quotas.items():
        filled = 0
        for row_number, (name, row_position, cost) in enumerate(zip(names, positions, costs)):
            if filled >= quota:
                break
            if row_position != position or name in chosen_names:
                continue
            # Written as `not (cost <= ...)` so a NaN cost is rejected.
            if not (cost <= remaining_budget):
                continue
            chosen_rows.append(row_number)
            chosen_names.add(name)
            remaining_budget -= cost
            filled += 1

    # One positional lookup instead of growing a frame row by row; this also
    # keeps the original column dtypes.
    selected_players = candidates.iloc[chosen_rows]

    total_points = selected_players['predicted_points'].sum()

    print(f"\nTotal Value: {selected_players['adjusted_value'].sum()}")
    print(f"Total Predicted Points: {total_points}")
    print(f"Finalised Squad: {len(selected_players)}")

    return selected_players


# Train the PCA + gradient-boosting model and collect its predictions.
exclude_columns = [
    'season_x', 'name', 'position', 'team_x', 'team', 'kickoff_time',
    'opp_team_name', 'total_points_rolling', 'penalties_missed_rolling',
    'yellow_cards', 'was_home', 'assists', 'threat', 'bonus', 'minutes',
    'bps', 'clean_sheets', 'elements', 'goals_conceded', 'goals_scored',
    'ict_index', 'opponent_team', 'team_h_score', 'own_goals',
    'penalties_missed', 'penalties_saved', 'red_cards', 'round',
    'team_a_score', 'GW_rolling', 'GW', 'element', 'influence',
    'transfers_out_rolling', 'creativity', 'fixture', 'transfers_out',
    'total_points', 'transfers_balance', 'selected', 'own_goals_rolling',
    'goals_conceded_rolling', 'red_cards_rolling', 'yellow_cards_rolling',
]
predictors = [
    column
    for column in players_rolling.columns
    if column not in exclude_columns
]
predicted_data = make_predictions_with_gb_pca(
    players_rolling,
    predictors,
    n_components=0.95,
)

# Names excluded from the greedy selection.
unavailable_players = [
    "Kieran Tierney", "Takehiro Tomiyasu", "Fabio Ferreira Vieira",
    "Enes Unal", "Tyler Adams", "David Brooks",
    "Boubacar Kamara", "Tyrone Mings", "Joshua Dasilva",
    "Aaron Hickey", "Rico Henry", "Igor Thiago Nascimento Rodrigues",
    "Solomon March", "Bart Verbruggen", "Reece James",
    "Matheus Franca de Oliveira", "Youssef Ramalho Chermiti", "Nathan Patterson",
    "Jarrad Branthwaite", "James Garner", "Seamus Coleman",
    "Ashley Young", "James Tarkowski", "Bamidele Alli",
    "Harry Clarke", "George Hirst", "Nathan Broadhead",
    "Wesley Burns", "Kalvin Phillips", "Janoi Donacien",
    "Patson Daka", "Conor Coady", "Oscar Bobb",
    "Leny Yoro", "Tyrell Malacia", "Will Fish",
    "Victor Lindelof", "Luke Shaw", "Rasmus Winther Hojlund",
    "Sven Botman", "Jamaal Lascelles", "Lewis Miley",
    "Fabian Schar", "Sandro Tonali", "Callum Wilson",
    "Danilo dos Santos de Oliveira", "Gavin Bazunu", "Juan Larios",
    "Dominic Solanke", "Rodrigo Bentancur", "Nelson Semedo",
    "Enso Gonzalez Medina", "Leon Chiwone", "Sasa Kalajdzic",
    "Jarell Quansah",
]

# Build and display the greedy squad. key_players is defined here but is not
# passed to greedy_select_team.
key_players = ['Erling Haaland']
best_team = greedy_select_team(predicted_data, unavailable_players)
best_team


In [None]:
# Cross-validation as an additional check on the model.
from sklearn.model_selection import cross_val_score

def evaluate_model_with_cv(data, predictors):
    """Print 5-fold cross-validated MSE, MAE and R² for a gradient-boosting regressor.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``total_points_rolling`` (the target) and every column
        named in ``predictors``.
    predictors : list of str
        Feature columns. ``total_points_rolling`` is dropped if present, so
        the model is never given its own target as an input.

    Returns
    -------
    None
        The three averaged scores are printed.
    """
    # Imported locally so the function does not rely on the cell's import line.
    from sklearn.model_selection import cross_validate

    target = 'total_points_rolling'

    # Feeding the target in as a feature would make every score look
    # near-perfect, so strip it exactly as make_predictions_with_gb_pca does.
    feature_cols = [col for col in predictors if col != target]

    X = data[feature_cols]
    y = data[target]

    model = GradientBoostingRegressor(random_state=42)

    # One cross-validation pass scores all three metrics on the same fitted
    # models, instead of refitting every fold once per metric.
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=5,
        scoring=('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'),
    )

    # The "neg_" scorers return negated errors, hence the sign flip.
    print(f'Cross-Validated MSE: {-cv_results["test_neg_mean_squared_error"].mean()}')
    print(f'Cross-Validated MAE: {-cv_results["test_neg_mean_absolute_error"].mean()}')
    print(f'Cross-Validated R² Score: {cv_results["test_r2"].mean()}')

# Cross-validate using the rolling-average columns (new_cols) as predictors.
evaluate_model_with_cv(data=players_rolling, predictors=new_cols)
