In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense, BatchNormalization, Dropout, Embedding, Flatten, Concatenate, Input
)
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1, l2

from time_series import create_features, ewma, player_moving_average, team_moving_average
from ml_pipeline import encode_cols

In [None]:
# Notebook display settings: never truncate columns, cap rendered rows at 10.
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

# Read the data set from the CSV file in the working directory.
df = pd.read_csv('player_data.csv')

In [None]:
# Run the project helpers in sequence, giving every stage its own name.
# (What each helper computes is defined in time_series / ml_pipeline.)
featured_df = create_features(df)
player_avg_df = player_moving_average(featured_df)
merged_df = team_moving_average(player_avg_df)

# Rows containing any missing value are dropped before the columns are encoded.
df = encode_cols(merged_df.dropna())

In [None]:
# Model input columns. The order matters: it is the column order of the
# matrices handed to the scaler and the network.
features = [
    # player-level columns with the `_ewma` suffix
    'bps_ewma', 'total_points_ewma', 'threat_ewma', 'creativity_ewma',
    'yellow_cards_ewma', 'saves_ewma', 'goals_scored_ewma', 'assists_ewma',
    'clean_sheets_ewma',
    # transfer ranks
    'transfers_in_rank', 'transfers_out_rank',
    'bonus_ewma',
    'value',
    # team / opponent columns with the `_ewma` suffix
    'scored_opp_ewma', 'conceded_opp_ewma',
    'scored_team_ewma', 'conceded_team_ewma',
    'was_home',
    # position_* columns (presumably indicator flags from encode_cols -- confirm)
    'position_GK', 'position_DEF', 'position_MID', 'position_FWD',
]

In [None]:
target = "total_points"

# Chronological split boundaries.
train_date = '2022-07-01'  # rows strictly before this date form the training set
test_date = '2024-07-20'   # rows strictly after this date form the test set

# Everything between the two boundaries (inclusive) is the validation set.
train_data = df[df["date"] < train_date]
val_data = df[(df["date"] >= train_date) & (df["date"] <= test_date)]
test_data = df[df["date"] > test_date]

# Feature matrix / target vector for each split.
X_train, y_train = train_data[features], train_data[target]
X_val, y_val = val_data[features], val_data[target]
X_test, y_test = test_data[features], test_data[target]

# Report absolute split sizes, then each split's share of all rows in percent.
n_train, n_val, n_test = len(train_data), len(val_data), len(test_data)
n_total = n_train + n_val + n_test

for split_size in (n_train, n_val, n_test):
    print(split_size)

for label, split_size in (('train', n_train), ('test', n_test), ('val', n_val)):
    print(label, split_size / n_total * 100)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Seed NumPy and TensorFlow with the same value for reproducibility.
SEED = 1
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Early-stopping callback passed to the training call.
early_stopping = EarlyStopping(
    monitor='val_loss',          # watch the validation loss
    patience=10,                 # epochs without improvement before stopping
    restore_best_weights=True,   # roll back to the best weights seen
)

# dropout

In [None]:
# Fully connected regression network: <n_features> -> 32 -> 16 -> 1.
n_features = X_train_scaled.shape[1]

modeldrop = Sequential([
    Dense(units=n_features, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=16, activation='relu'),
    # Linear output for regression. A ReLU here cannot emit values below 0 and
    # has zero gradient whenever its pre-activation is negative, so the unit
    # can get stuck predicting a constant 0 under the MSE loss.
    Dense(units=1, activation='linear'),
])
modeldrop.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

# `dropout` holds the Keras History object returned by fit(); the name is kept
# unchanged so anything that already refers to it keeps working.
dropout = modeldrop.fit(
    X_train_scaled, y_train,
    epochs=30,
    batch_size=32,
    validation_data=(X_val_scaled, y_val),
    callbacks=[early_stopping],
)

# **Predictions**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Predictions on the held-out test split
# ---------------------------------------------------------------------------
# test_data was produced by boolean-indexing df; take an explicit copy before
# adding columns so pandas does not raise SettingWithCopyWarning.
test_data = test_data.copy()

# The network has a single output unit, so predict() returns an (n, 1) array;
# flatten it to one value per row before storing it as a column.
y_pred = modeldrop.predict(X_test_scaled)
test_data['predicted_points'] = np.ravel(y_pred)
test_data['points_per_mil'] = test_data["predicted_points"] / test_data["value"]

# Select relevant columns and round predicted points (copy: this is a slice too).
predicted_df = test_data[[
    "name", "season", "round", "predicted_points", "total_points",
    "team", "position", "value", "opp_team_name"
]].copy()
predicted_df['predicted_points'] = predicted_df['predicted_points'].round(2)

# Per (season, round): top 10 by prediction, top 5 by actual points, and all
# rows ordered by prediction. Later cells index into these frames.
grouped_predictions = predicted_df.groupby(["season", "round"]).apply(lambda x: x.nlargest(10, "predicted_points"))
grouped_actual = predicted_df.groupby(["season", "round"]).apply(lambda x: x.nlargest(5, "total_points"))
grouped_all = predicted_df.groupby(["season", "round"]).apply(lambda x: x.sort_values("predicted_points", ascending=False))

# ---------------------------------------------------------------------------
# Per-player totals over the test period: predicted vs. actual
# ---------------------------------------------------------------------------
total_predicted_points = grouped_all.groupby('name')['predicted_points'].sum()
total_actual_points = grouped_all.groupby('name')['total_points'].sum()

season_totals = pd.DataFrame({
    'player': total_predicted_points.index,
    'total predicted points': total_predicted_points,
    'total points': total_actual_points
})
sorted_totals = season_totals.sort_values('total points', ascending=False)

# Scatter of actual vs. predicted totals with the y = x reference line.
fig, ax = plt.subplots()
sorted_totals.plot(
    kind='scatter', x='total predicted points', y='total points',
    title="Points vs Predicted Points", xlim=(0, 300), ylim=(0, 300), ax=ax
)
ax.plot([0, 300], [0, 300], color='red', linestyle='--')
plt.show()

# ---------------------------------------------------------------------------
# Dream team: best-predicted players per position for every (season, round)
# ---------------------------------------------------------------------------
num_players_needed = {"GK": 1, "DEF": 3, "MID": 5, "FWD": 2}

# Collect the per-position picks in a list and concatenate once. This avoids
# the quadratic concat-in-a-loop and keeps the numeric dtypes, which an empty
# object-dtype seed frame would have degraded.
# The loop variable is `gameweek`, not `round`, so the builtin round() is not
# shadowed for the rest of the kernel session.
round_keys = predicted_df[['season', 'round']].drop_duplicates()
picks = []
for season, gameweek in round_keys.itertuples(index=False, name=None):
    round_df = predicted_df[
        (predicted_df['season'] == season) & (predicted_df['round'] == gameweek)
    ]
    for position, num_players in num_players_needed.items():
        candidates = round_df[round_df['position'] == position]
        # nlargest picks the highest predicted players; head() would merely
        # take the first rows in file order.
        picks.append(candidates.nlargest(num_players, 'predicted_points'))

# pd.concat raises on an empty list, so fall back to an empty, correctly typed
# frame when the test split has no rows.
dream_team = pd.concat(picks) if picks else predicted_df.iloc[0:0].copy()

# Per-round dream-team totals and the running actual total within each season.
dream_team_sum = dream_team.groupby(['season', 'round']).agg({
    'predicted_points': 'sum',
    'total_points': 'sum'
}).reset_index()
dream_team_sum['total_points_cumsum'] = dream_team_sum.groupby('season')['total_points'].cumsum()

# Captain: exactly one player per round, the one with the highest prediction.
# rank(method='first') breaks ties by row order, so tied predictions can no
# longer double several players in the same round.
prediction_rank = dream_team.groupby(['season', 'round'])['predicted_points'].rank(
    method='first', ascending=False
)
is_captain = (prediction_rank == 1).to_numpy()
dream_team['doubled_highest_points'] = np.where(
    is_captain, dream_team['total_points'] * 2, dream_team['total_points']
)

# Print summary statistics
print('Dream team total sum:', dream_team['total_points'].sum())
print('Dream team total sum with captain points doubled:', dream_team['doubled_highest_points'].sum())

# Filter for the 2024-25 season
df_2024_25 = dream_team[dream_team['season'] == '2024-25']
print('Dream team 2024-25 sum:', df_2024_25['total_points'].sum())
print('Dream team 2024-25 sum with captain points doubled:', df_2024_25['doubled_highest_points'].sum())

In [None]:
# Predicted vs. actual points for one player, plotted as two lines.
player_rows = grouped_all['name'] == 'Alisson Ramses Becker'
grouped_all.loc[player_rows, ['predicted_points', 'total_points']].plot()

In [None]:
# Remove the row-display cap, then show the last 20 rows of the top-10 table.
pd.options.display.max_rows = None
grouped_predictions.tail(20)

In [None]:
# Persist the trained network and the fitted scaler next to the notebook.
MODEL_PATH = 'model.h5'
SCALER_PATH = 'scaler.joblib'

modeldrop.save(MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)