<a href="https://colab.research.google.com/github/503N-project-RC/model-training/blob/main/draft_player_count_history.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import holidays
!pip install workalendar
from workalendar.usa import UnitedStates  # can change
from datetime import datetime, timedelta

from google.colab import drive, files
import os



In [17]:
# load dataset
drive.mount('/content/drive')
drive_base_path = '/content/drive/My Drive/503Nproj/player-count-history'
file_path = os.path.join(drive_base_path, 'player_count_history.csv')
df = pd.read_csv(file_path, parse_dates=['date'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# feature engineering
us_holidays = holidays.US()
cal = UnitedStates()
df['is_holiday_holidays_lib'] = df['date'].apply(lambda x: int(x in us_holidays))
df['is_holiday_workalendar'] = df['date'].apply(lambda x: int(cal.is_holiday(x)))
df['day_of_week'] = df['date'].dt.dayofweek  # i.e., monday=0, tuesday=1, ..., sunday=6

In [19]:
# target column & features
target_col = 'total players'
player_columns = [col for col in df.columns if col not in ['date']]
features = player_columns + ['day_of_week', 'is_holiday_holidays_lib', 'is_holiday_workalendar']

In [20]:
# scaling
scaler_players = MinMaxScaler()
df[[target_col]] = scaler_players.fit_transform(df[[target_col]])  # train only on total players
print(f"Scaler was trained on {scaler_players.n_features_in_} features")


scaler_features = MinMaxScaler()
df[['day_of_week', 'is_holiday_holidays_lib', 'is_holiday_workalendar']] = scaler_features.fit_transform(df[['day_of_week', 'is_holiday_holidays_lib', 'is_holiday_workalendar']])

Scaler was trained on 1 features


In [21]:
print(df.isnull().sum()) # get number of missing values

date                       0
570                        0
730                        0
578080                     0
1172470                    0
                          ..
1086940                    0
total players              0
is_holiday_holidays_lib    0
is_holiday_workalendar     0
day_of_week                0
Length: 102, dtype: int64


In [22]:
print(df.dtypes)


date                       datetime64[ns]
570                                 int64
730                                 int64
578080                              int64
1172470                             int64
                                ...      
1086940                             int64
total players                     float64
is_holiday_holidays_lib           float64
is_holiday_workalendar            float64
day_of_week                       float64
Length: 102, dtype: object


In [23]:
# sequence preparation
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length, :-1])  # features
        y.append(data[i + seq_length, -1])  # target variable i.e., total players
    return np.array(X), np.array(y)

seq_length = 30  # use last 30 days to predict
data = df[features].values
X, y = create_sequences(data, seq_length)

# 80-20 training split
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

In [24]:
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}") #verify that X train & test shapes make sense
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (560, 30, 103), y_train shape: (560,)
X_test shape: (140, 30, 103), y_test shape: (140,)


In [25]:
# LSTM model training
model = Sequential([
    LSTM(100, return_sequences=True, input_shape=(seq_length, X.shape[2])),
    Dropout(0.2),
    LSTM(100, return_sequences=False),
    Dropout(0.2),
    Dense(50, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=30, batch_size=16, validation_data=(X_test, y_test))

Epoch 1/30


  super().__init__(**kwargs)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 108ms/step - loss: 0.0533 - val_loss: 0.0437
Epoch 2/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 92ms/step - loss: 0.0281 - val_loss: 0.0459
Epoch 3/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 65ms/step - loss: 0.0272 - val_loss: 0.0427
Epoch 4/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - loss: 0.0272 - val_loss: 0.0420
Epoch 5/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 0.0255 - val_loss: 0.0422
Epoch 6/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 0.0364 - val_loss: 0.0419
Epoch 7/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - loss: 0.0255 - val_loss: 0.0419
Epoch 8/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 0.0218 - val_loss: 0.0419
Epoch 9/30
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7d9813ab9890>

In [26]:
# model evaluation
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 147ms/step
RMSE: 0.20759424807690513


In [27]:
print(model.input_shape)  # expecting: (None, sequence_length, feature_size)


(None, 30, 103)


In [28]:
def predict_future(date):
    date = pd.to_datetime(date)
    day_of_week = date.weekday()
    is_holiday_holidays = int(date in us_holidays)
    is_holiday_workalendar = int(cal.is_holiday(date))

    last_seq = df[features].values[-seq_length:]  # ensure the shape is correct

    # ensure extra_features has same number of columns as features
    extra_features = np.zeros(len(features))  # fill NaNs with zeros
    extra_features[-3:] = [day_of_week / 6.0, is_holiday_holidays, is_holiday_workalendar]  # normalization

    last_seq = np.vstack([last_seq[1:], extra_features])  # (seq_length, num_features)
    last_seq = np.expand_dims(last_seq, axis=0)  # reshape sequence to (1, seq_length, num_features)
    last_seq = last_seq[:, :, :103]  # ensure correct feature count

    print(f"last_seq shape before prediction: {last_seq.shape}")  # ensure last sequence shape is what we expect
    print(f"Last input sequence:\n{last_seq}")

    prediction = model.predict(last_seq)
    print(f"Raw model output: {prediction[0][0]}")  # debugging

    predicted_players_scaled = np.array([[prediction[0][0]]])  # shape should be (1, 1)

    predicted_players = scaler_players.inverse_transform(predicted_players_scaled)[0][0]


    print(f"Predicted total players (scaled): {prediction[0][0]}")
    print(f"Predicted total players after inverse transform: {predicted_players}")

    return predicted_players

future_date = "2025-03-20"
predicted_players = predict_future(future_date)
print(f"Predicted total players on {future_date}: {int(predicted_players)}")

print(f"Scaler min: {scaler_players.min_}")
print(f"Scaler scale: {scaler_players.scale_}")
print("Training data min:", df["total players"].min())
print("Training data max:", df["total players"].max())

last_seq shape before prediction: (1, 30, 103)
Last input sequence:
[[[6.75521000e+05 1.65963400e+06 7.63746000e+05 ... 8.33333333e-01
   8.33333333e-01 0.00000000e+00]
  [6.91868000e+05 1.74401500e+06 7.94475000e+05 ... 1.00000000e+00
   1.00000000e+00 0.00000000e+00]
  [5.95634000e+05 1.55349000e+06 7.62311000e+05 ... 0.00000000e+00
   0.00000000e+00 1.00000000e+00]
  ...
  [5.67165000e+05 1.73793500e+06 8.53729000e+05 ... 6.66666667e-01
   6.66666667e-01 0.00000000e+00]
  [6.27292000e+05 1.82498900e+06 8.77807000e+05 ... 8.33333333e-01
   8.33333333e-01 0.00000000e+00]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
   5.00000000e-01 0.00000000e+00]]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Raw model output: 0.02458631992340088
Predicted total players (scaled): 0.02458631992340088
Predicted total players after inverse transform: 4784530.5
Predicted total players on 2025-03-20: 4784530
Scaler min: [-1.79565116]
Scaler scale: [3.80442