# NRL Feature Map and Machine Learning Model

## Imports

In [None]:
!pip install scikit-learn
!pip install tensorflow
!pip install numpy
!pip install pandas

In [2]:
import pandas as pd 
import json
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import r2_score
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

## Extracting Data from the JSON

In [3]:
# Club names; a team's position in this list is its numeric id everywhere below
teams = [
    "Broncos",
    "Roosters",
    "Wests Tigers",
    "Rabbitohs",
    "Storm",
    "Eels",
    "Raiders",
    "Knights",
    "Dragons",
    "Sea Eagles",
    "Panthers",
    "Sharks",
    "Bulldogs",
    "Dolphins",
    "Titans",
    "Cowboys",
    "Warriors",
]

# Per-team columns recorded for every round, in storage order
variables = [
    "Year",
    "Win",
    "Defense",
    "Attack",
    "Margin",
    "Home",
    "Versus",
    "Round",
]

# Seasons available in the data file: 2008-2018 inclusive, then 2022 and 2023
years = [*range(2008, 2019), 2022, 2023]

In [5]:
# Read the multi-season results file; only its 'NRL' entry is used.
with open('../data/nrl_data_multi_years_2.json', 'r') as file:
    data = json.load(file)['NRL']

# Map each season to its round data. The 'NRL' list is indexed by the season's
# position in `years`, and each element is keyed by the season as a string.
# NOTE(review): this relies on the file listing seasons in the same order as
# `years` - confirm against the data file if `years` is ever edited.
years_arr = {
    year: data[position][str(year)]
    for position, year in enumerate(years)
}


In [None]:
# Empty frame with one column per (team, variable) pair, named "<team> <variable>";
# columns are grouped by team, and ordered by `variables` within each team.
df = pd.DataFrame(
    columns=[
        f"{team} {variable}"
        for team in teams
        for variable in variables
    ]
)

In [None]:
# Number of rounds probed per season; rounds a season does not contain are skipped.
MAX_ROUNDS = 26

# Marker written to a team's Win/Defense/Attack/Home/Versus slots when it has a bye.
BYE = -1


def build_round_features(round_data, year, round_number):
    """Encode one round of games as a flat integer feature vector.

    The vector holds ``len(variables)`` consecutive slots per team, in the order
    of ``teams``. Within a team the slots are:
    Year, Win, Defense (points conceded), Attack (points scored), Margin,
    Home (1 = home side, 0 = away side), Versus (opponent's index in ``teams``)
    and Round.

    Teams that do not appear in any game of the round are treated as having a
    bye: their Margin is 0 and Win/Defense/Attack/Home/Versus are ``BYE``.

    Args:
        round_data: iterable of game dicts with the keys 'Home', 'Home_Score',
            'Away' and 'Away_Score'.
        year: season the round belongs to.
        round_number: 1-based round number.

    Returns:
        numpy integer array of length ``len(teams) * len(variables)``.

    Raises:
        KeyError: a game dict lacks one of the expected keys.
        ValueError: a score cannot be parsed as an int, or a team name is not
            present in ``teams``.
    """
    n_vars = len(variables)
    features = np.zeros(len(teams) * n_vars, dtype=int)
    played = set()

    for game in round_data:
        h_team, a_team = game['Home'], game['Away']
        h_score, a_score = int(game['Home_Score']), int(game['Away_Score'])
        h_idx, a_idx = teams.index(h_team), teams.index(a_team)

        # NOTE(review): `>=` means a draw is recorded as a win for BOTH sides;
        # kept as-is because downstream cells were built on this encoding.
        features[h_idx * n_vars:(h_idx + 1) * n_vars] = [
            year, h_score >= a_score, a_score, h_score, h_score - a_score,
            1, a_idx, round_number,
        ]
        features[a_idx * n_vars:(a_idx + 1) * n_vars] = [
            year, a_score >= h_score, h_score, a_score, a_score - h_score,
            0, h_idx, round_number,
        ]

        # Remember who played so the remaining teams can be marked as byes
        played.update((h_team, a_team))

    for idx, team in enumerate(teams):
        if team not in played:
            features[idx * n_vars:(idx + 1) * n_vars] = [
                year, BYE, BYE, BYE, 0, BYE, BYE, round_number,
            ]

    return features


# One feature vector per successfully parsed round, in season/round order
all_store = []

# (year, round, error) for rounds that exist in the data but could not be parsed
skipped_rounds = []

for year in years:
    for round_idx in range(MAX_ROUNDS):
        try:
            # Rounds are stored as a list of single-key dicts keyed by "1", "2", ...
            round_data = years_arr[year][round_idx][str(round_idx + 1)]
        except (IndexError, KeyError, TypeError):
            # This season has no such round - nothing to record
            continue

        try:
            round_store = build_round_features(round_data, year, round_idx + 1)
        except (KeyError, ValueError, TypeError) as err:
            # Keep the best-effort behaviour (drop the round) but make it visible
            skipped_rounds.append((year, round_idx + 1, repr(err)))
            continue

        all_store.append(round_store)

# Build the frame in one go: faster than appending row by row, and re-running
# this cell no longer duplicates rows.
df = pd.DataFrame(
    all_store,
    columns=[f"{team} {variable}" for team in teams for variable in variables],
)

if skipped_rounds:
    print(f"Skipped {len(skipped_rounds)} round(s) that could not be parsed:")
    for skipped_year, skipped_round, reason in skipped_rounds:
        print(f"  {skipped_year} round {skipped_round}: {reason}")

In [None]:
# Show the assembled table (one row per stored round, one column per
# "<team> <variable>" pair) so it can be inspected before feature engineering
df

### Creating the Feature Map

In [None]:
# Number of preceding rounds summarised as a team's recent form
GAME_HISTORY = 3


def get_game_history(year, round_, team, data=None, history=None):
    """Summarise a team's form over the rounds immediately before ``round_``.

    The window covers rounds ``round_ - history`` .. ``round_ - 1`` of the given
    season. Rounds are selected through the team's "Round" column rather than by
    row position, so a season with a missing round can never pull the target
    round (or a later one) into the window. Byes inside the window are counted
    and then excluded from the statistics.

    Args:
        year: season to look at.
        round_: 1-based round being predicted; only earlier rounds are used.
        team: team name as used in the column prefixes (e.g. "Storm").
        data: frame with the "<team> <variable>" columns; defaults to the
            notebook-level ``df``.
        history: window length in rounds; defaults to ``GAME_HISTORY``.

    Returns:
        Tuple ``(win, defense, attack, margin, byes, games_at_home,
        defense_mean, attack_mean, margin_mean, year)`` where ``win`` is the
        mean of the Win column, ``defense``/``attack``/``margin`` are medians,
        the ``*_mean`` values are means, ``byes`` is the number of byes in the
        window and ``games_at_home`` is the mean of the Home column. The
        statistics are NaN when the team played no game in the window.
    """
    frame = df if data is None else data
    window = GAME_HISTORY if history is None else history

    # Restrict to the requested season, then to the rounds inside the window
    season = frame[frame[team + " Year"] == year]
    rounds = season[team + " Round"]
    recent = season[(rounds >= round_ - window) & (rounds < round_)]

    # Byes carry -1 in the Win column: count them, then drop them
    is_bye = recent[team + " Win"] == -1
    byes = int(is_bye.sum())
    played = recent[~is_bye]

    # Win rate plus outlier-resistant (median) summaries
    win = played[team + " Win"].mean()
    defense = played[team + " Defense"].median()
    attack = played[team + " Attack"].median()
    margin = played[team + " Margin"].median()

    # Mean summaries of the same columns
    defense_mean = played[team + " Defense"].mean()
    attack_mean = played[team + " Attack"].mean()
    margin_mean = played[team + " Margin"].mean()

    # Share of the played games in which the team was the home side
    games_at_home = played[team + " Home"].mean()

    return win, defense, attack, margin, byes, games_at_home, defense_mean, attack_mean, margin_mean, year


### Creating the Learning Data / Extending the Feature Map

In [None]:
# Build the supervised data set: one sample per (team, game played).
#   X row: [team idx, opponent idx, *team form, *opponent form]
#   y row: [team idx, opponent idx, team won, opponent won, big win]
# NOTE(review): every game therefore appears twice (once from each side), so a
# random train/validation split can place mirrored samples on both sides.
X, y = [], []

# Absolute margin (in points) above which a result counts as a "big win"
BIG_WIN_MARGIN = 13

for c_team_idx, team in enumerate(teams):
    # Per-game columns for this team. Deliberately NOT bound to names such as
    # `years` or `round`, which would clobber the season list / the builtin.
    team_games = zip(
        df[team + " Versus"],
        df[team + " Win"],
        df[team + " Round"],
        df[team + " Year"],
        df[team + " Margin"],
    )

    for versed_team, win, game_round, game_year, margin in team_games:
        # Skip byes, and early rounds that lack a full GAME_HISTORY window
        if win == -1 or game_round <= GAME_HISTORY:
            continue

        # The opponent's result is the complement of this team's result
        v_win_ = 0 if win == 1 else 1

        big_win = 1 if abs(margin) > BIG_WIN_MARGIN else 0

        X.append([
            c_team_idx,
            versed_team,
            *get_game_history(game_year, game_round, team),
            *get_game_history(game_year, game_round, teams[versed_team]),
        ])
        y.append([c_team_idx, versed_team, win, v_win_, big_win])


### Training the Model

In [None]:
# Hold out 30% of the samples for validation; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    np.array(X), np.array(y), test_size=0.3, shuffle=True, random_state=42
)

# Feature scaling: fit on the training split only, then reuse for validation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Fully connected regression network with one output per column of y
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))  # Dropout layer to reduce overfitting
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))  # Dropout layer to reduce overfitting
model.add(Dense(32, activation='relu'))
model.add(Dense(5))

# Training configuration
num_epochs = 1000  # upper bound; training may stop earlier (see below)
batch_size = 32

# Learning-rate schedule. Because final > initial, the per-epoch factor is > 1:
# the rate is multiplied by it once per epoch (staircase), moving from
# `initial_learning_rate` toward `final_learning_rate` over `num_epochs`.
initial_learning_rate = 0.00001
final_learning_rate = 0.001
learning_rate_decay_factor = (final_learning_rate / initial_learning_rate)**(1/num_epochs)

# Number of optimizer steps in one epoch. Rounded up (the last partial batch is
# a step too) and never 0, which would make the schedule divide by zero.
steps_per_epoch = max(1, int(np.ceil(len(X_train_scaled) / batch_size)))

lr_schedule = ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=steps_per_epoch,
    decay_rate=learning_rate_decay_factor,
    staircase=True)

# Early stopping: halt once the training loss has moved by less than
# `loss_change_threshold` for `early_stop_patience` consecutive epochs.
previous_loss = None
no_loss_change_epochs = 0
loss_change_threshold = 1e-5
early_stop_patience = 5

# Compile the model
model.compile(optimizer=Adam(learning_rate=lr_schedule), loss='mse')

# Per-epoch metrics collected during training
train_r2_scores = []
val_r2_scores = []
train_losses = []

for epoch in range(num_epochs):
    # One pass over the training data
    history = model.fit(X_train_scaled, y_train, batch_size=batch_size, epochs=1, verbose=2)

    # R-squared on both splits (verbose=0 keeps progress bars out of the output)
    y_train_pred = model.predict(X_train_scaled, verbose=0)
    y_val_pred = model.predict(X_val_scaled, verbose=0)

    train_r2_scores.append(r2_score(y_train, y_train_pred))
    val_r2_scores.append(r2_score(y_val, y_val_pred))

    train_loss = history.history['loss'][0]
    train_losses.append(train_loss)

    # Count consecutive epochs in which the loss barely moved
    if previous_loss is not None and abs(previous_loss - train_loss) < loss_change_threshold:
        no_loss_change_epochs += 1
    else:
        no_loss_change_epochs = 0

    # Set the current loss as the previous loss for the next epoch
    previous_loss = train_loss

    if no_loss_change_epochs >= early_stop_patience:
        print(f"Training stopped early at epoch {epoch + 1} due to no significant loss change.")
        break

# Final R-squared scores
final_train_r2, final_val_r2 = train_r2_scores[-1], val_r2_scores[-1]

print(f"Final Training R-squared: {final_train_r2:.4f}")
print(f"Final Validation R-squared: {final_val_r2:.4f}")

#### Visualising the ML Model

In [None]:

# Training loss per epoch. The x-axis is derived from the number of recorded
# losses, not from `num_epochs`: early stopping can end training sooner, and
# mismatched lengths would make matplotlib raise.
epochs_run = range(1, len(train_losses) + 1)

plt.figure(figsize=(8, 6))
plt.plot(epochs_run, train_losses, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error (MSE)')
plt.title('Training Loss over Epochs')
plt.legend()
plt.grid(True)
plt.show()

# R-squared per epoch for both splits
plt.figure(figsize=(8, 6))
plt.plot(train_r2_scores, label='Training R-squared')
plt.plot(val_r2_scores, label='Validation R-squared')
plt.xlabel('Epoch')
plt.ylabel('R-squared Score')
plt.title('R-squared Score over Epochs')
plt.legend()
plt.grid(True)
plt.show()

# True vs. predicted values on the training split for the first four outputs
y_train_pred = model.predict(X_train_scaled)
y_val_pred = model.predict(X_val_scaled)

fig, axs = plt.subplots(2, 2, figsize=(10, 10))
for target_idx, ax in enumerate(axs.flat):
    ax.scatter(y_train[:, target_idx], y_train_pred[:, target_idx], alpha=0.5)
    ax.set_title(f'Target {target_idx + 1}')
    ax.set(xlabel='True Values', ylabel='Predicted Values')

plt.tight_layout()
plt.show()

### Testing the Prediction

In [None]:


# Season and round whose fixtures are predicted; form is taken from the
# GAME_HISTORY rounds before PREDICT_ROUND.
PREDICT_YEAR = 2023
PREDICT_ROUND = 22

# Fixtures to predict. Names must match `teams` exactly ("Wests Tigers", not
# "West Tigers"), otherwise the index lookup below raises ValueError.
wkd_matches = [["Broncos", "Rabbitohs"], ["Sharks", "Bulldogs"], ["Panthers", "Eels"], ["Raiders", "Wests Tigers"], ["Cowboys", "Knights"], ["Storm", "Warriors"], ["Sea Eagles", "Roosters"], ["Dolphins", "Dragons"]]

# Iterate over each weekend match
for wkd_match in wkd_matches:
    # Numeric ids of the two teams
    team_1 = teams.index(wkd_match[0])
    team_2 = teams.index(wkd_match[1])

    # Same feature layout as the training rows: ids, then each side's recent form
    pred_in_2 = [
        team_1,
        team_2,
        *get_game_history(PREDICT_YEAR, PREDICT_ROUND, teams[team_1]),
        *get_game_history(PREDICT_YEAR, PREDICT_ROUND, teams[team_2]),
    ]

    # The network was trained on standardised features, so the input has to go
    # through the same fitted scaler before prediction.
    pred_in_scaled = scaler.transform(np.array([pred_in_2], dtype=float))

    # Output columns: [team 1 id, team 2 id, team 1 win, team 2 win, big win]
    predictions = model.predict(pred_in_scaled, verbose=0)[0]

    # The side with the larger predicted win score is reported as the winner
    winner = teams[team_1] if predictions[2] > predictions[3] else teams[team_2]
    print(f"{winner} wins\t\t {teams[team_1]}: {predictions[2]:.4f}\t{teams[team_2]}: {predictions[3]:.4f}\t\tBig Win {predictions[4]}")
