In [None]:
%load_ext autoreload
%autoreload 2
import torch
from torch import nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import json
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from neural import *
from dataset import *

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# load data plus some additional preprocessing
# Each non-blank line of the features file is one JSON-encoded player record.
# Every player that is kept contributes one row to input_features and one row
# to output_labels.
input_features = []
output_labels = []

# Multi-hot position encoding over the slots [C, LW, RW, D]; the generic codes
# "W" and "F" switch on every slot they cover.
position_mapping = {"C": [0], "LW": [1], "RW": [2], "D": [3], "W": [1,2], "F": [0,1,2]}
left_right_mapping = {"L": 0, "R": 1, "N": 0.5}  # N for neither or unknown

num_no_prenhl_seasons = 0  # players skipped because they have no non-NHL season
numSamples = 0  # players kept (this counter used to be incremented without being initialised)

with open("playerData_features.txt", mode="r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        player_data = json.loads(line)

        player_input_features = []
        player_output_labels = []

        # multihot encode position
        positions = [0] * 4
        for pos in player_data.get("detailedPosition"):
            for i in position_mapping[pos]:
                positions[i] = 1
        player_input_features.extend(positions)

        # shooting side; a missing or empty value is encoded as "N" (0.5)
        player_input_features.append(left_right_mapping[player_data.get("shoots") or "N"])

        # if player_data.get("height"):
        #     player_input_features.append(player_data.get("height"))
        # else:
        #     continue
        # if player_data.get("weight"):
        #     player_input_features.append(player_data.get("weight"))
        # else:
        #     continue

        nhl_seasons = torch.zeros(6, dtype=torch.float32)
        num_nhl_seasons = 0
        nonnhl_seasons = torch.zeros(6, dtype=torch.float32)
        num_nonnhl_seasons = 0
        # average stats for nhl seasons and non nhl seasons
        for season in player_data.get("seasonStats", []):
            # maybe could add year later
            # could use normal plus minus instead of per game
            season_data = torch.tensor(
                [
                    season.get("gamesPlayed_log"),
                    season.get("goalsPerGame"),
                    season.get("assistsPerGame"),
                    season.get("pointsPerGame"),
                    season.get("penaltyMinsPerGame"),
                    season.get("plusMinusPerGame"),
                ],
                dtype=torch.float32,
            )
            if season.get("league", "") == "NHL":
                nhl_seasons += season_data
                num_nhl_seasons += 1
            else:
                nonnhl_seasons += season_data
                num_nonnhl_seasons += 1

        # players without any non-NHL season are dropped entirely
        if num_nonnhl_seasons == 0:
            num_no_prenhl_seasons += 1
            continue
        numSamples += 1

        # average stats over all seasons; a player with no NHL season keeps an
        # all-zero NHL vector (the divisor is clamped to 1)
        nhl_seasons /= max(num_nhl_seasons, 1)
        nonnhl_seasons /= num_nonnhl_seasons  # always > 0 after the check above

        # NOTE(review): the NHL averages are used as model inputs and the non-NHL
        # averages as targets. If the goal is to predict NHL performance from
        # pre-NHL seasons, these two are swapped — confirm the intended direction.
        player_input_features.extend(nhl_seasons.tolist())
        player_output_labels.extend(nonnhl_seasons.tolist())

        input_features.append(player_input_features)
        output_labels.append(player_output_labels)

print(f"number of players with no non NHL seasons: {num_no_prenhl_seasons}")
print(f"number of samples: {len(input_features)}")

print("numInputFeatures:", len(input_features[0]))
print("numOutputLabels:", len(output_labels[0]))


# Hold-out split: the final VALIDATION_SIZE + TEST_SIZE samples (in the order
# they were loaded) are reserved for validation and testing; everything before
# them is training data.
# NOTE(review): the samples are not shuffled before splitting, so the split
# follows the order of the input file — confirm that ordering is not biased.
VALIDATION_SIZE = 400
TEST_SIZE = 400
num_holdout = VALIDATION_SIZE + TEST_SIZE

# With too few samples the negative slices below would silently yield an empty
# training set, so fail early with a clear message instead.
if len(input_features) <= num_holdout:
    raise ValueError(
        f"need more than {num_holdout} samples to build train/validation/test splits, "
        f"got {len(input_features)}"
    )

train_dataset = NHLDataset(input_features[:-num_holdout], output_labels[:-num_holdout])
validation_dataset = NHLDataset(input_features[-num_holdout:-TEST_SIZE], output_labels[-num_holdout:-TEST_SIZE])
test_dataset = NHLDataset(input_features[-TEST_SIZE:], output_labels[-TEST_SIZE:])


batch_size = 64
# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# model definition

# TODO change model shape
# Layer widths are taken from the prepared data (currently 11 inputs and
# 6 outputs) so the model stays in sync if the feature set changes.
num_input_features = len(input_features[0])
num_output_labels = len(output_labels[0])
model = MLP(input_size=num_input_features, output_size=num_output_labels, hidden_size=30, num_hidden_layers=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

print(model)
print(f"number of parameters: {sum(p.numel() for p in model.parameters())}")

# Loss history and epoch counter are reset together with the model, so
# re-running this cell starts training from scratch.
train_losses, validation_losses = [], []
cur_epoch = 0

def trainModel(model: nn.Module, dataloader: torch.utils.data.DataLoader, criterion: nn.Module, optimizer: torch.optim.Optimizer, device: torch.device) -> float:
    """Train ``model`` for one epoch and return the average loss.

    Args:
        model: network to optimise; it is switched to training mode.
        dataloader: iterable of ``(x, y)`` batches that also supports ``len()``.
        criterion: callable mapping ``(output, y)`` to a scalar loss tensor.
        optimizer: optimizer that updates ``model``'s parameters.
        device: device every batch is moved to before the forward pass.

    Returns:
        The mean of the per-batch losses. Each batch counts equally, so a
        smaller final batch carries the same weight as a full one.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("trainModel received an empty dataloader")

    model.train()
    total_loss = 0.0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)

        # Clear gradients before the backward pass so that anything already
        # stored in .grad (e.g. from an earlier, unrelated backward call)
        # cannot leak into this update.
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / num_batches

def evaluateModel(model: nn.Module, dataloader: torch.utils.data.DataLoader, criterion: nn.Module, device: torch.device) -> float:
    """Evaluate ``model`` on ``dataloader`` and return the average loss.

    The model is switched to evaluation mode and all forward passes run with
    gradient tracking disabled, so no autograd graph is built.

    Args:
        model: network to evaluate.
        dataloader: iterable of ``(x, y)`` batches that also supports ``len()``.
        criterion: callable mapping ``(output, y)`` to a scalar loss tensor.
        device: device every batch is moved to before the forward pass.

    Returns:
        The mean of the per-batch losses. Each batch counts equally, so a
        smaller final batch carries the same weight as a full one.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("evaluateModel received an empty dataloader")

    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            total_loss += criterion(model(x), y).item()
    return total_loss / num_batches



In [None]:
# TODO add ability to save model

# Run num_epochs further epochs. cur_epoch and the two loss histories are
# defined outside this cell, so running it again continues the existing
# epoch count and appends to the same histories.
num_epochs = 100
for _epoch_idx in range(num_epochs):
    # one optimisation pass over the training data, then a validation pass
    train_loss = trainModel(model, train_loader, criterion, optimizer, device)
    validation_loss = evaluateModel(model, validation_loader, criterion, device)

    # record this epoch's results
    train_losses.append(train_loss)
    validation_losses.append(validation_loss)
    cur_epoch += 1

    print(f"Epoch {cur_epoch}| Train Loss: {train_loss:.4f}| Validation Loss: {validation_loss:.4f}")

In [None]:
# plot training and validation loss
# One curve per split, indexed by epoch (position in the loss history lists).
plt.figure(figsize=(6, 4))
plt.title("Training and Validation Loss")
plt.plot(train_losses, label="Train")
plt.plot(validation_losses, label="Validation")  # legend label was misspelled "Validationn"
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.legend()
plt.show()


def print_metrics(model, dataloader, device):
    """Print regression metrics for ``model`` over every batch in ``dataloader``.

    The model is put in evaluation mode and predictions are collected with
    gradient tracking disabled. Four values are printed, one per line and
    without labels, in this order: mean squared error, root mean squared
    error, mean absolute error, R^2 score.
    """
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            batch_predictions = model(features)
            # move results back to the CPU and store them row by row
            predictions.extend(batch_predictions.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # sklearn metrics take (y_true, y_pred)
    for metric in (mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score):
        print(metric(targets, predictions))


# Report the metrics for the test split first, then the validation split.
for _heading, _loader in (
    ("\nMetrics - Testing\n", test_loader),
    ("\nMetrics - Validation\n", validation_loader),
):
    print(_heading)
    print_metrics(model, _loader, device)



In [None]:
# test the model - when training complete
# Loss on the held-out test split, followed by the losses recorded for the
# last training epoch for comparison.
test_loss = evaluateModel(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")

last_train_loss = train_losses[-1]
print(f"Final Train Loss: {last_train_loss:.4f}")

last_validation_loss = validation_losses[-1]
print(f"Final Validation Loss: {last_validation_loss:.4f}")