<a href="https://colab.research.google.com/github/Elliot-CRT/103-Final-Game/blob/main/NBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Get Data

In [65]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import sys
import os
import copy
import numpy as np
import pandas as pd

import joblib

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

class TeamAuxMLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.Sequential(
            nn.Linear(embedding_dim * num_players_per_team, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)
        return self.encoder(x)

class SiameseAuxModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32, hidden_dim=64):
        super().__init__()
        self.team_net = TeamAuxMLP(vocab_size, embedding_dim, hidden_dim)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, x):
        mid = x.size(1) // 2
        team1 = x[:, :mid]
        team2 = x[:, mid:]
        t1 = self.team_net(team1)
        t2 = self.team_net(team2)
        combined = torch.cat([t1, t2], dim=1)
        return torch.sigmoid(self.classifier(combined))


# for reproducibility
seed = 23
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

colab = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [66]:
if colab:
    from google.colab import drive
    drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [67]:
load_saved_data = True # False
colab_folder_path = "/content/gdrive/MyDrive/NBA/"

# data_file_name_X = "NBA_X.pt"
# data_file_name_y = "NBA_y.pt"
# num_players_per_team = 6
# encoder_name = "nba_encoder.joblib"
# model_name = "nba_model"

data_file_name_X = "LOL_X.pt"
data_file_name_y = "LOL_y.pt"
num_players_per_team = 5
encoder_name = "lol_encoder.joblib"
model_name = "lol_model"


# data_file_name_X = "amateurX.pt"
# data_file_name_y = "amateury.pt"
# num_players_per_team = 5
# encoder_name = "amateur_encoder.joblib"
# model_name = "amateur_model"

# data_file_name_X = "hero_games_X.pt"
# data_file_name_y = "hero_games_y.pt"
# num_players_per_team = 3
# encoder_name = "hero_games_encoder.joblib"
# model_name = "hero_games_model"

# data_file_name_X = "hero_games_X_10.pt"
# data_file_name_y = "hero_games_y_10.pt"
# num_players_per_team = 10
# encoder_name = "hero_games_encoder_10.joblib"
# model_name = "hero_games_model_10"

In [68]:
if(load_saved_data):
    ##TODO needs rework for lol
    X = torch.load(colab_folder_path + data_file_name_X)
    true_y = torch.load(colab_folder_path + data_file_name_y)
    X = torch.tensor(X).type(torch.int32)
    unique_values_in_X = np.unique(X)

    print("IMPORTANT, certain values missing after downsampling in data preprocessing")
    print(len(unique_values_in_X))

    # prompt: which numbers are missing in unique_values_in_X
    all_possible_numbers = np.arange(np.min(unique_values_in_X), np.max(unique_values_in_X) + 1)
    missing_numbers = np.setdiff1d(all_possible_numbers, unique_values_in_X)
    print("Missing numbers:", missing_numbers)

    # prompt: rearrange values in X so there is no gap
    mapping = {old_val: new_val for new_val, old_val in enumerate(np.unique(X))}
    X_rearranged = np.vectorize(mapping.get)(X)

    print("Original X:", X[:10])
    print("Rearranged X:", X_rearranged[:10])

    X = torch.tensor(X_rearranged).type(torch.int32)
    # y = torch.from_numpy(true_y)
    y = true_y

    # prompt: load the encoder

    # Load the encoder
    encoder = joblib.load(colab_folder_path + encoder_name)


IMPORTANT, certain values missing after downsampling in data preprocessing
168
Missing numbers: []
Original X: tensor([[ 38, 140,   2,  31,  87,  46,  42, 127,   8,  57],
        [115, 151, 149,  53,  74, 136, 112,  67,  31,  83],
        [109,  69,  68, 154, 101,   0, 147,   2,  73,  80],
        [ 63, 147, 144,   8, 134,  54, 148,   7, 142,  87],
        [ 50, 115,  94,  19,  75,  82,  40,  12,  52,  83],
        [117,  64,  23, 142, 167,  36,  49,  94,  52,  57],
        [ 20,  40, 130,   9,  57,  34,  47,  94,  19,  16],
        [  0, 130,  34, 143,  87,  63,  49,   2, 154, 101],
        [106,  93,  12, 135, 101, 151, 140, 110,  55,   4],
        [ 33,  78,  51, 142, 101,  76,  69,  39, 154,  70]], dtype=torch.int32)
Rearranged X: [[ 38 140   2  31  87  46  42 127   8  57]
 [115 151 149  53  74 136 112  67  31  83]
 [109  69  68 154 101   0 147   2  73  80]
 [ 63 147 144   8 134  54 148   7 142  87]
 [ 50 115  94  19  75  82  40  12  52  83]
 [117  64  23 142 167  36  49  94  52  5

  X = torch.tensor(X).type(torch.int32)


FileNotFoundError: [Errno 2] No such file or directory: '/content/gdrive/MyDrive/NBA/lol_encoder.joblib'

In [69]:
# prompt: generate statistics about X, how many rows, dimension,

print("Shape of X:", X.shape)
print("Number of rows in X:", X.shape[0])
print("Dimension of X:", X.shape[1])
print("Number of unique values in y:", len(np.unique(y)))

Shape of X: torch.Size([76318, 10])
Number of rows in X: 76318
Dimension of X: 10
Number of unique values in y: 2


In [70]:
def prep_player_names(df):
  names = df.values.flatten()
  names = np.unique(names)
  names.sort()
  return names

##Data Checking

In [71]:
def percent_wins(y_list):
  count_ones = sum(1 for outcome in y_list if outcome == 1)

  # Calculate the percentage
  percentage = (count_ones / len(y_list)) * 100

  return percentage
  # print(f"Percentage of tensors with value 1: {percentage:.2f}%")

percent_wins(y)

50.0

#Preprocessing  NBA

IMPORTANT:: If load_saved_data is true, don't need to run this section

Old: Using team information to build our X

In [72]:
# Example 2D lists
list1 = [
    ['a', 'b', 'c'],
    ['d', 'e', 'f']
]

list2 = [
    ['g', 'h', 'i'],
    ['j', 'k', 'l']
]

# Ensure both lists have the same number of rows
if len(list1) != len(list2):
    raise ValueError("Both 2D lists must have the same number of rows")

# Concatenate the 2D lists column-wise
concatenated_list = [row1 + row2 for row1, row2 in zip(list1, list2)]

# Print the concatenated list
for row in concatenated_list:
    print(row)


['a', 'b', 'c', 'g', 'h', 'i']
['d', 'e', 'f', 'j', 'k', 'l']


In [None]:
# if(load_saved_data):
#     X = torch.load(colab_folder_path + data_file_name_X)
#     true_y = torch.load(colab_folder_path + data_file_name_y)

# else:
#     colab_folder_path = "/content/gdrive/MyDrive/2023 research/NBA/"
#     games_df = pd.read_csv(colab_folder_path + "NBA_Games_History.csv") #, dtype = str, delimiter = ",", skip_header=1)
#     teams_df = pd.read_csv(colab_folder_path + "NBA_Teams_Training0515.csv") #, dtype = str, delimiter = ",", skip_header=1)
#     #drop irrelevant columns or columns with data leakage issue
#     teams_df = teams_df.drop(columns=['GP', 'MPG'])
#     print(games_df.loc[0]["SEASON"].dtype)
#     teams_df["Season"] = teams_df["Season"].astype(int)
#     list_X = []
#     for index, row in games_df.iterrows():
#         ti1 = teams_df[ (teams_df["Season"]==row['SEASON'] ) &  (teams_df["Team"]==row['HOME_TEAM'] ) ]
#         ti2 = teams_df[ (teams_df["Season"]==row['SEASON'] ) &  (teams_df["Team"]==row['AWAY_TEAM'] ) ]
#         #concatenate the two into x.
#         #drop everything except team
#         ti1 = ti1.drop(columns=['Season', 'Team'])
#         ti1 = torch.tensor(ti1.values)
#         # do the same thing with ti2
#         ti2 = ti2.drop(columns=['Season', 'Team'])
#         ti2 = torch.tensor(ti2.values)
#         # concatenate ti1 and ti2 as a row
#         list_X.append(torch.cat((ti1, ti2), dim = 1))
#     ##return all the rows of concatenated ti1 ti2
#     X = torch.cat(list_X, dim=0)
#     print(X[0])

#     true_y = games_df.drop(columns=['GAME_DATE','HOME_TEAM','AWAY_TEAM','SEASON','PTS_HOME','PTS_AWAY'])
#     # drops all columns but 'HOME_TEAM_WINS'

#     true_y = torch.tensor(true_y.values)

#     # save the X so we don't have to recompute it every time
#     torch.save(X, colab_folder_path + data_file_name_X)
#     # save the y so we don't have to recompute it every time
#     torch.save(true_y, colab_folder_path + data_file_name_y)


New: supposed to work with lineup

In [None]:
# prompt: create a dictionary that maps the all caps abbreviated NBA city names to their respective capitilized city names. For example "GSW" should map to "Golden State"

nba_teams_short2long_dict = {
    "ATL": "Atlanta",
    "BOS": "Boston",
    "BRK": "Brooklyn",
    "CHA": "Charlotte",
    "CHO": "Charlotte",
    "CHI": "Chicago",
    "CLE": "Cleveland",
    "DAL": "Dallas",
    "DEN": "Denver",
    "DET": "Detroit",
    "GSW": "Golden State",
    "HOU": "Houston",
    "IND": "Indiana",
    "LAC": "L.A. Clippers",
    "LAL": "L.A. Lakers",
    "MEM": "Memphis",
    "MIA": "Miami",
    "MIL": "Milwaukee",
    "MIN": "Minnesota",
    "NOP": "New Orleans",
    "NOH": "New Orleans",
    "NOK": "New Orleans",
    "NYK": "New York",
    "OKC": "Oklahoma City",
    "ORL": "Orlando",
    "PHI": "Philadelphia",
    "PHO": "Phoenix",
    "POR": "Portland",
    "SAC": "Sacramento",
    "SAS": "San Antonio",
    "TOR": "Toronto",
    "UTA": "Utah",
    "WAS": "Washington",
    "NJN": "Brooklyn",
    "SEA": "Oklahoma City"
}


In [None]:
def prep_city_names(df):

  box_df.iloc[:,1]

In [None]:
# prompt: write a function that iterates over the second column of each row in a dataframe and converts it from a key in a dictionary to its corresponding value in that dictionary

def convert_dict_values(df, dict_to_use):
  """
  This function iterates over the second column of each row in a dataframe and converts it from a key in a dictionary to its corresponding value in that dictionary.

  Args:
      df: The dataframe to process.
      dict_to_use: The dictionary to use for conversion.

  Returns:
      A new dataframe with the converted values.
  """

  new_df = df.copy()
  for index, row in df.iterrows():
    new_df.iloc[index, 1] = dict_to_use[row[1]]
  return new_df

# Example usage
#new_df = convert_dict_values(box_df, nba_teams_short2long_dict)


In [None]:
def prep_player_names(box_df):
  names = box_df.iloc[:, 2:].values.flatten()
  names = np.unique(names)
  names.sort()
  return names

In [None]:
def one_hot_encoding(name, names):
  one_hot_encoding = np.zeros(len(names))
  index = np.where(names == name)[0]
  one_hot_encoding[index] = 1
  return one_hot_encoding


def multi_one_hot_encoding(name, names):
  one_hot_encoding = np.zeros(len(names))
  indices = np.where(np.isin(names, name))[0]
  one_hot_encoding[indices] = 1
  return one_hot_encoding


In [None]:
box_df = pd.read_csv(colab_folder_path + "/lineup data/combined_data.csv", header=None)
games_df = pd.read_csv(colab_folder_path + "/NBA_Games_History.csv") #, dtype = str, delimiter = ",", skip_header=1)
pattern = r'(?P<month>10)/(?P<day>\d{1,2})/(?P<year>2005)'
games_df = games_df[~games_df['GAME_DATE'].str.contains(pattern)]
box_df.head()

  games_df = games_df[~games_df['GAME_DATE'].str.contains(pattern)]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,11/1/2005,PHI,Allen Iverson,Chris Webber,Kyle Korver,Andre Iguodala,Steven Hunter,John Salmons,Michael Redd,Bobby Simmons,Jamaal Magloire,Andrew Bogut,T.J. Ford,Mo Williams
1,11/1/2005,NOK,Chris Paul,P.J. Brown,J.R. Smith,David West,Boštjan Nachbar,Chris Andersen,Peja Stojaković,Shareef Abdur-Rahim,Bonzi Wells,Mike Bibby,Brad Miller,Kevin Martin
2,11/1/2005,SAS,Tim Duncan,Tony Parker,Manu Ginóbili,Bruce Bowen,Rasho Nesterović,Michael Finley,Andre Miller,Voshon Lenard,Carmelo Anthony,Kenyon Martin,Marcus Camby,Earl Boykins
3,11/1/2005,PHO,Raja Bell,Shawn Marion,James Jones,Steve Nash,Kurt Thomas,Jim Jackson,Josh Howard,Dirk Nowitzki,Jason Terry,Doug Christie,Erick Dampier,Keith Van Horn
4,11/2/2005,CLE,LeBron James,Larry Hughes,Drew Gooden,Zydrunas Ilgauskas,Eric Snow,Damon Jones,Chris Paul,P.J. Brown,David West,J.R. Smith,Boštjan Nachbar,Chris Andersen


In [None]:
# prompt: drop all games in games_df that were played in October. Use a regular expression

#pattern = r'(?P<month>10)/(?P<day>\d{1,2})/(?P<year>2005)'
pattern = r'(^\d{4}-10-\d{2}$)|(^10/\d{2}/\d{4}$)|(^\d{2}-10-\d{4}$)'
test_game_df = games_df[~games_df['GAME_DATE'].str.contains(pattern)]

test_box_df = box_df[~box_df[0].str.contains(pattern)]

print(len(test_game_df))
print(len(test_box_df))

14159
16452


  test_game_df = games_df[~games_df['GAME_DATE'].str.contains(pattern)]
  test_box_df = box_df[~box_df[0].str.contains(pattern)]


In [None]:
colab_folder_path = "/content/gdrive/MyDrive/NBA/"
games_df = pd.read_csv(colab_folder_path + "NBA_Games_History.csv") #, dtype = str, delimiter = ",", skip_header=1)

box_df = pd.read_csv(colab_folder_path + "lineup data/combined_data.csv", header=None)
player_names = prep_player_names(box_df)

In [None]:
len(player_names)

1229

Extract relevant information

In [None]:
from sklearn.preprocessing import LabelEncoder


if(load_saved_data):
    X = torch.load(colab_folder_path + data_file_name_X)
    true_y = torch.load(colab_folder_path + data_file_name_y)

else:
    colab_folder_path = "/content/gdrive/MyDrive/NBA/"
    games_df = pd.read_csv(colab_folder_path + "NBA_Games_History.csv") #, dtype = str, delimiter = ",", skip_header=1)

    box_df = pd.read_csv(colab_folder_path + "lineup data/combined_data.csv", header=None)
    player_names = prep_player_names(box_df)

    # prompt: convert by box_df so that the second column's values are converted to long names using the dictionary above

    #box_df.iloc[:,1] = box_df.iloc[:,1].map(nba_teams_short2long_dict)

    #drop irrelevant columns or columns with data leakage issue
    pattern = r'(?P<month>10)/(?P<day>\d{1,2})/(?P<year>2005)'
    games_df = games_df[~games_df['GAME_DATE'].str.contains(pattern)]

    games_df.head()


    # print(box_df.iloc[:, 1].unique())
    # for key, value in nba_teams_dict.items():
    #   print(f"Key: {key}, Value: {value}")
    # print(games_df['HOME_TEAM'].unique())


    # converts box_df home_team abbreviations to games_df home_team city name
    unique_box_teams = set()
    for index, row in games_df.iterrows():
      # Check if the value exists as a key in nba_teams_dict
      team_value = nba_teams_short2long_dict.get(box_df.iloc[index, 1])
      if team_value is not None:
          unique_box_teams.add(team_value)

    print(len(unique_box_teams))
    box_df = convert_dict_values(box_df, nba_teams_short2long_dict)

    list_X = []

    # create separate lists for home and away team for Siamese Network training
    list_hometeam = []
    list_awayteam = []

    true_y = games_df.drop(columns=['GAME_DATE','HOME_TEAM','AWAY_TEAM','SEASON','PTS_HOME','PTS_AWAY'])
    # drops all columns but 'HOME_TEAM_WINS'
    true_y = torch.tensor(true_y.values)
    list_true_y = []

    for index, row in games_df.iterrows():
        # print(sum(box_df.iloc[:, 0] == row['GAME_DATE'] )) #"06/19/2016")) #row['GAME_DATE'] ))
        # print(sum(box_df.iloc[:, 1] == row['HOME_TEAM'] )) #nba_teams_dict.get(row['HOME_TEAM']) ))
        box_row = box_df[ (box_df.iloc[:, 0] ==row['GAME_DATE'] ) &
                          (box_df.iloc[:, 1] ==row['HOME_TEAM'] ) ]
        ##bandit fix for now. some game does not have a corresponding box
        ##TODO:: investigate.
        if box_row.shape[0] == 0:
          #TODO debug here
          continue
        #get values from column 2 to column 7 in box_row
        # print("the box_row")
        # print(box_row)
        # print("the box_row end")
        #+ home team, - away team.
        # ti1_one_hot = multi_one_hot_encoding(box_row.iloc[:, 2:8].values, player_names)
        # ti2_one_hot = multi_one_hot_encoding(box_row.iloc[:, 8:].values, player_names)
        ti1_string = box_row.iloc[:, 2:8].values
        ti2_string = box_row.iloc[:, 8:].values


        # Ensure both lists have the same number of rows
        if len(ti1_string) != len(ti2_string):
            raise ValueError("Both 2D lists must have the same number of rows")

        # Concatenate the 2D lists column-wise
        concatenated_array = np.concatenate((ti1_string, ti2_string), axis=1)

        list_X.append(concatenated_array.tolist()[0])
        list_true_y.append(true_y[index])

        # print(box_row.iloc[:, 2:8].values)
        # print(ti1_one_hot)
        # print(box_row.iloc[:, 8:].values)
        # print(ti2_one_hot)
        # ti1 = torch.tensor(ti1_one_hot)
        # ti2 = torch.tensor(ti2_one_hot)

        # list_hometeam.append(ti1)
        # list_awayteam.append(ti2)

        # print(ti1)
        # print(ti2)
        # ti1 = teams_df[ (teams_df["Season"]==row['SEASON'] ) &  (teams_df["Team"]==row['HOME_TEAM'] ) ]
        ## ti2 = teams_df[ (teams_df["Season"]==row['SEASON'] ) &  (teams_df["Team"]==row['AWAY_TEAM'] ) ]
        # bs = box_df[ (box_df.iloc[:, 1]==row['SEASON'] )]
        # #concatenate the two into x.
        # #drop everything except team
        # ti1 = ti1.drop(columns=['Season', 'Team'])
        # ti1 = torch.tensor(ti1.values)
        # # do the same thing with ti2
        # ti2 = ti2.drop(columns=['Season', 'Team'])
        # ti2 = torch.tensor(ti2.values)
        # concatenate ti1 and ti2 as a row
        # list_X.append(torch.cat((ti1, ti2), dim = 1))


        # list_X.append(ti1-ti2)




In [None]:
X = list_X
y = list_true_y

NameError: name 'list_X' is not defined

#Preprocessing casual LOL

https://www.kaggle.com/code/adityadesai13/visualisation-analysis-champions-overall-game


the list of champion IDs
https://www.kaggle.com/code/adityadesai13/visualisation-analysis-champions-overall-game/input

In [85]:
colab_path = "/content/gdrive/MyDrive/NBA/Public Games"
filepath = f"{colab_path}/matches.csv"
matches = pd.read_csv(filepath)
filepath = f"{colab_path}/champs.csv"
champs = pd.read_csv(filepath)
filepath = f"{colab_path}/participants.csv"
participants = pd.read_csv(filepath)
filepath = f"{colab_path}/stats1.csv"
stats1 = pd.read_csv(filepath)
filepath = f"{colab_path}/stats2.csv"
stats2 = pd.read_csv(filepath)
stats = pd.concat([stats1, stats2], ignore_index=True)
filepath = f"{colab_path}/teamstats.csv"
teamstats = pd.read_csv(filepath)

  stats2 = pd.read_csv(filepath)


In [86]:
matches.head()

Unnamed: 0,id,gameid,platformid,queueid,seasonid,duration,creation,version
0,10,3187427022,EUW1,420,8,1909,1495068946860,7.10.187.9675
1,11,3187425281,EUW1,420,8,1693,1495066760778,7.10.187.9675
2,12,3187269801,EUW1,420,8,1482,1495053375889,7.10.187.9675
3,13,3187252065,EUW1,420,8,1954,1495050993613,7.10.187.9675
4,14,3187201038,EUW1,420,8,2067,1495047893400,7.10.187.9675


In [87]:
df = pd.merge(participants, stats, how = 'left', on = ['id'], suffixes=('', '_y'))
df = pd.merge(df, champs, how = 'left', left_on = 'championid', right_on = 'id', suffixes=('', '_y'))
df = pd.merge(df, matches, how = 'left', left_on = 'matchid', right_on = 'id', suffixes=('', '_y'))
df.head()

Unnamed: 0,id,matchid,player,championid,ss1,ss2,role,position,win,item1,...,name,id_y,id_y.1,gameid,platformid,queueid,seasonid,duration,creation,version
0,9,10,1,19,4,11,NONE,JUNGLE,0.0,3748.0,...,Warwick,19,10,3187427022,EUW1,420,8,1909,1495068946860,7.10.187.9675
1,10,10,2,267,3,4,DUO_SUPPORT,BOT,0.0,2301.0,...,Nami,267,10,3187427022,EUW1,420,8,1909,1495068946860,7.10.187.9675
2,11,10,3,119,7,4,DUO_CARRY,BOT,0.0,1055.0,...,Draven,119,10,3187427022,EUW1,420,8,1909,1495068946860,7.10.187.9675
3,12,10,4,114,12,4,SOLO,TOP,0.0,1029.0,...,Fiora,114,10,3187427022,EUW1,420,8,1909,1495068946860,7.10.187.9675
4,13,10,5,112,4,3,SOLO,MID,0.0,3020.0,...,Viktor,112,10,3187427022,EUW1,420,8,1909,1495068946860,7.10.187.9675


In [88]:
df.columns

Index(['id', 'matchid', 'player', 'championid', 'ss1', 'ss2', 'role',
       'position', 'win', 'item1', 'item2', 'item3', 'item4', 'item5', 'item6',
       'trinket', 'kills', 'deaths', 'assists', 'largestkillingspree',
       'largestmultikill', 'killingsprees', 'longesttimespentliving',
       'doublekills', 'triplekills', 'quadrakills', 'pentakills',
       'legendarykills', 'totdmgdealt', 'magicdmgdealt', 'physicaldmgdealt',
       'truedmgdealt', 'largestcrit', 'totdmgtochamp', 'magicdmgtochamp',
       'physdmgtochamp', 'truedmgtochamp', 'totheal', 'totunitshealed',
       'dmgselfmit', 'dmgtoobj', 'dmgtoturrets', 'visionscore', 'timecc',
       'totdmgtaken', 'magicdmgtaken', 'physdmgtaken', 'truedmgtaken',
       'goldearned', 'goldspent', 'turretkills', 'inhibkills',
       'totminionskilled', 'neutralminionskilled', 'ownjunglekills',
       'enemyjunglekills', 'totcctimedealt', 'champlvl', 'pinksbought',
       'wardsbought', 'wardsplaced', 'wardskilled', 'firstblood', 'name

In [89]:
df = df[["matchid", "championid", "win", "gameid", "queueid"]] ## TODO, add more info
df = df[df["queueid"] == 420]
df = df.drop(columns=["queueid"])

In [90]:
df.head(22)

Unnamed: 0,matchid,championid,win,gameid
0,10,19,0.0,3187427022
1,10,267,0.0,3187427022
2,10,119,0.0,3187427022
3,10,114,0.0,3187427022
4,10,112,0.0,3187427022
5,10,72,1.0,3187427022
6,10,3,1.0,3187427022
7,10,103,1.0,3187427022
8,10,222,1.0,3187427022
9,10,161,1.0,3187427022


In [91]:
from sklearn.preprocessing import LabelEncoder

load_saved_data = False
if(not load_saved_data):
  X = []
  y = []
  list_X = []
  prev_row = df.iloc[0]
  prev_match_id = prev_row["matchid"]
  for i, row in df.iterrows():
    match_id = row["matchid"]
    if match_id != prev_match_id:
      if len(list_X) == 10:
        y.append(prev_row["win"])
        X.append(list_X)
      prev_match_id = match_id
      prev_row = row
      list_X = []
    list_X.append(row["championid"])

  flattened_data = [item for sublist in X for item in sublist]
  # encoder = LabelEncoder()
  # integer_encoded = encoder.fit_transform(flattened_data)

  num_rows = len(X)
  num_columns = len(X[0])
  flattened_data = np.array(flattened_data).reshape(num_rows, num_columns)

  X = flattened_data
  y = np.array(y)

  colab_folder_path = "/content/gdrive/MyDrive/NBA"
  data_file_name_X = "amateurX.pt"
  data_file_name_y = "amateury.pt"
  # save the X so we don't have to recompute it every time
  torch.save(X, colab_folder_path + data_file_name_X)
  # save the y so we don't have to recompute it every time
  torch.save(y, colab_folder_path + data_file_name_y)

In [92]:
data_file_name_X

'amateurX.pt'

In [93]:
print(len(X))
print(len(y))
X[0]

161064
161064


array([ 19., 267., 119., 114., 112.,  72.,   3., 103., 222., 161.])

In [135]:
X_stats = stats[['totdmgdealt']].values.astype('float32')
y_stats = stats['win'].values.astype('float32')

aux_info = torch.tensor(np.concatenate((X_stats, y_stats.reshape(-1, 1)), axis=1))

print(aux_info.shape)

torch.Size([1834517, 2])


#Preprocessing Artificial Dataset

In [84]:
# Parameters
num_samples = 1000  # Number of games in the dataset

# Initialize lists for X and y
X = []
y = []

# Generate the dataset
for _ in range(num_samples):
    # Generate random numbers for home and away teams
    home_team = np.random.randint(1, 100, size=5)
    away_team = np.random.randint(1, 100, size=5)

    # Calculate variances
    home_variance = np.var(home_team)
    away_variance = np.var(away_team)

    # Determine the winner
    if home_variance > away_variance:
        label = 0  # Away team wins
    else:
        label = 1  # Home team wins

    # Append the data
    X.append(np.concatenate([home_team, away_team]))
    y.append(label)

# Convert lists to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Print the shape of the tensors
print(f"X shape: {X_tensor.shape}")  # Should be [num_samples, 10]
print(f"y shape: {y_tensor.shape}")  # Should be [num_samples]


X shape: torch.Size([1000, 10])
y shape: torch.Size([1000])


  X_tensor = torch.tensor(X, dtype=torch.float32)


#Preprocessing Pro LOL

In [None]:
from sklearn.preprocessing import LabelEncoder

if(not load_saved_data):
    df_updated = torch.load("/content/gdrive/MyDrive/2023 research/LoL/merged_df")

    selected_rows = df_updated.loc[[i for n in range(len(df_updated) // 12 + 1) for i in [n * 12 + 10, n * 12 + 11] if i < len(df_updated)]]

    y = selected_rows['result']
    X = selected_rows[['pick1', 'pick2', 'pick3', 'pick4', 'pick5']]
    t1 = []
    t2 = []
    listX = []
    list_true_y = []
    for i, (index, row) in enumerate(X.iterrows()):
      if index % 2 == 0:
        t1_team = X.iloc[i].tolist()
      else:
        t2_team = X.iloc[i].tolist()

        concat_arr = np.concatenate((t1_team, t2_team))
        listX.append(concat_arr.tolist())
        list_true_y.append(y[index])

    flattened_data = [item for sublist in listX for item in sublist]
    # encoder = LabelEncoder()
    # integer_encoded = encoder.fit_transform(flattened_data)

    num_rows = len(listX)
    num_columns = len(listX[0])
    flattened_data = np.array(flattened_data).reshape(num_rows, num_columns)

    X = flattened_data
    true_y = np.array(list_true_y)

    # save the X so we don't have to recompute it every time
    torch.save(X, colab_folder_path + data_file_name_X)
    # save the y so we don't have to recompute it every time
    torch.save(true_y, colab_folder_path + data_file_name_y)


In [None]:
joblib.dump(encoder, colab_folder_path + encoder_name)

In [None]:
lolX = X
loly = y

In [None]:
print(lolX.shape)
print(loly.shape)

In [None]:
lolX[0]

In [None]:
X[0][:5]
X[0][5:]

In [None]:
loly[1]

#Preprocessing Hero Games

In [None]:
# prompt: read in the CSV file hero_games_fixed_heroes.csv

import pandas as pd

# Assuming 'hero_games_fixed_heroes.csv' is in the current working directory
# If not, provide the correct file path
# file_path = '/content/gdrive/MyDrive/2023 research/NBA/hero_game_dataset.csv'
# file_path = '/content/gdrive/MyDrive/2023 research/NBA/game_dataset_of10.csv'
file_path = '/content/gdrive/MyDrive/NBA/game_dataset_of10.csv'

try:
  df = pd.read_csv(file_path)
  print(df.head())  # Display the first few rows of the DataFrame
except FileNotFoundError:
  print(f"Error: File '{file_path}' not found.")
except pd.errors.ParserError:
    print(f"Error: Could not parse the CSV file '{file_path}'. Check its format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

                                 team1                                 team2  \
0   [16, 0, 7, 1, 19, 6, 2, 13, 15, 8]   [19, 5, 6, 17, 15, 8, 10, 4, 18, 9]   
1  [13, 1, 17, 3, 9, 5, 12, 11, 15, 6]   [3, 5, 9, 12, 16, 19, 6, 14, 13, 7]   
2  [3, 18, 12, 16, 10, 7, 0, 2, 8, 11]    [3, 1, 13, 18, 11, 6, 19, 7, 5, 9]   
3  [5, 0, 6, 3, 18, 11, 10, 15, 9, 13]    [15, 2, 1, 7, 4, 11, 19, 18, 6, 9]   
4  [16, 17, 1, 2, 13, 7, 0, 3, 19, 12]  [13, 18, 5, 8, 7, 4, 10, 11, 19, 14]   

   outcome  
0        1  
1        2  
2        2  
3        1  
4        2  


In [None]:
df["team1"]

Unnamed: 0,team1
0,"[16, 0, 7, 1, 19, 6, 2, 13, 15, 8]"
1,"[13, 1, 17, 3, 9, 5, 12, 11, 15, 6]"
2,"[3, 18, 12, 16, 10, 7, 0, 2, 8, 11]"
3,"[5, 0, 6, 3, 18, 11, 10, 15, 9, 13]"
4,"[16, 17, 1, 2, 13, 7, 0, 3, 19, 12]"
...,...
995,"[8, 7, 5, 4, 14, 6, 3, 19, 0, 10]"
996,"[16, 8, 13, 11, 15, 9, 6, 12, 1, 19]"
997,"[18, 16, 14, 0, 15, 9, 13, 17, 8, 4]"
998,"[16, 7, 0, 8, 19, 3, 9, 2, 13, 17]"


In [None]:
import pandas as pd
import numpy as np

def combine_teams_no_ast(row):
    team1_str = row['team1'].replace('[', '').replace(']', '').split(',')
    team2_str = row['team2'].replace('[', '').replace(']', '').split(',')

    team1 = [int(x.strip()) for x in team1_str if x.strip()]
    team2 = [int(x.strip()) for x in team2_str if x.strip()]

    return np.concatenate((team1, team2))

X = df.apply(combine_teams_no_ast, axis=1).to_numpy()

In [None]:
num_players_per_team = 10

In [None]:
X

array([array([16,  0,  7,  1, 19,  6,  2, 13, 15,  8, 19,  5,  6, 17, 15,  8, 10,
               4, 18,  9])                                                       ,
       array([13,  1, 17,  3,  9,  5, 12, 11, 15,  6,  3,  5,  9, 12, 16, 19,  6,
              14, 13,  7])                                                       ,
       array([ 3, 18, 12, 16, 10,  7,  0,  2,  8, 11,  3,  1, 13, 18, 11,  6, 19,
               7,  5,  9])                                                       ,
       array([ 5,  0,  6,  3, 18, 11, 10, 15,  9, 13, 15,  2,  1,  7,  4, 11, 19,
              18,  6,  9])                                                       ,
       array([16, 17,  1,  2, 13,  7,  0,  3, 19, 12, 13, 18,  5,  8,  7,  4, 10,
              11, 19, 14])                                                       ,
       array([ 8,  6, 14, 11,  7, 15,  3, 19,  1, 18, 13,  2, 16,  8, 18, 11,  4,
              14,  3, 15])                                                       ,
       arr

In [None]:
X[0]

array([16,  0,  7,  1, 19,  6,  2, 13, 15,  8, 19,  5,  6, 17, 15,  8, 10,
        4, 18,  9])

In [None]:
X = np.array([np.array(xi) for xi in X])
# prompt: convert X to a new shape of (1000,6)

X = X.reshape(1000, 20)

In [None]:
X.shape

(1000, 20)

In [None]:
# prompt: load the df["outcome"] into y

y = df["outcome"].values

In [None]:
y

array([1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2,
       1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2,
       1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1,
       1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2,
       2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2,
       2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1,
       1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2,
       1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1,
       2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2,
       1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2,
       1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1,
       1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1,

In [None]:
y = y -1

In [None]:
y

array([0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,

In [None]:
y.shape

(1000,)

#Preprocessing (SHARED)

##Downsampling

In [94]:
# prompt: get the count of different y values

import collections
y = torch.tensor(y)
y_values = y.flatten().tolist()
counter = collections.Counter(y_values)

print(counter)
print(counter[1]+ counter[0])

Counter({1.0: 82890, 0.0: 78174})
161064


In [95]:
X

array([[ 19., 267., 119., ..., 103., 222., 161.],
       [115.,  69., 157., ..., 134.,  43.,  19.],
       [119.,  40.,  99., ...,  51., 267.,  29.],
       ...,
       [ 35.,  67., 163., ...,  81.,  37.,  28.],
       [103.,  40., 126., ...,   4., 202., 412.],
       [114., 202., 432., ...,  57.,  32.,  37.]])

In [96]:
# prompt: downsample y so the labels are the same amount

from imblearn.under_sampling import RandomUnderSampler
##TODO:: is this a valid thing to do?
rus = RandomUnderSampler(sampling_strategy='majority')

X_train_resampled, y_train_resampled = rus.fit_resample(X, y_values)

y_values = y_train_resampled
counter = collections.Counter(y_values)
X = X_train_resampled
print(counter)


Counter({0.0: 78174, 1.0: 78174})


In [None]:
# from collections import Counter
# from imblearn.over_sampling import RandomOverSampler
# over_sampler = RandomOverSampler(random_state=42)
# X_res, y_res = over_sampler.fit_resample(X, true_y)
# print(f"Training target statistics: {Counter(y_res)}")
# print(f"Testing target statistics: {Counter(true_y)}")

## Label Encoding X

In [97]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Flatten the X to fit LabelEncoder
flattened_data = [item for sublist in X for item in sublist]
# Convert string categories to integer indices
encoder = LabelEncoder()
integer_encoded = encoder.fit_transform(flattened_data)

# Reshape back to the original structure
num_rows = len(X)
num_columns = len(X[0])
integer_encoded = np.array(integer_encoded).reshape(num_rows, num_columns)

X = integer_encoded
true_y = np.array(y_values)

In [98]:
X[0]

array([ 18, 126,  98,  95,  93,  63,   2,  86, 118, 112])

In [99]:
X[1]

array([ 96,  62, 111,   3,  27,  47,  57, 106,  42,  18])

In [100]:
encoder.inverse_transform(X[0])

array([ 19., 267., 119., 114., 112.,  72.,   3., 103., 222., 161.])

In [101]:
encoder.inverse_transform(X[1])

array([115.,  69., 157.,   4.,  28.,  51.,  62., 134.,  43.,  19.])

In [102]:
# prompt: get unique values of X

unique_values_in_X = np.unique(X)
print(unique_values_in_X)


[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135]


In [103]:
print(len(unique_values_in_X))

136


In [104]:
# prompt: convert X_res to tensor

X_res = torch.tensor(X)
y_res = torch.tensor(true_y, dtype=torch.float32)

In [105]:
X = X_res
y = y_res

In [106]:
X.shape

torch.Size([156348, 10])

In [107]:
y.shape

torch.Size([156348])

If oversampling, we have 18072 samples in total
If undersampling, we have 10360 samples in total

Here for better reliability in data, we chose undersampling.

In [108]:
# prompt: A function that checks whether two rows of X share the same values, but maybe in different order

def rows_have_same_elements_unordered(row1, row2):
  """Checks if two rows have the same two teams, regardless of order.

  Args:
    row1: The first row (list or array).
    row2: The second row (list or array).

  Returns:
    True if the rows have the same two teams, False otherwise.
  """
  return  sorted(row1[0:6]) == sorted(row2[0:6]) and sorted(row1[6:]) == sorted(row2[6:])


In [None]:
# prompt: check on rows in X with identical values

# import numpy as np

# # Find rows with x where rows_have_same_elements_unordered is true
# matching_rows = []
# for i in range(len(X)):
#   for j in range(i+1, len(X)):
#     if rows_have_same_elements_unordered(X[i], X[j]):
#       matching_rows.append((i, j))

# # Print the matching rows
# for row in matching_rows:
#   print(row)


In [None]:
# matching_rows

## Inspection

In [109]:
X[3]

tensor([  6, 118,  18,  68, 121,  17,  69,  10, 134, 103])

In [110]:
X[167]

tensor([112,   3, 105, 101,  47,  86,  60, 128, 100,  80])

In [None]:
##TODO
## take out those rows, sum them up in y

In [111]:

# save the X so we don't have to recompute it every time
torch.save(X, colab_folder_path + data_file_name_X)
# save the y so we don't have to recompute it every time
torch.save(y, colab_folder_path + data_file_name_y)

In [112]:
# prompt: save the encoder

import joblib

# Assuming 'encoder' is your LabelEncoder object
joblib.dump(encoder, colab_folder_path + encoder_name)

['/content/gdrive/MyDrive/NBAlol_encoder.joblib']

#Model

In [113]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=True)

In [115]:
models = {
    'Decision Tree' : DecisionTreeClassifier(),
    'Neural Network' : MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(8, 8), random_state=1),
    'KNN' : KNeighborsClassifier(n_neighbors=3)
}

# results = {}

def evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    win_per = percent_wins(y_pred)
    print(y_pred)

    return recall, precision, f1, accuracy, win_per

# iterate through all models
for name, model in models.items():
    #DEBUG ONLY comment out this below to run the baselines
    break
    # Evaluate model, calculate metrics and store them
    recall, precision, f1, accuracy, win_per = evaluate_model(model, X_train, y_train, X_test, y_test)

    print(name)
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F1-Score: {f1}")
    print(f"Accuracy: {accuracy}")
    print(f"Percentage of tensors with value 1: {win_per:.2f}%")
    print()



## pass hidden size as arg? (see linked stackoverflow in issue w/ loss)

In [116]:
vocab_size = len(np.unique(X))
print(vocab_size)

136


### Baseline NN

In [117]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, embedding_dim):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # Assuming vocab size is 200000
        self.ln1 = nn.Linear(embedding_dim * num_players_per_team * 2, 16)  # Changed in_features to embedding_dim * 10 = 80
        self.ln2 = nn.Linear(16, 8)
        self.ln3 = nn.Linear(8, 1)


    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1) # Flatten the embedding output

        x = self.ln1(x)
        x = F.relu(x)
        x = self.ln2(x)
        x = F.relu(x)
        x = self.ln3(x)
        x = torch.sigmoid(x) # Applying sigmoid for binary classification
        return x



```
# This is formatted as code
```

### Siamese Network

In [118]:
# prompt: use the above model to create a siamese network

class SiameseNetwork(nn.Module):
  def __init__(self, embedding_dim):
    super(SiameseNetwork, self).__init__()
    self.embedding = nn.Embedding(num_embeddings= vocab_size, embedding_dim=embedding_dim)
    self.ln1 = nn.Linear(embedding_dim * num_players_per_team, 16)
    self.ln2 = nn.Linear(16, 16)
    self.ln3 = nn.Linear(16, 1)
    ##TODO:: dropout layers, maybe

  def forward(self, x):
    # split x into x1 and x2
    x1, x2 = x[:, :(len(X[0])//2)], x[:, (len(X[0])//2):]
    # Pass the input through the embedding layer
    x1 = self.embedding(x1)
    x2 = self.embedding(x2)
    # Reshape the output of the embedding layer to a 2D tensor
    x1 = x1.view(x1.size(0), -1)
    x2 = x2.view(x2.size(0), -1)
    x1 = self.ln1(x1)
    x1 = F.relu(x1)
    x2 = self.ln1(x2)
    x2 = F.relu(x2)
    x1 = self.ln2(x1)
    x1 = F.relu(x1)
    x2 = self.ln2(x2)
    x2 = F.relu(x2)
    x1 = self.ln3(x1)
    x1 = F.relu(x1)
    x2 = self.ln3(x2)
    x2 = F.relu(x2)
    # ##0 for left side win, 1 for right side win
    output1 = torch.sigmoid(x1)
    output2 = torch.sigmoid(x2)
    # IMPORTANT:: right now output win/lose ratio, can be any fraction between 0 and 1, not necessarily 1/0 win/lose.
    output1 = x1
    output2 = x2
    output = torch.sigmoid(output1 - output2)

    return output


### Transformer



In [119]:
# prompt: A transformer that treats each value in every row of X as a token, outputs a single value
# a standard transformer
class TransformerModel(nn.Module):
    def __init__(self, embedding_dim, input_dim = vocab_size,
                 num_heads = 2, num_layers = 1, hidden_dim = 64, output_dim = 1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        #encoder layer but no masking applied
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim,
                                                   nhead=num_heads,
                                                   dim_feedforward=hidden_dim,
                                                   batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        # x begins with shape (batch_size, seq_len)
        x = self.embedding(x)
        # x  with shape (batch_size, seq_len, embedding_dim)
        x = x.permute(1, 0, 2)  # Reshape for Transformer input (seq_len, batch_size, embedding_dim)
        x = self.transformer_encoder(x)
        x = x.mean(dim=0)  # Average the embeddings across the sequence
        x = self.fc(x)
        return torch.sigmoid(x)

    def get_embedding(self, x):
        return self.embedding(x)

# Example usage:
# input_dim = len(unique_values_in_X)  # Number of unique tokens in X
# embedding_dim = 16
# num_heads = 2
# num_layers = 2
# hidden_dim = 32
# output_dim = 1

# model = TransformerModel(input_dim, embedding_dim, num_heads, num_layers, hidden_dim, output_dim)

# Assuming X_train is a tensor of shape (batch_size, seq_len)
# output = model(X_train)


### Auxillary MLP (multilayer perceptron)

In [120]:
# prompt: A secondary neural network that take in a feature

class Aux_MLP(nn.Module):
    def __init__(self, input_dim=2, hidden_dim=64, embedding_dim=64, dropout=0.2):
        super(Aux_MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, embedding_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

### Siamese + Auxillary MLP

In [121]:
# prompt: a siamese auxillary MLP

class ModelB(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim):
    super(ModelB, self).__init__()
    self.fc1 = nn.Linear(input_dim, hidden_dim*4 )
    self.fc2 = nn.Linear(hidden_dim*4 , hidden_dim)
    self.fc3 = nn.Linear(hidden_dim, output_dim)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    ## note: this x input here will be auxillary information only, no team comp info
    # split x into x1 and x2
    x1, x2 = x[:, :(len(X[0])//2)], x[:, (len(X[0])//2):]

    x1 = self.fc1(x1)
    x1 = self.relu(x1)
    x1 = self.fc2(x1)
    x1 = self.relu(x1)
    x1 = self.fc3(x1)
    x1 = self.sigmoid(x1)

    x2 = self.fc1(x2)
    x2 = self.relu(x2)
    x2 = self.fc2(x2)
    x2 = self.relu(x2)
    x2 = self.fc3(x2)
    x2 = self.sigmoid(x2)

    output = torch.sigmoid(x1 - x2)
    return output


### Siamese + Transformer

In [122]:
# prompt: A transformer that treats each value in every row of X as a token, outputs a single value

class SiamTransformerModel(nn.Module):
    def __init__(self, embedding_dim, input_dim = vocab_size,
                 num_heads = 2, num_layers = 1, hidden_dim = 64, output_dim = 1):
        super(SiamTransformerModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        #encoder layer but no masking applied
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        # split x into x1 and x2
        x1, x2 = x[:, :(len(X[0])//2)], x[:, (len(X[0])//2):]

        x1 = self.embedding(x1)
        x1 = x1.permute(1, 0, 2)  # Reshape for Transformer input (seq_len, batch_size, embedding_dim)
        x1 = self.transformer_encoder(x1)
        x1 = x1.mean(dim=0)  # Average the embeddings across the sequence
        x1 = self.fc(x1)
        x1 = torch.sigmoid(x1)

        x2 = self.embedding(x2)
        x2 = x2.permute(1, 0, 2)  # Reshape for Transformer input (seq_len, batch_size, embedding_dim)
        x2 = self.transformer_encoder(x2)
        x2 = x2.mean(dim=0)  # Average the embeddings across the sequence
        x2 = self.fc(x2)
        x2 = torch.sigmoid(x2)

        output = torch.sigmoid(x1 - x2)
        return output

    def get_embedding(self, x):
        return self.embedding(x)

###Combine A+B

We treat the siamese transformer as model A and the Siamese Aux MLP as model B

In [129]:
# prompt: the combined models of model A (Siamese Transformer) and model B (Siamese Auxillary MLP)
class ModelAnB(nn.Module):
  def __init__(self, transformer_model, aux_model, transformer_output_dim=1):
        super(ModelAnB, self).__init__()
        # include models
        self.transformer = transformer_model
        self.aux_model = aux_model
        # Combine both outputs
        self.final_fc = nn.Linear(transformer_output_dim, 1)
        #self.aux_weight = nn.Parameter(torch.tensor(0.5))
        #force the weight to be 1 or 0 to turn on/off model A or B.
        #self.aux_weight = nn.Tensor([1])
        self.aux_weight = nn.Tensor([0])


##TODO must be passed team_comp and aux_info somehow before we can test
  def forward(self, team_comp, aux_info):
        # Transformer processes team comp
        comp_output = self.transformer(team_comp)
        # ModelB processes aux features
        aux_output = self.aux_model(aux_info)

        # idea1: Combine via addition maybe add weight to outputs for better results
        combined = (1 - self.aux_weight) * comp_output + self.aux_weight * aux_output

        # idea2: Combine via concat and pass through a final linear layer idk which would be best
        # combined = torch.cat([comp_output, aux_output], dim=1)  # shape: (batch_size, 2)

        combined = self.final_fc(combined)
        return torch.sigmoid(combined)


### issue with loss? (fixed in oversampling)
https://stackoverflow.com/questions/65219569/pytorch-gives-incorrect-results-due-to-broadcasting

Code example

```
# Define the model
model = NeuralNetwork()

# Define the loss function
loss_function = CrossEntropyLoss()

# Define the optimizer
optimizer = GradientDescentOptimizer()

# Train the model
for epoch in range(200):
    # Get the data
    data = get_data()

    # Calculate the loss
    loss = loss_function(model(data), labels)

    # X and true_y

    # Backpropagate the loss
    gradient = loss_function.backward()

    # Update the parameters
    optimizer.update(parameters, gradient)

    print("loss: "+ str(loss))
# Evaluate the model
accuracy = model.evaluate(data, labels)
```

#Split Training and Testing

Can we split based on season? (season feature in the dataset)

In [123]:
embedding_dim = 8

# model = Net
# model_name += "NN.pth"

# model = SiameseNetwork
# model_name += "SiameseNetwork.pth"

# model = TransformerModel
# model_name += "TransformerModel.pth"

# model = Aux_MLP
# model_name += "Aux_MLP.pth"

# model = SiamTransformerModel
# model_name += "SiamTransformerModel.pth"

# Instantiate your base models
siamese_transformer = SiamTransformerModel(embedding_dim=embedding_dim)
aux_mlp = Aux_MLP(hidden_dim=64, embedding_dim=embedding_dim)

# Instantiate the combined model
model = ModelAnB(transformer_model=siamese_transformer, aux_model=aux_mlp)




In [124]:
model_name

'lol_model'

In [126]:
print("model:", model)
print("type(model):", type(model))
print("embedding_dim:", embedding_dim)
print("type(embedding_dim):", type(embedding_dim))

if(load_saved_data):
  net = model(embedding_dim).to(device)
  # net = torch.load(colab_folder_path + model_name)
  #net.load_state_dict(torch.load(colab_folder_path + model_name))
  #net.eval()
print(torch.cuda.is_available())
  #encoder = joblib.load(colab_folder_path + encoder_name)

model: ModelAnB(
  (transformer): SiamTransformerModel(
    (embedding): Embedding(136, 8)
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
          )
          (linear1): Linear(in_features=8, out_features=64, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=64, out_features=8, bias=True)
          (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (fc): Linear(in_features=8, out_features=1, bias=True)
  )
  (aux_model): Aux_MLP(
    (fc1): Linear(in_features=1, out_features=64, bias=True)
    (dropout): Dropout(p=0.2, in

##Testing code here

So I can debug the code and inspect the values

In [127]:
report_cycle = 10

In [130]:
from torch.utils.data import DataLoader, TensorDataset

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

k_fold = KFold(n_splits=10, shuffle=True, random_state=123)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
criterion = torch.nn.BCELoss()

for train_index, test_index in k_fold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, shuffle=True)

    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    net = model(embedding_dim).to(device)
    optimizer = optim.Adam(net.parameters(), lr=0.00001)
    patience = 100
    best_loss = float('inf')
    counter = 0

    best_model_path = "best_model.pt"

    for epoch in range(10000):
        net.train()
        for X_batch, y_batch in train_loader:
            # Move data to GPU
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            pred_y = net(X_batch)
            loss = criterion(pred_y.flatten(), y_batch.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % report_cycle == 0:
            print(f"Epoch {epoch} training loss: {loss.item()}")

        net.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                # Move data to GPU
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                val_pred_y = net(X_batch)
                val_loss += criterion(val_pred_y.flatten(), y_batch.float()).item()
        val_loss /= len(val_loader)

        if epoch % report_cycle == 0:
            print(f"Validation loss: {val_loss}")

        if val_loss < best_loss:
            best_loss = val_loss
            counter = 0
            torch.save(net.state_dict(), best_model_path)
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping on epoch: {epoch}")
                break
    # Load the best model for evaluation
    net.load_state_dict(torch.load(best_model_path))
    net.eval()
    with torch.no_grad():
        # Move test data to GPU
        X_test, y_test = X_test.to(device), y_test.to(device)

        pred_y = net(X_test)
        pred_y[pred_y > 0.5] = 1
        pred_y[pred_y <= 0.5] = 0
        acc = torch.sum(pred_y.flatten() == y_test) / len(y_test)
        # print(f"Accuracy: {acc}")
        accuracy_scores.append(acc)

        # Convert to numpy arrays for metric calculations
        y_test_np = y_test.cpu().numpy()
        pred_y_np = pred_y.flatten().cpu().numpy()

        precision = precision_score(y_test_np, pred_y_np)
        recall = recall_score(y_test_np, pred_y_np)
        f1 = f1_score(y_test_np, pred_y_np)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

        print(f"Accuracy: {acc}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1-score: {f1}")

    #right now only running one 10-fold, comment to run ten 10-folds.
    # break

# avg_accuracy = np.mean(accuracy_scores.cpu())
# avg_precision = np.mean(precision_scores.cpu())
# avg_recall = np.mean(recall_scores.cpu())
# avg_f1 = np.mean(f1_scores.cpu())

# print("\n--- Average Metrics Across Folds ---")
# print(f"Average Accuracy: {avg_accuracy}")
# print(f"Average Precision: {avg_precision}")
# print(f"Average Recall: {avg_recall}")
# print(f"Average F1-score: {avg_f1}")

Using device: cuda


TypeError: ModelAnB.forward() missing 1 required positional argument: 'aux_info'

In [131]:
# prompt: count how many pred_y are 1 or 0

count_ones = (pred_y == 1).sum().item()
count_zeros = (pred_y == 0).sum().item()

print(f"Number of 1s in pred_y: {count_ones}")
print(f"Number of 0s in pred_y: {count_zeros}")


NameError: name 'pred_y' is not defined

In [None]:
# prompt: count how many pred_y are 1 or 0

count_ones = (y_test == 1).sum().item()
count_zeros = (y_test == 0).sum().item()

print(f"Number of 1s in y_test: {count_ones}")
print(f"Number of 0s in y_test: {count_zeros}")


Number of 1s in y_test: 40
Number of 0s in y_test: 55


In [None]:
import pandas as pd

# Convert tensors to numpy arrays (detach from computation graph first)
pred_y_np = pred_y.detach().cpu().numpy().flatten()
y_test_np = y_test.detach().cpu().numpy().flatten()

# Align lengths to the smaller of the two
min_len = min(len(pred_y_np), len(y_test_np))
pred_y_np = pred_y_np[:min_len]
y_test_np = y_test_np[:min_len]

# Create a DataFrame for comparison
comparison_df = pd.DataFrame({
    'pred_y': pred_y_np,
    'y_test': y_test_np
})

# Display the DataFrame
print(comparison_df)

      pred_y  y_test
0   0.500582     0.0
1   0.491270     0.0
2   0.522304     0.0
3   0.483105     0.0
4   0.483382     0.0
5   0.514840     0.0
6   0.538636     0.0
7   0.513737     0.0
8   0.484987     0.0
9   0.486364     0.0
10  0.519036     0.0
11  0.492561     0.0
12  0.495548     0.0
13  0.523847     0.0
14  0.478033     0.0
15  0.480926     0.0
16  0.525774     0.0
17  0.498437     0.0
18  0.507686     0.0
19  0.506999     0.0
20  0.508821     0.0
21  0.500972     0.0
22  0.497374     0.0
23  0.491954     0.0
24  0.461200     0.0
25  0.494932     0.0
26  0.487836     0.0
27  0.490095     0.0
28  0.491961     0.0
29  0.494598     0.0
30  0.518208     0.0
31  0.525704     0.0


In [None]:
import pandas as pd

# Detach and convert tensors to NumPy arrays
pred_y_np = pred_y.detach().cpu().numpy().flatten()
y_test_np = y_test.detach().cpu().numpy().flatten()

# Create a DataFrame for side-by-side comparison
comparison_df = pd.DataFrame({'pred_y': pred_y_np, 'y_test': y_test_np})

# Display the first and last 40 rows (if available)
n = 40
if len(comparison_df) > 2 * n:
    display_df = pd.concat([comparison_df.head(n), comparison_df.tail(n)])
else:
    display_df = comparison_df  # show entire DataFrame if too small

print(display_df)


ValueError: All arrays must be of the same length

In [None]:
net

Aux_MLP(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
)

In [None]:
unique_values_in_X

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [None]:
X_train

tensor([[13,  7, 17,  ...,  0, 12, 18],
        [16,  8, 13,  ..., 13, 18,  4],
        [15, 16,  6,  ...,  8,  4, 19],
        ...,
        [ 4,  1, 12,  ..., 13, 16, 19],
        [ 5, 14, 10,  ...,  9, 15, 18],
        [15, 14,  7,  ...,  7, 13, 18]])

In [None]:
accuracy_scores

[tensor(0.8421)]

In [None]:
recall_scores

[0.8636363636363636]

In [None]:
# Convert PyTorch tensors in the lists to Python floats
accuracy_scores = [score.item() if isinstance(score, torch.Tensor) else score for score in accuracy_scores]
precision_scores = [score.item() if isinstance(score, torch.Tensor) else score for score in precision_scores]
recall_scores = [score.item() if isinstance(score, torch.Tensor) else score for score in recall_scores]
f1_scores = [score.item() if isinstance(score, torch.Tensor) else score for score in f1_scores]

# Calculate averages using NumPy
avg_accuracy = np.mean(accuracy_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

# Print average metrics
print("\n--- Average Metrics Across Folds ---")
print(f"Average Accuracy: {avg_accuracy}")
print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F1-score: {avg_f1}")


--- Average Metrics Across Folds ---
Average Accuracy: 0.8421052694320679
Average Precision: 0.8085106382978723
Average Recall: 0.8636363636363636
Average F1-score: 0.8351648351648352


In [None]:
# prompt: same as above but taking out 0 values and corresponding values from the lists accuracy_scores precision_scores recall_scores f1_scores

import numpy as np

# Assuming accuracy_scores, precision_scores, recall_scores, and f1_scores are lists of numerical values

# Find indices where any of the scores is 0
zero_indices = [i for i, (acc, prec, rec, f1) in enumerate(zip(accuracy_scores, precision_scores, recall_scores, f1_scores)) if acc == 0 or prec == 0 or rec == 0 or f1 == 0]


# Remove elements at zero_indices from all lists
accuracy_scores = [score for i, score in enumerate(accuracy_scores) if i not in zero_indices]
precision_scores = [score for i, score in enumerate(precision_scores) if i not in zero_indices]
recall_scores = [score for i, score in enumerate(recall_scores) if i not in zero_indices]
f1_scores = [score for i, score in enumerate(f1_scores) if i not in zero_indices]

# Convert lists to numpy arrays for easier calculations (if necessary)
accuracy_scores = np.array(accuracy_scores)
precision_scores = np.array(precision_scores)
recall_scores = np.array(recall_scores)
f1_scores = np.array(f1_scores)


# Calculate and print averages
avg_accuracy = np.mean(accuracy_scores) if len(accuracy_scores)>0 else 0
avg_precision = np.mean(precision_scores) if len(precision_scores)>0 else 0
avg_recall = np.mean(recall_scores) if len(recall_scores)>0 else 0
avg_f1 = np.mean(f1_scores) if len(f1_scores)>0 else 0

print("\n--- Average Metrics Across Folds (excluding zero values)---")
print(f"Average Accuracy: {avg_accuracy}")
print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F1-score: {avg_f1}")


--- Average Metrics Across Folds (excluding zero values)---
Average Accuracy: 0.8421052694320679
Average Precision: 0.8085106382978723
Average Recall: 0.8636363636363636
Average F1-score: 0.8351648351648352


### Save the model

You don't need to save the encoder here. The label encoder is set up early on, never trained, so it is saved early too.

In [None]:
torch.save(net.state_dict(), colab_folder_path + model_name)

## Embedding inspection

In [None]:
net.embedding

Embedding(20, 8)

In [None]:
# prompt: use net's embedding layer to extract the embedding of X[0]

# Assuming 'net' is your trained model and 'X' is your data
embedding = net.embedding(torch.tensor(X[0]).to(device))
print(encoder.inverse_transform(X[0]))
embedding

[ 8  7  5  4 14  6  3 19  0 10 15  3  2 17 14  9  6 19  0  1]


  embedding = net.embedding(torch.tensor(X[0]).to(device))


tensor([[-0.0074, -0.7047,  0.8624,  2.4348,  0.3178,  3.7762,  0.6283,  0.8018],
        [-1.9350,  1.0974, -1.0789, -0.5513,  0.4804, -0.2421, -0.8965,  1.9435],
        [-0.9595, -0.9219, -1.0653, -0.8168, -0.3605, -0.1400, -0.2840,  1.1575],
        [ 0.4398, -0.5294, -1.1961,  0.0926,  0.1025, -0.4355, -0.3560,  0.8012],
        [-0.3059,  1.3865,  1.5282,  0.1190, -1.4767, -0.0671, -0.4755,  0.6376],
        [-1.5113,  0.8695, -0.0570, -0.1340, -0.5749,  2.0666,  0.0700, -1.5822],
        [ 1.2280,  1.0378,  0.7247, -0.7896,  0.5633,  1.2793, -0.3448,  0.3526],
        [ 0.5201,  1.2816,  0.0234,  0.2582, -2.1164,  1.7342, -0.7888,  1.1597],
        [ 1.3020, -0.2660,  0.5155, -0.5791,  1.2082,  0.2068, -0.3337, -0.6095],
        [-1.4330, -0.4111, -1.0544, -0.5251, -0.0776,  1.1126, -1.8244,  2.7839],
        [ 0.3273,  0.6566, -0.7577,  0.5030, -0.4067,  0.2204,  0.7338,  1.0099],
        [ 1.2280,  1.0378,  0.7247, -0.7896,  0.5633,  1.2793, -0.3448,  0.3526],
        [ 0.6251

In [None]:
embedding = net.embedding(torch.tensor(X[1]).to(device))
print(encoder.inverse_transform(X[1]))
embedding

[14 17  6 11  8  5  0 10  2  3 11 10  2  7 16  9  0  6 13 12]


  embedding = net.embedding(torch.tensor(X[1]).to(device))


tensor([[-3.0588e-01,  1.3865e+00,  1.5282e+00,  1.1905e-01, -1.4767e+00,
         -6.7067e-02, -4.7554e-01,  6.3764e-01],
        [-6.3020e-01,  1.0367e+00, -1.1518e+00, -8.4653e-01, -6.9898e-01,
         -6.4084e-02, -1.8948e+00, -4.2788e-01],
        [-1.5113e+00,  8.6950e-01, -5.6973e-02, -1.3397e-01, -5.7486e-01,
          2.0666e+00,  6.9987e-02, -1.5822e+00],
        [-6.7509e-02,  1.3965e+00,  4.6839e-01,  2.0231e+00,  1.6004e+00,
          1.9013e-01,  2.4007e+00, -1.3505e-01],
        [-7.3722e-03, -7.0466e-01,  8.6242e-01,  2.4348e+00,  3.1781e-01,
          3.7762e+00,  6.2828e-01,  8.0180e-01],
        [-9.5945e-01, -9.2188e-01, -1.0653e+00, -8.1679e-01, -3.6046e-01,
         -1.4000e-01, -2.8399e-01,  1.1575e+00],
        [ 1.3020e+00, -2.6605e-01,  5.1549e-01, -5.7915e-01,  1.2082e+00,
          2.0682e-01, -3.3370e-01, -6.0947e-01],
        [-1.4330e+00, -4.1107e-01, -1.0544e+00, -5.2513e-01, -7.7584e-02,
          1.1126e+00, -1.8244e+00,  2.7839e+00],
        [ 6.2512

In [None]:
name_to_embedding = {}
for unique_X in unique_values_in_X:
  embedding = net.embedding(torch.tensor(unique_X).to(device))
  name_to_embedding[encoder.inverse_transform([unique_X])[0]] = embedding

In [None]:
unique_values_in_X

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [None]:
name_to_embedding

{np.int64(0): tensor([ 1.3020, -0.2660,  0.5155, -0.5791,  1.2082,  0.2068, -0.3337, -0.6095],
        grad_fn=<EmbeddingBackward0>),
 np.int64(1): tensor([-1.2169,  0.9630,  0.2599, -1.0199, -0.0987, -0.9060,  1.3007,  1.5454],
        grad_fn=<EmbeddingBackward0>),
 np.int64(2): tensor([ 0.6251, -0.8881, -0.4095, -0.4030, -1.2283, -0.7616, -0.7756, -0.0054],
        grad_fn=<EmbeddingBackward0>),
 np.int64(3): tensor([ 1.2280,  1.0378,  0.7247, -0.7896,  0.5633,  1.2793, -0.3448,  0.3526],
        grad_fn=<EmbeddingBackward0>),
 np.int64(4): tensor([ 0.4398, -0.5294, -1.1961,  0.0926,  0.1025, -0.4355, -0.3560,  0.8012],
        grad_fn=<EmbeddingBackward0>),
 np.int64(5): tensor([-0.9595, -0.9219, -1.0653, -0.8168, -0.3605, -0.1400, -0.2840,  1.1575],
        grad_fn=<EmbeddingBackward0>),
 np.int64(6): tensor([-1.5113,  0.8695, -0.0570, -0.1340, -0.5749,  2.0666,  0.0700, -1.5822],
        grad_fn=<EmbeddingBackward0>),
 np.int64(7): tensor([-1.9350,  1.0974, -1.0789, -0.5513,  0.4

In [None]:
# Joey's spaghetti code
# get embeddings for all NBA players
# name_to_embedding = {}
# for game in X:
#   embeds = net.embedding(torch.tensor(game).to(device))
#   names = encoder.inverse_transform(game)
#   for i in range(len(names)):
#     name_to_embedding[names[i]] = embeds[i]

In [None]:
print(len(name_to_embedding))

20


In [None]:
name_to_embedding[135]

KeyError: 135

In [None]:
## takes two n-dimensional vectors, and calculates the manhattan distance between the two vectors
def manhattan_distance(x1, x2):
    if(len(x1) != len(x2)):
        print("Error: x1 and x2 must be the same length")
    dist = 0
    for i in range(len(x1)):
        diff = x1[i] - x2[i]
        dist += abs(diff)
    return dist

In [None]:
def nearest_embeddings(name):
  if name not in name_to_embedding:
    print("Error: name not in name_to_embedding")
    return
  closest = []
  name_embedding = name_to_embedding[name]
  for name, embedding in name_to_embedding.items():
    #print(name, manhattan_distance(name_embedding, embedding))
    closest.append((name, manhattan_distance(name_embedding, embedding)))
  closest.sort(key=lambda x: x[1])
  return closest[:10]

In [None]:
nearest_embeddings(99)

Error: name not in name_to_embedding


In [None]:
nearest_embeddings(44)

Error: name not in name_to_embedding


In [None]:
nearest_embeddings(37)

Error: name not in name_to_embedding


In [None]:
nearest_embeddings("Yao Ming")

Error: name not in name_to_embedding


In [None]:
nearest_embeddings("Kobe Bryant")

Error: name not in name_to_embedding


In [None]:
nearest_embeddings("Shaquille O'Neal")

Error: name not in name_to_embedding


In [None]:
nearest_embeddings("Paul George")

Error: name not in name_to_embedding


In [None]:
nearest_embeddings("Calbert Cheaney")

Error: name not in name_to_embedding


A even better idea is to analysis their key, query, value separately. However, this is not easy in the current implementation. You will need to have custom transformer encoder to do that.

#Real Train Test Experiments

In [None]:
def run_model(model, X, y, embedding_dim):
  #X = torch.tensor(X, dtype=torch.long)  # Use torch.long for indices
  #y = torch.tensor(y, dtype=torch.float) # Use torch.float for labels
  k_fold = KFold(n_splits=10, shuffle=True, random_state=123)
  # Create a list to store the accuracy scores for each fold
  accuracy_scores = []
  # criterion = torch.nn.MSELoss()
  criterion = torch.nn.BCELoss()
  for train_index, test_index in k_fold.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #split training data for validation
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, shuffle=True)

    # initialize the net
    net = model(embedding_dim)  # Training
    # net = SiameseNetwork(embedding_dim) # Siamese

    optimizer = optim.Adam(net.parameters(), lr= 0.0001)
    patience= 100
    best_loss = float('inf')
    counter = 0
    for epoch in range(1000):
        ##batch training
        #TODO try minibatch
        #pred_y = net(X_train.long())
        pred_y = net(X_train)

        # Calculate the loss
        loss = criterion(pred_y.flatten(), y_train.float())

        # Zero the gradients
        optimizer.zero_grad()

        # Backpropagate the loss
        loss.backward()

        # Update the parameters
        optimizer.step()

        if(epoch%100 == 0):
          print("Epoch "+ str(epoch) +" training loss: "+ str(loss.item()))
        #comment out the two lines below to make the task more difficult
        #X_val = X_test
        #y_val = y_test

        val_pred_y = net(X_val) # Changed val_pred_y to val_pred_y_1, val_pred_y_2
        val_loss = criterion(val_pred_y.flatten(), y_val.float()) # Changed val_pred_y to val_pred_y_1 and val_loss to val_loss_1

        if(epoch%100 == 0):
          print("validation loss: "+ str(val_loss.item()))

        if val_loss.item() < best_loss:
            best_loss = val_loss.item()
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping on epoch: " + str(epoch))
                break





    # testing
    pred_y = net(X_test)
    #convert all pred_y value more than 0.5 to 1,
    #and the rest to 0
    pred_y[pred_y>0.5] = 1
    pred_y[pred_y<=0.5] = 0
    # Accuracy on the training set.

    acc = torch.sum(pred_y.flatten() == y_test) / len(y_test)
    print("Accuracy:", acc)
    # Add the accuracy score to the list
    accuracy_scores.append(acc)

  print("Average accuracy:", np.mean(accuracy_scores))

I tried the following combinations of learning parameters



In [None]:
class SiameseNetwork(nn.Module):
    def __init__(self, embedding_dim):
        super(SiameseNetwork, self).__init__()
        self.embedding = nn.Embedding(200000, embedding_dim)

        # The input to the linear layer is the embedding dimension (embedding_dim)
        # multiplied by the length of the input sequence. Assuming the input
        # sequence has 5 words, and embedding_dim is 8, the input size would be 40 (8*5).
        # Change `in_features` to reflect the actual size of the input to the linear layer.
        self.ln1 = nn.Linear(embedding_dim * 5, 16)  # Adjust in_features to match embedding_dim * sequence_length
        self.ln2 = nn.Linear(16, 8)
        self.ln3 = nn.Linear(8, 1)

    def forward(self, x):
        x1 = x[:5]
        x2 = x[5:]
        x1 = self.embedding(x1)
        x2 = self.embedding(x2)

        # Reshape to (batch_size, embedding_dim * sequence_length)
        x1 = x1.view(x1.size(0), -1)
        x2 = x2.view(x2.size(0), -1)

        x1 = self.ln1(x1)
        x1 = F.relu(x1)
        x2 = self.ln1(x2)
        x2 = F.relu(x2)
        x = torch.abs(x1 - x2)
        x = self.ln2(x)
        x = F.relu(x)
        x = self.ln3(x)
        x = torch.sigmoid(x)
        return x

In [None]:
run_model(Net, lolX, loly, 8)

NameError: name 'lolX' is not defined

In [None]:
run_model(SiameseNetwork, X, y, 8)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (5x160 and 40x16)

In [None]:
run_model(TransformerModel, lolX, loly, 8)

NameError: name 'lolX' is not defined

#END OF FILE

Use the following code to salvage on the output if google disconnects you when your colab takes too long.

In [None]:
# prompt: read the new 2.txt file into log_output

log_output = ""
with open("2.txt", "r") as file:
    log_output = file.read()
log_output

FileNotFoundError: [Errno 2] No such file or directory: '2.txt'

In [None]:
import re
import numpy as np



# Regular expressions to capture the scores
accuracy_pattern = re.compile(r"Accuracy: ([\d.]+)")
precision_pattern = re.compile(r"Precision: ([\d.]+)")
recall_pattern = re.compile(r"Recall: ([\d.]+)")
f1_score_pattern = re.compile(r"F1-score: ([\d.]+)")

# Extract scores
accuracies = accuracy_pattern.findall(log_output)
precisions = precision_pattern.findall(log_output)
recalls = recall_pattern.findall(log_output)
f1_scores = f1_score_pattern.findall(log_output)

# Print the results
for i in range(len(accuracies)):
    print(f"Run {i+1}:")
    print(f"  Accuracy: {accuracies[i]}")
    print(f"  Precision: {precisions[i]}")
    print(f"  Recall: {recalls[i]}")
    print(f"  F1-score: {f1_scores[i]}")

#convert accuracies to list of number
accuracies = [float(acc) for acc in accuracies]
precisions = [float(prec) for prec in precisions]
recalls = [float(rec) for rec in recalls]
f1_scores = [float(f1) for f1 in f1_scores]

print( "Average Accuracy:", {np.mean(accuracies)})
print( "Average Precision:", {np.mean(precisions)})
print( "Average Recall:", {np.mean(recalls)})
print( "Average F1-score:", {np.mean(f1_scores)})

Average Accuracy: {np.float64(nan)}
Average Precision: {np.float64(nan)}
Average Recall: {np.float64(nan)}
Average F1-score: {np.float64(nan)}


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
