# Basketball Playoffs Predictive Model

## Data Understanding/Preparation

After understanding what the data means, we are ready to start processing it. Our first step in this phase is to create a SQLite database, which speeds up data access and makes it easier to join data from the various tables.

To fulfil the data needs we defined this schema:

```sql
-- Awards table: one row per (player, award, year, league) entry.
-- No primary key is declared; playerID points at players.bioID.
CREATE TABLE awards_players (
    playerID TEXT,
    award    TEXT,
    year     INTEGER,
    lgID     TEXT,
    CONSTRAINT fk_playerID FOREIGN KEY (
        playerID
    )
    REFERENCES players (bioID) 
);

-- Coaching records, keyed by (coachID, year, tmID, stint).
-- (year, tmID) references the team row in teams.
CREATE TABLE coaches (
    coachID     TEXT,
    year        INTEGER,
    tmID        TEXT,
    lgID        TEXT,
    stint       INTEGER,
    won         INTEGER,
    lost        INTEGER,
    post_wins   INTEGER,
    post_losses INTEGER,
    PRIMARY KEY (
        coachID,
        year,
        tmID,
        stint
    ),
    CONSTRAINT fk_year_tmID FOREIGN KEY (
        year,
        tmID
    )
    REFERENCES teams (year,
    tmID) 
);

-- Biographical data, one row per person identified by bioID.
-- bioID is the key referenced by awards_players.playerID and
-- players_teams.playerID.
CREATE TABLE players (
    bioID        TEXT    PRIMARY KEY,
    pos          TEXT,
    firstseason  INTEGER,
    lastseason   INTEGER,
    height       REAL,
    weight       INTEGER,
    college      TEXT,
    collegeOther TEXT,
    birthDate    TEXT,   -- stored as text, not as a date type
    deathDate    TEXT    -- stored as text, not as a date type
);

-- Per-player statistics for a given year, stint and team.
-- The Post* columns mirror the preceding statistics columns
-- (presumably post-season totals -- confirm against the data source).
-- No primary key is declared; playerID references players.bioID and
-- (year, tmID) references teams.
CREATE TABLE players_teams (
    playerID           TEXT,
    year               INTEGER,
    stint              INTEGER,
    tmID               TEXT,
    lgID               TEXT,
    GP                 INTEGER,
    GS                 INTEGER,
    minutes            INTEGER,
    points             INTEGER,
    oRebounds          INTEGER,
    dRebounds          INTEGER,
    rebounds           INTEGER,
    assists            INTEGER,
    steals             INTEGER,
    blocks             INTEGER,
    turnovers          INTEGER,
    PF                 INTEGER,
    fgAttempted        INTEGER,
    fgMade             INTEGER,
    ftAttempted        INTEGER,
    ftMade             INTEGER,
    threeAttempted     INTEGER,
    threeMade          INTEGER,
    dq                 INTEGER,
    PostGP             INTEGER,
    PostGS             INTEGER,
    PostMinutes        INTEGER,
    PostPoints         INTEGER,
    PostoRebounds      INTEGER,
    PostdRebounds      INTEGER,
    PostRebounds       INTEGER,
    PostAssists        INTEGER,
    PostSteals         INTEGER,
    PostBlocks         INTEGER,
    PostTurnovers      INTEGER,
    PostPF             INTEGER,
    PostfgAttempted    INTEGER,
    PostfgMade         INTEGER,
    PostftAttempted    INTEGER,
    PostftMade         INTEGER,
    PostthreeAttempted INTEGER,
    PostthreeMade      INTEGER,
    PostDQ             INTEGER,
    CONSTRAINT fk_playerID FOREIGN KEY (
        playerID
    )
    REFERENCES players (bioID),
    CONSTRAINT fk_year_tmID FOREIGN KEY (
        year,
        tmID
    )
    REFERENCES teams (year,
    tmID) 
);

-- Series results: one row per series with its winner and loser.
-- Both (year, tmIDWinner) and (year, tmIDLoser) reference teams (year, tmID).
-- No primary key is declared.
CREATE TABLE series_post (
    year       INTEGER,
    round      TEXT,
    series     TEXT,
    tmIDWinner TEXT,
    lgIDWinner TEXT,
    tmIDLoser  TEXT,
    lgIDLoser  TEXT,
    W          INTEGER,
    L          INTEGER,
    CONSTRAINT fk_year_tmIDWinner FOREIGN KEY (
        year,
        tmIDWinner
    )
    REFERENCES teams (year,
    tmID),
    CONSTRAINT fk_year_tmIDLoser FOREIGN KEY (
        year,
        tmIDLoser
    )
    REFERENCES teams (year,
    tmID) 
);

-- Team record per year, keyed by (year, tmID); this is the table that
-- coaches, players_teams and series_post reference.
-- NOTE(review): the o_* / d_* prefixes presumably separate the team's own
-- totals from the totals it conceded -- confirm against the data source.
CREATE TABLE teams (
    year       INTEGER,
    lgID       TEXT,
    tmID       TEXT,
    franchID   TEXT,
    confID     TEXT,
    divID      TEXT,
    rank       INTEGER,
    playoff    TEXT,
    seeded     INTEGER,
    firstRound TEXT,
    semis      TEXT,
    finals     TEXT,
    name       TEXT,
    o_fgm      INTEGER,
    o_fga      INTEGER,
    o_ftm      INTEGER,
    o_fta      INTEGER,
    o_3pm      INTEGER,
    o_3pa      INTEGER,
    o_oreb     INTEGER,
    o_dreb     INTEGER,
    o_reb      INTEGER,
    o_asts     INTEGER,
    o_pf       INTEGER,
    o_stl      INTEGER,
    o_to       INTEGER,
    o_blk      INTEGER,
    o_pts      INTEGER,
    d_fgm      INTEGER,
    d_fga      INTEGER,
    d_ftm      INTEGER,
    d_fta      INTEGER,
    d_3pm      INTEGER,
    d_3pa      INTEGER,
    d_oreb     INTEGER,
    d_dreb     INTEGER,
    d_reb      INTEGER,
    d_asts     INTEGER,
    d_pf       INTEGER,
    d_stl      INTEGER,
    d_to       INTEGER,
    d_blk      INTEGER,
    d_pts      INTEGER,
    tmORB      INTEGER,
    tmDRB      INTEGER,
    tmTRB      INTEGER,
    opptmORB   INTEGER,
    opptmDRB   INTEGER,
    opptmTRB   INTEGER,
    won        INTEGER,
    lost       INTEGER,
    GP         INTEGER,
    homeW      INTEGER,
    homeL      INTEGER,
    awayW      INTEGER,
    awayL      INTEGER,
    confW      INTEGER,
    confL      INTEGER,
    min        INTEGER,
    attend     INTEGER,
    arena      TEXT,
    PRIMARY KEY (
        year,
        tmID
    )
);
```

Then we populated the database with the provided data.

In [None]:
%matplotlib inline

import subprocess
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sb

import sqlite3
import subprocess

# The database file is expected to exist already. It can be built by piping
# the SQL dump through the sqlite3 command-line tool, e.g. from a small shell
# script run under WSL:
#     cat ./database/final.sql | sqlite3 ./database/bdfinal.sql

# Open the connection that the rest of the notebook shares.
DATABASE_PATH = "./database/bdfinal.sql"
connection = sqlite3.connect(DATABASE_PATH)

### Missing Values

Before we train models on this data we need to assess its quality. One important step is finding missing values.

We will look for missing values in all the created tables.

In [None]:
# Report rows with missing data, table by table.
#
# Two kinds of "missing" are checked:
#   * SQL NULLs in any column (NaN/None once loaded into pandas), for every
#     table listed below;
#   * for `players` only, placeholder values that stand in for a missing
#     value: empty strings, zeros and the "0000-00-00" birth date.
#
# NOTE(review): `teams_post` is queried here but has no CREATE TABLE statement
# in the schema listed at the top of the notebook -- confirm it exists in the
# database file.
SEPARATOR = "\n=========================================\n"
TABLES_TO_CHECK = [
    "coaches",
    "players",
    "players_teams",
    "series_post",
    "teams",
    "teams_post",
]

for table_name in TABLES_TO_CHECK:
    print(SEPARATOR)

    # Table names come from the fixed list above, so formatting them into the
    # statement is safe (identifiers cannot be bound as SQL parameters).
    df = pd.read_sql(f"SELECT * FROM {table_name};", connection)

    # Keep only the rows holding at least one null value.
    null_mask = df.isnull().any(axis=1)
    null_rows = df[null_mask]
    print(null_rows)

    if table_name == "players":
        # Vectorised placeholder check: a player is reported when any of the
        # inspected columns carries a placeholder value.
        placeholder_mask = (
            (df["pos"] == "")
            | (df["height"] == 0)
            | (df["weight"] == 0)
            | (df["birthDate"] == "")
            | (df["birthDate"] == "0000-00-00")
            | (df["college"] == "")
        )
        for bio_id in df.loc[placeholder_mask, "bioID"]:
            print(bio_id)

### Player

- After a quick glance at the data, it's easy to see that a number of players have important missing/null values (college, height and weight).

In [None]:
# Players carrying a placeholder (0 or empty string) in at least one of
# weight, height, college or position.
placeholder_players_sql = "select bioID from players where weight = 0 or height = 0 or college = '' or pos = '';"
dataframe = pd.read_sql(placeholder_players_sql, connection)
print(dataframe)

- Let's check whether any of these players with a null position are actually coaches, since the awards_players table has a coach award and references the players table.

In [None]:
# How many players have an empty position?
no_position_sql = "select count(*) from players where pos = '';"
df = pd.read_sql(no_position_sql, connection)
num_players = df.iloc[0, 0]
print(num_players)

# How many of those also appear in the coaches table?
no_position_coaches_sql = "select count(*) from players where pos = '' and bioID in (select coachID from coaches);"
df = pd.read_sql(no_position_coaches_sql, connection)
num_players = df.iloc[0, 0]
print(num_players)


- 52 out of the 78 players without position are coaches

- Of these 208 players, it's important to see which ones were actually part of a team

In [None]:
# Players that do have a position but are missing (0) weight and/or height;
# these are the rows the imputation step further down iterates over.
missing_measurements_sql = "select distinct(bioID), weight, height, pos from players where (weight = 0 or height = 0) and pos <> ''"
active_missing_values_players = pd.read_sql(missing_measurements_sql, connection)
print(active_missing_values_players)

- Regarding these 83 players, if a player doesn't have their position missing, we decided to replace their missing weight and/or height values with the average value of the players of their same position. 

    - Obtain the average weight and height for each player position:

In [None]:
# One query per measurement: average per position, leaving out the zero
# placeholders so they do not drag the averages down.
weights_by_position_sql = "select pos, avg(weight) from players where weight <> 0 group by pos;"
heights_by_position_sql = "select pos, avg(height) from players where height <> 0 group by pos;"

avg_pos_weights = pd.read_sql(weights_by_position_sql, connection)
print(avg_pos_weights)
avg_pos_heights = pd.read_sql(heights_by_position_sql, connection)
print(avg_pos_heights)


- Store the values in two dictionaries, where the key values are the players' positions

In [None]:
# Lookup tables keyed by position: position -> average weight / height.
avg_weights = dict(zip(avg_pos_weights["pos"], avg_pos_weights["avg(weight)"]))

print(avg_weights)

avg_heights = dict(zip(avg_pos_heights["pos"], avg_pos_heights["avg(height)"]))

print(avg_heights)

In [None]:

# Replace missing (zero) weight/height values with the average of the
# player's position, using the avg_weights / avg_heights lookup tables.
for _, row in active_missing_values_players.iterrows():
    # Re-read the player so the decision is based on what is currently stored
    # (the cell may have been run before). The id is bound as a parameter
    # instead of being concatenated into the statement.
    player = pd.read_sql(
        "select * from players where bioID = ?;",
        connection,
        params=(row["bioID"],),
    )

    pos = player["pos"].values[0]
    if pos == '':
        # Without a position there is no positional average to fall back on.
        continue

    current_weight = player["weight"].values[0]
    current_height = player["height"].values[0]

    # Compare scalars: evaluating a pandas Series in a boolean context
    # (`series != 0 and ...`) raises ValueError.
    if current_weight != 0 and current_height != 0:
        # Player already has both values.
        continue

    print(player)

    print("\n===\n")

    # Keep the stored value when present, otherwise use the position average.
    weight = avg_weights[pos] if current_weight == 0 else current_weight
    height = avg_heights[pos] if current_height == 0 else current_height

    bio_id = player["bioID"].values[0]

    # Log the statement that is about to be applied.
    print("UPDATE players SET height = '" + str(height) + "', weight = '" + str(weight) + "' WHERE bioID = '" + bio_id + "';")

    # sqlite3 cannot bind NumPy integer scalars, so the values are converted
    # to plain Python floats before being passed as parameters.
    connection.execute(
        "UPDATE players SET height = ?, weight = ? WHERE bioID = ?;",
        (float(height), float(weight), bio_id),
    )

# One commit for the whole batch of updates.
connection.commit()
    

# Outliers

Now let's check if there are any outliers in the data.

## Player

In this table we will be looking for outliers in weight, height and birth dates.

In [None]:
# Box plot of player weights, leaving out the 0 placeholder used for
# missing values.
df = pd.read_sql("SELECT weight FROM players;", connection)
df = df[df.weight != 0]

plt.figure(figsize=(10, 7))
plt.title("Player weight distribution")
plt.boxplot(df["weight"])
# A vertical box plot puts the measured values on the y-axis; the x-axis only
# holds the single box, so it gets a category tick rather than a value label.
plt.xticks([1], ["Players"])
plt.ylabel("Weight")
plt.show()


In [None]:
# Box plot of player heights, leaving out the 0 placeholder used for
# missing values.
df = pd.read_sql("SELECT height FROM players;", connection)
df = df[df.height != 0]

plt.figure(figsize=(10, 7))
plt.title("Player height distribution")
plt.boxplot(df["height"])
# A vertical box plot puts the measured values on the y-axis; the x-axis only
# holds the single box, so it gets a category tick rather than a value label.
plt.xticks([1], ["Players"])
plt.ylabel("Height")
plt.show()

We can verify that there is one player with a height of 9.0. We can fix this via mean imputation, which means that her height will be replaced by the average height of the players that play in the same position as her.

In [None]:
# Mean-impute implausibly small heights (0 < height < 20) with the average
# height of the player's position.
df = pd.read_sql("SELECT * FROM players WHERE height < 20 and height > 0;", connection)

# Iterating over the result handles every matching player and also makes the
# cell a no-op when nothing matches (e.g. when it is run a second time),
# where indexing the first row would raise IndexError.
for bio_id, pos in zip(df["bioID"], df["pos"]):
    # get the average height for the player's position
    average_height = avg_heights[pos]

    # update player's height (values bound as parameters, not concatenated)
    connection.execute(
        "UPDATE players SET height = ? WHERE bioID = ?;",
        (float(average_height), bio_id),
    )

connection.commit()

In [None]:
# Box plot of player birth years, leaving out the "0000-00-00" placeholder
# (it is not a valid date and cannot be parsed).
df = pd.read_sql("SELECT birthDate FROM players;", connection)
df = df[df.birthDate != "0000-00-00"]

# convert birthdate to year
df["birthDate"] = pd.to_datetime(df["birthDate"])
df["birthDate"] = df["birthDate"].dt.year

plt.figure(figsize=(10, 7))
plt.title("Player birth year distribution")
plt.boxplot(df["birthDate"])
# A vertical box plot puts the measured values on the y-axis; the x-axis only
# holds the single box, so it gets a category tick rather than a value label.
plt.xticks([1], ["Players"])
plt.ylabel("Birth year")
plt.show()

## Inconsistent data

### Player Awards

- Check whether any award that should be given to a single player was given to two or more players.

In [None]:
# Number of recipients of each award in each year.
recipients_per_award_sql = " select count(playerID), award, year from awards_players group by award, year;"
dataframe = pd.read_sql(recipients_per_award_sql, connection)

# show the counts
print(dataframe)

However, we noticed that there's an award missing part of its title. Therefore, we'll have to fix it.

In [None]:
# Restore the truncated award title so both spellings collapse into one.
rename_award_sql = "UPDATE awards_players SET award = 'Kim Perrot Sportsmanship Award' WHERE award = 'Kim Perrot Sportsmanship';"
connection.execute(rename_award_sql)
connection.commit()

### Teams Post

- Check that, in every year, no more than 8 teams advanced to the playoffs.

In [None]:
# Years in which teams_post lists more than 8 teams (expected: none).
too_many_teams_sql = "select count(tmID) as num, year from teams_post group by year having num > 8;"
dataframe = pd.read_sql(too_many_teams_sql, connection)
print(dataframe)

- Check that, in every year, only one team won the playoffs.

In [None]:
# Teams whose finals result is 'W', ordered by year (expected: one per year).
finals_winners_sql = "select year, tmID, finals from teams where finals = 'W' order by year;"
dataframe = pd.read_sql(finals_winners_sql, connection)
print(dataframe)

### Teams

- Check that the sum of games won and lost by a team is equal to the total number of games it played

In [None]:
# Team seasons where won + lost does not add up to GP (expected: none).
games_mismatch_sql = "select year, tmID, won, lost, GP, (won + lost) as Games from teams where Games <> GP;"
dataframe = pd.read_sql(games_mismatch_sql, connection)
print(dataframe)

- Check if the sum of rebounds made by a team is equal to the sum of offensive rebounds and defensive rebounds

In [None]:
# Team seasons where the rebound total differs from offensive + defensive
# rebounds, first for the o_* columns and then for the d_* columns.
rebound_mismatch_queries = (
    "select year, tmID, o_oreb, o_dreb, o_reb, (o_oreb + o_dreb) as rebounds from teams where o_reb <> rebounds;",
    "select year, tmID, d_oreb, d_dreb, d_reb, (d_oreb + d_dreb) as rebounds from teams where d_reb <> rebounds;",
)

for position, query in enumerate(rebound_mismatch_queries):
    if position > 0:
        # separator between the two reports
        print("===============================")
    dataFrame = pd.read_sql(query, connection)
    print(dataFrame)

- Check that the number of attempts for each stat (field goals, 3-pointers, free throws, etc.) is never smaller than the number made

In [None]:
# For each made/attempted pair, list the team seasons where "made" exceeds
# "attempted" (expected: none). Each report is followed by a separator line.
made_exceeds_attempted_queries = (
    "select year, tmID from teams where o_fgm > o_fga;",
    "select year, tmID from teams where o_ftm > o_fta;",
    "select year, tmID from teams where o_3pm > o_3pa;",
    "select year, tmID from teams where d_fgm > d_fga;",
    "select year, tmID from teams where d_ftm > d_fta;",
    "select year, tmID from teams where d_3pm > d_3pa;",
)

for query in made_exceeds_attempted_queries:
    dataframe = pd.read_sql(query, connection)
    print(dataframe)
    print("===============================")


## Removing irrelevant Columns

Columns that are composed entirely of missing values can be removed. The league ID column can also be dropped, since the entire dataset belongs to the same league.

In [None]:
# Columns to remove from teams, in order: first the six tm*/opptm* rebound
# columns, then franchID and lgID.
teams_drop_statements = (
    "ALTER TABLE teams DROP COLUMN tmORB;",
    "ALTER TABLE teams DROP COLUMN tmDRB;",
    "ALTER TABLE teams DROP COLUMN tmTRB;",
    "ALTER TABLE teams DROP COLUMN opptmORB;",
    "ALTER TABLE teams DROP COLUMN opptmDRB;",
    "ALTER TABLE teams DROP COLUMN opptmTRB;",
    "ALTER TABLE teams DROP COLUMN franchID;",
    "ALTER TABLE teams DROP COLUMN lgID;",
)

for statement in teams_drop_statements:
    connection.execute(statement)

In [None]:
# Drop the first/last season columns from players.
for statement in (
    "ALTER TABLE players DROP COLUMN firstSeason;",
    "ALTER TABLE players DROP COLUMN lastSeason;",
):
    connection.execute(statement)

In [None]:
# Drop both league id columns and the series column from series_post.
for statement in (
    "ALTER TABLE series_post DROP COLUMN lgIDWinner;",
    "ALTER TABLE series_post DROP COLUMN lgIDLoser;",
    "ALTER TABLE series_post DROP COLUMN series;",
):
    connection.execute(statement)

In [None]:
# Drop the league id column from teams_post, awards_players and
# players_teams, in that order.
lgid_drop_statements = (
    "ALTER TABLE teams_post DROP COLUMN lgID;",
    "ALTER TABLE awards_players DROP COLUMN lgID;",
    "ALTER TABLE players_teams DROP COLUMN lgID;",
)

for statement in lgid_drop_statements:
    connection.execute(statement)

In [None]:
# Drop the league id column from coaches.
drop_coaches_lgid_sql = "ALTER TABLE coaches DROP COLUMN lgID;"
connection.execute(drop_coaches_lgid_sql)

# Trying models

Considering the data context we decided to develop a more complex model that considers teams as a set of players and coach.

Our goal is to make the final prediction based on the players' expected performance, estimated from their recorded statistics in previous years.

In [None]:
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression

# Build the per-player, per-season modelling table: biographical columns from
# players joined with the season statistics from players_teams.
df = pd.read_sql("select * from players join players_teams on players.bioID = players_teams.playerID;", connection)

columns = ['bioID', 'pos', 'height', 'weight', 'college', 'collegeOther',
       'birthDate', 'year', 'stint', 'tmID', 'points', 'oRebounds', 'dRebounds', 'rebounds',
       'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted',
       'fgMade', 'ftAttempted', 'ftMade', 'threeAttempted', 'threeMade']

# .copy() so the new column below is added to an independent frame rather
# than to a slice of the query result.
df = df[columns].copy()

# Number of awards a player has received up to and including a given season.
# Only award years in which the player has a players_teams row are counted.
# EXISTS is used instead of a join so that a season with several stints
# (several players_teams rows) does not multiply the award count.
# NOTE(review): `ap.year <= ?` also counts awards of the season itself --
# confirm that is intended for a feature used to predict that season's stats.
awards_query = """
    select count(*) as num_awards_player
    from awards_players ap
    where ap.playerID = ?
      and ap.year <= ?
      and exists (
          select 1
          from players_teams pt
          where pt.playerID = ap.playerID
            and pt.year = ap.year
      );
"""

# One query per distinct (player, season) pair instead of one per row.
award_counts = {}
for bio_id, season in df[["bioID", "year"]].drop_duplicates().itertuples(index=False, name=None):
    # int(): sqlite3 only binds plain Python numbers.
    result = connection.execute(awards_query, (bio_id, int(season))).fetchone()
    award_counts[(bio_id, season)] = result[0]

# add number of awards to the dataframe (as float, like the other numeric inputs)
df["num_awards_player"] = [
    float(award_counts[key]) for key in zip(df["bioID"], df["year"])
]

# extract year from birthDate
df["birthDate"] = pd.to_datetime(df["birthDate"])
df["birthDate"] = df["birthDate"].dt.year

# ids of the season-10 rows, in row order, used to label predictions later
player_ids_10 = df.loc[df["year"] == 10, "bioID"].values

In [None]:
# One-hot encode the categorical features (cast to 'category' first).
categorical_columns = ['bioID', 'pos', 'college', 'collegeOther', 'tmID']
df = df.astype({name: 'category' for name in categorical_columns})
df = pd.get_dummies(df, columns=categorical_columns)

print(df['birthDate'])

# Season 10 is held out for testing; every earlier season is training data.
test_data = df[df["year"] == 10]
train_data = df[df["year"] < 10]

# Columns the model has to predict; everything else is an input feature.
labels = ['points', 'oRebounds', 'dRebounds', 'rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade', 'ftAttempted',
       'ftMade', 'threeAttempted', 'threeMade']

inputs = [name for name in train_data.columns if name not in labels]

print(inputs)

train_inputs, train_labels = train_data[inputs].to_numpy(), train_data[labels].to_numpy()
test_inputs, test_labels = test_data[inputs].to_numpy(), test_data[labels].to_numpy()

# Standardise the inputs; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_inputs)
train_inputs = scaler.transform(train_inputs)
test_inputs = scaler.transform(test_inputs)

In [None]:
# Fit one MLP regressor per target statistic (MultiOutputRegressor clones the
# base estimator for every output column).
base_regressor = MLPRegressor(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=2000,
    batch_size=32,
    alpha=0.0001,
    solver='adam',
    verbose=10,
    random_state=21,
    tol=0.000000001,
)
model = MultiOutputRegressor(base_regressor)
model.fit(train_inputs, train_labels)

# Predict the held-out season and show the raw predictions.
predictions = model.predict(test_inputs)
print(predictions)

In [None]:
# Round every predicted statistic to a whole number, in place (each row is a
# view into `predictions`, so assigning to it updates the array).
for predicted_row in predictions:
    for column, value in enumerate(predicted_row):
        predicted_row[column] = int(round(value))

# Print predicted vs. actual values, one player after another.
for i, predicted_row in enumerate(predictions):
    print("player: ", player_ids_10[i])
    for j, predicted in enumerate(predicted_row):
        print(labels[j], ":  predicted: ", predicted, " actual: ", test_labels[i][j])
    print("\n")


In [None]:
# Try a neural-network classifier that predicts a team's `playoff` value from
# its season record.
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Remove confID and divID from teams. The drop is guarded by a look at the
# current table definition so the cell can be run more than once: dropping a
# column that is already gone raises an OperationalError.
existing_team_columns = {
    column_info[1] for column_info in connection.execute("PRAGMA table_info(teams);")
}
for column_name in ("confID", "divID"):
    if column_name in existing_team_columns:
        connection.execute(f"ALTER TABLE teams DROP COLUMN {column_name};")
connection.commit()

# get data
df = pd.read_sql("select * from teams;", connection)
print(df.columns)

# firstRound / semis / finals hold results of the playoff rounds themselves
# (the notebook looks up the champion with finals = 'W'), so keeping them as
# inputs would hand the model the answer to "did this team make the
# playoffs?". They are removed to avoid target leakage.
# NOTE(review): `rank` and `seeded` may carry similar information -- confirm
# whether they should stay.
post_season_columns = ["firstRound", "semis", "finals"]
df = df.drop(columns=post_season_columns)

# transform categorical data
categorical_columns = ["tmID", "name", "arena"]
for col in categorical_columns:
    df[col] = df[col].astype('category')

df = pd.get_dummies(df, columns=categorical_columns)

print(df.columns)

# get inputs and outputs
inputs = df.loc[:, df.columns != "playoff"].values
labels = df["playoff"].values

# split data
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, test_size=0.3, random_state=1)

# scale data (scaler fitted on the training split only)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_inputs = scaler.fit_transform(train_inputs)
test_inputs = scaler.transform(test_inputs)

# create model
model = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=2000,batch_size=32, alpha=0.0001, solver='adam', verbose=10, random_state=21, tol=0.000000001)
model.fit(train_inputs, train_labels)

# predict
predictions = model.predict(test_inputs)

# print results
print("Accuracy:", accuracy_score(test_labels, predictions))
print("Confusion Matrix:\n", confusion_matrix(test_labels, predictions))
print("Classification Report:\n", classification_report(test_labels, predictions))


In [None]:
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression

# Build the per-player, per-season modelling table: biographical columns from
# players joined with the season statistics from players_teams.
df = pd.read_sql("select * from players join players_teams on players.bioID = players_teams.playerID;", connection)

columns = ['bioID', 'pos', 'height', 'weight', 'college', 'collegeOther',
       'birthDate', 'year', 'stint', 'tmID', 'points', 'oRebounds', 'dRebounds', 'rebounds',
       'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted',
       'fgMade', 'ftAttempted', 'ftMade', 'threeAttempted', 'threeMade']

# .copy() so the new column below is added to an independent frame rather
# than to a slice of the query result.
df = df[columns].copy()

# Number of awards a player has received up to and including a given season.
# Only award years in which the player has a players_teams row are counted.
# EXISTS is used instead of a join so that a season with several stints
# (several players_teams rows) does not multiply the award count.
# NOTE(review): `ap.year <= ?` also counts awards of the season itself --
# confirm that is intended for a feature used to predict that season's stats.
awards_query = """
    select count(*) as num_awards_player
    from awards_players ap
    where ap.playerID = ?
      and ap.year <= ?
      and exists (
          select 1
          from players_teams pt
          where pt.playerID = ap.playerID
            and pt.year = ap.year
      );
"""

# One query per distinct (player, season) pair instead of one per row.
award_counts = {}
for bio_id, season in df[["bioID", "year"]].drop_duplicates().itertuples(index=False, name=None):
    # int(): sqlite3 only binds plain Python numbers.
    result = connection.execute(awards_query, (bio_id, int(season))).fetchone()
    award_counts[(bio_id, season)] = result[0]

# add number of awards to the dataframe (as float, like the other numeric inputs)
df["num_awards_player"] = [
    float(award_counts[key]) for key in zip(df["bioID"], df["year"])
]

# extract year from birthDate
df["birthDate"] = pd.to_datetime(df["birthDate"])
df["birthDate"] = df["birthDate"].dt.year

# ids of the season-11 rows, in row order
player_ids_11 = df.loc[df["year"] == 11, "bioID"].values

In [None]:
# One-hot encode the categorical features (cast to 'category' first).
categorical_columns = ['bioID', 'pos', 'college', 'collegeOther', 'tmID']
df = df.astype({name: 'category' for name in categorical_columns})
df = pd.get_dummies(df, columns=categorical_columns)

print(df)

# Season 11 rows form the test set ...
test_data = df[df["year"] == 11]

print(test_data)

# ... and every other season is used for training.
train_data = df[df["year"] != 11]

# Columns the model has to predict; everything else is an input feature.
labels = ['points', 'oRebounds', 'dRebounds', 'rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade', 'ftAttempted',
       'ftMade', 'threeAttempted', 'threeMade']

inputs = [name for name in train_data.columns if name not in labels]

train_inputs, train_labels = train_data[inputs].to_numpy(), train_data[labels].to_numpy()
test_inputs, test_labels = test_data[inputs].to_numpy(), test_data[labels].to_numpy()

# Standardise the inputs; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_inputs)
train_inputs = scaler.transform(train_inputs)
test_inputs = scaler.transform(test_inputs)