In [None]:
%matplotlib inline

import subprocess
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sb

import sqlite3
import subprocess

# Create a shell script
# with open('myscript.sh', 'w') as f:
#     f.write('cat ./database/final.sql | sqlite3 ./database/bdfinal.sql')

# Execute the script in WSL
# subprocess.run(["wsl", "./myscript.sh"], check=True,shell=True)

connection = sqlite3.connect("./database/bdfinal.sql")

## Missing Values

In [None]:
print("\n=========================================\n")

df = pd.read_sql("SELECT * FROM coaches;",connection)

null_mask = df.isnull().any(axis=1)
null_rows = df[null_mask]
print(null_rows)

print("\n=========================================\n")

df = pd.read_sql("SELECT * FROM players;",connection)

null_mask = df.isnull().any(axis=1)
null_rows = df[null_mask]
print(null_rows)

## ---//---

## get rows where pos = ""
df = pd.read_sql("SELECT * FROM players;",connection)

## iterate through rows

col_names = df.columns
for index, row in df.iterrows():
    
    ## iterate through columns
    found = False
    if row["pos"] == "":
        found = True
    elif row["height"] == 0:
        found = True
    elif row["weight"] == 0:
        found = True
    elif row["birthDate"] == "" or row["birthDate"] == "0000-00-00":
        found = True
    elif row["college"] == "":
        found = True

    if(found):
        print(row["bioID"])
    
print("\n=========================================\n")

df = pd.read_sql("SELECT * FROM players_teams;",connection)

null_mask = df.isnull().any(axis=1)
null_rows = df[null_mask]
print(null_rows)

print("\n=========================================\n")

df = pd.read_sql("SELECT * FROM series_post;",connection)

null_mask = df.isnull().any(axis=1)
null_rows = df[null_mask]
print(null_rows)

print("\n=========================================\n")

df = pd.read_sql("SELECT * FROM teams;",connection)

null_mask = df.isnull().any(axis=1)
null_rows = df[null_mask]
print(null_rows)

print("\n=========================================\n")

df = pd.read_sql("SELECT * FROM teams_post;",connection)

null_mask = df.isnull().any(axis=1)
null_rows = df[null_mask]
print(null_rows)

### Player

- After a quick glance at the data, it's easy to see that there's a certain amount of players that have many important missing/null values (college, height and weight).

In [None]:
dataframe = pd.read_sql("select bioID from players where weight = 0 or height = 0 or college = '' or pos = '';", connection)
print(dataframe)

- Lets check if any of these players with a null position are actually coaches, since the awards_players tables has a coach award and references the players table.

In [None]:
# select count(*) from players where pos = "";
# execute the query
df = pd.read_sql("select count(*) from players where pos = '';",connection)
num_players = df.values[0][0]
print(num_players)

# select count(*) from players where pos = "" and bioID in (select coachID from coaches);
# execute the query
df = pd.read_sql("select count(*) from players where pos = '' and bioID in (select coachID from coaches);",connection)
num_players = df.values[0][0]
print(num_players)


- 52 out of the 78 players without position are coaches

- From these 208 players, it's important to see which actually were a part of a team

In [None]:
active_missing_values_players = pd.read_sql("select distinct(bioID), weight, height, pos from players where (weight = 0 or height = 0) and pos <> ''", connection)
print(active_missing_values_players)

- Regarding these 83 players, if a player doesn't have their position missing, we decided to replace their missing weight and/or height values with the average value of the players of their same position. 

    - Obtain the average weight and height for each player position:

In [None]:
# query para cada valor
avg_pos_weights = pd.read_sql("select pos, avg(weight) from players where weight <> 0 group by pos;", connection)
print(avg_pos_weights)
avg_pos_heights = pd.read_sql("select pos, avg(height) from players where height <> 0 group by pos;", connection)
print(avg_pos_heights)


- Store the values in two dictionaries, where the key values are the players' positions

In [None]:
# add to a dictionary where the key are the positions and the values are the avg weights
avg_weights = {}

for index, row in avg_pos_weights.iterrows():
    avg_weights[row["pos"]] = row["avg(weight)"]
    
print(avg_weights)

avg_heights = {}

for index, row in avg_pos_heights.iterrows():
    avg_heights[row["pos"]] = row["avg(height)"]
    
print(avg_heights)

In [None]:

for index, row in active_missing_values_players.iterrows():
    player = pd.read_sql("select * from players where bioID = '" + row["bioID"] + "';", connection)
    
    pos = player["pos"].values[0]
    if pos == '':
        continue
    
    if(player["weight"] != 0 and player["height"] != 0):
        # print("Player already has values")
        # print(player)
        continue
    
    print(player)
    
    print("\n===\n")

    ## get average values for the player's position pos
    if(player["weight"].values[0] == 0):
        weight = avg_weights[pos]
    else:
        weight = player["weight"].values[0]
    if (player["height"].values[0] == 0):
        height = avg_heights[pos]
    else:
        height = player["height"].values[0]

    
    
    ## get row index
    pos = row.index[0]
    
    # update player's height and weight
    print("UPDATE players SET height = '" + str(height) + "', weight = '" + str(weight) + "' WHERE bioID = '" + player["bioID"].values[0] + "';")
    
    # update player's height and weight
    connection.execute("UPDATE players SET height = " + str(height) + ", weight = " + str(weight) + " WHERE bioID = '" + player["bioID"].values[0] + "';")
    connection.commit()
    

# Outliers

Now lets check if there are any outliers in the data

## Player

In this table we will be looking for outliers in weight, height and birth dates.

In [None]:
# graph with player weight distribution
df = pd.read_sql("SELECT weight FROM players;",connection)
df = df[df.weight != 0]

plt.figure(figsize=(10, 7))
plt.title("Player weight distribution")
plt.xlabel("Weight")
plt.ylabel("Number of players")
plt.boxplot(df["weight"])
plt.show()


In [None]:
# graph with player height distribution
df = pd.read_sql("SELECT height FROM players;",connection)
df = df[df.height != 0]

plt.figure(figsize=(10, 7))
plt.title("Player height distribution")
plt.xlabel("height")
plt.ylabel("Number of players")
plt.boxplot(df["height"])
plt.show()

We can verify that there is one player with a height of 9.0. We can fix this via mean imputation, which means that her height will be replaced by the average height of the players that play in the same position as her.

In [None]:
# select player with height < 20
df = pd.read_sql("SELECT * FROM players WHERE height < 20 and height > 0;",connection)

# get the average height for the player's position
average_height = avg_heights[df["pos"].values[0]]

# update player's height
connection.execute("UPDATE players SET height = " + str(average_height) + " WHERE bioID = '" + df["bioID"].values[0] + "';")
connection.commit()

In [None]:
# graph with player birth year distribution
df = pd.read_sql("SELECT birthDate FROM players;",connection)
df = df[df.birthDate != "0000-00-00"]

#convert birthdate to year
df["birthDate"] = pd.to_datetime(df["birthDate"])
df["birthDate"] = df["birthDate"].dt.year

plt.figure(figsize=(10, 7))
plt.title("Player birth year distribution")
plt.xlabel("Birth year")
plt.ylabel("Number of players")
plt.boxplot(df["birthDate"])
plt.show()

## Inconsistent data

### Player Awards

- Check if there's any award, that should be given to one player, is given to two or more players.

In [None]:
dataframe = pd.read_sql(" select count(playerID), award, year from awards_players group by award, year;", connection)

# print rows 
print(dataframe)

However, we noticed that there's an award missing part of its title. Therefore, we'll have to fix it.

In [None]:
connection.execute("UPDATE awards_players SET award = 'Kim Perrot Sportsmanship Award' WHERE award = 'Kim Perrot Sportsmanship';")
connection.commit()

### Teams Post

- Check if, in any year, no more than 8 teams passed to the playoffs.

In [None]:
dataframe = pd.read_sql("select count(tmID) as num, year from teams_post group by year having num > 8;", connection)
print(dataframe)

- Check if, in any year, only one team won the playoff.

In [None]:
dataframe = pd.read_sql("select year, tmID, finals from teams where finals = 'W' order by year;", connection)
print(dataframe)

### Teams

- Check if the sum of games won and lost by a player is equal to the total games played by a team

In [None]:
dataframe = pd.read_sql("select year, tmID, won, lost, GP, (won + lost) as Games from teams where Games <> GP;", connection)
print(dataframe)

- Check if the sum of rebounds made by a team is equal to the sum of offensive rebounds and defensive rebounds

In [None]:
dataFrame = pd.read_sql("select year, tmID, o_oreb, o_dreb, o_reb, (o_oreb + o_dreb) as rebounds from teams where o_reb <> rebounds;", connection)
print(dataFrame)
print("===============================")

dataFrame = pd.read_sql("select year, tmID, d_oreb, d_dreb, d_reb, (d_oreb + d_dreb) as rebounds from teams where d_reb <> rebounds;", connection)
print(dataFrame)

- Check if the stats (field goals, 3 pointers, free throws, etc.) attempted are in a bigger quantity than the stats made

In [None]:
dataframe = pd.read_sql("select year, tmID from teams where o_fgm > o_fga;", connection)
print(dataframe)
print("===============================")

dataframe = pd.read_sql("select year, tmID from teams where o_ftm > o_fta;", connection)
print(dataframe)
print("===============================")

dataframe = pd.read_sql("select year, tmID from teams where o_3pm > o_3pa;", connection)
print(dataframe)
print("===============================")

dataframe = pd.read_sql("select year, tmID from teams where d_fgm > d_fga;", connection)
print(dataframe)
print("===============================")

dataframe = pd.read_sql("select year, tmID from teams where d_ftm > d_fta;", connection)
print(dataframe)
print("===============================")

dataframe = pd.read_sql("select year, tmID from teams where d_3pm > d_3pa;", connection)
print(dataframe)
print("===============================")


## Removing irrelevant Columns

In [None]:
# remove tmORB, tmDRB, tmTRB, opptmORB, opptmDRB, opptmTRB from teams using a query
connection.execute("ALTER TABLE teams DROP COLUMN tmORB;")
connection.execute("ALTER TABLE teams DROP COLUMN tmDRB;")
connection.execute("ALTER TABLE teams DROP COLUMN tmTRB;")
connection.execute("ALTER TABLE teams DROP COLUMN opptmORB;")
connection.execute("ALTER TABLE teams DROP COLUMN opptmDRB;")
connection.execute("ALTER TABLE teams DROP COLUMN opptmTRB;")

# remove franchID and lgID from teams using a query
connection.execute("ALTER TABLE teams DROP COLUMN franchID;")
connection.execute("ALTER TABLE teams DROP COLUMN lgID;")

In [None]:
# remove firstSeason and lastSeason from players using a query
connection.execute("ALTER TABLE players DROP COLUMN firstSeason;")
connection.execute("ALTER TABLE players DROP COLUMN lastSeason;")

In [None]:
#remove lgIDWinner, lgIDLoser and series from series_post using a query
connection.execute("ALTER TABLE series_post DROP COLUMN lgIDWinner;")
connection.execute("ALTER TABLE series_post DROP COLUMN lgIDLoser;")
connection.execute("ALTER TABLE series_post DROP COLUMN series;")

In [None]:
#remove lgID from teams_post using a query
connection.execute("ALTER TABLE teams_post DROP COLUMN lgID;")

#remove lgID from awards_players using a query
connection.execute("ALTER TABLE awards_players DROP COLUMN lgID;")

#remove lgID from players_teams using a query
connection.execute("ALTER TABLE players_teams DROP COLUMN lgID;")

In [None]:
#remove lgID from coaches using a query
connection.execute("ALTER TABLE coaches DROP COLUMN lgID;")

# Trying models

In [None]:
## create an empty dataframe without any column names, indices or data
df = pd.DataFrame()

## obtain the team IDS and year
team_ids = pd.read_sql("select tmID, year, confID from teams order by tmID", connection)



## iterate through each team and year
for index, row in team_ids.iterrows():

    ## obtain the team ID and year for each row
    team_id = row['tmID']
    year = row['year']
    confID = row['confID']

    # get all players from the team and year
    query = "select tmID, year, playerID from players_teams where tmID = '" + team_id + "' and year = " + str(year) + ";"

    ## obtain the players for each team
    team_players = pd.read_sql(query, connection)
    
    team_stats = {"year": year, "points": 0, "oRebounds": 0, "dRebounds": 0, "rebounds": 0,
                  "assists": 0, "steals": 0, "blocks": 0, "turnovers": 0, "PF": 0, "fgAttempted": 0,
                  "fgMade": 0, "ftAttempted": 0, "ftMade": 0, "threeAttempted": 0, "threeMade": 0,
                  'weight': 0, 'height': 0, "player_awards": 0, "confID": confID, "num_playoffs": 0,
                  "num_semis": 0, "num_finals": 0, "coach_win_ratio": 0, "coach_stint": 0,
                  "playoff": 0}

    # iterate through each player
    for idx, r in team_players.iterrows():
        ## obtain the player ID for each row
        player_id = r['playerID']

        # get the player's position
        query = "select pos from players where bioID = '" + player_id + "';"
        pos = pd.read_sql(query, connection).values[0][0]

        # get number of years played
        query = "select count(distinct year) as num_years from players_teams where playerID = '" + player_id + "' and year < " + str(year) + ";"
        num_years = pd.read_sql(query, connection).values[0][0]

        if num_years == 0:
            query = "select avg(points) as points, avg(oRebounds) as oRebounds, avg(dRebounds) as dRebounds, avg(rebounds) as rebounds, avg(assists) as assists, \
                avg(steals) as steals, avg(blocks) as blocks, avg(turnovers) as turnovers, avg(PF) as PF, avg(fgAttempted) as fgAttempted, \
                avg(fgMade) as fgMade, avg(ftAttempted) as ftAttempted, avg(ftMade) as ftMade, avg(threeAttempted) as threeAttempted, avg(threeMade) as threeMade \
                from players_teams join players on players_teams.playerID = players.bioID where year = " + str(year - 1) + " \
                and pos = '" + pos + "' and playerID not in (select playerID from players_teams where year < " + str(year - 1) + ");"
            player_stats = pd.read_sql(query, connection)

            ## add the player stats to the team stats
            team_stats['points'] += (player_stats['points'].values[0] or 0 )
            team_stats['oRebounds'] += (player_stats['oRebounds'].values[0] or 0 )
            team_stats['dRebounds'] += (player_stats['dRebounds'].values[0] or 0 )
            team_stats['rebounds'] += (player_stats['rebounds'].values[0] or 0 )
            team_stats['assists'] += (player_stats['assists'].values[0] or 0 )
            team_stats['steals'] += (player_stats['steals'].values[0] or 0 )
            team_stats['blocks'] += (player_stats['blocks'].values[0] or 0 )
            team_stats['turnovers'] += (player_stats['turnovers'].values[0] or 0 )
            team_stats['PF'] += (player_stats['PF'].values[0] or 0 )
            team_stats['fgAttempted'] += (player_stats['fgAttempted'].values[0] or 0 )
            team_stats['fgMade'] += (player_stats['fgMade'].values[0] or 0 )
            team_stats['ftAttempted'] += (player_stats['ftAttempted'].values[0] or 0 )
            team_stats['ftMade'] += (player_stats['ftMade'].values[0] or 0 )
            team_stats['threeAttempted'] += (player_stats['threeAttempted'].values[0] or 0 )
            team_stats['threeMade'] += (player_stats['threeMade'].values[0] or 0 )
            
        else:

            query = "select year, points, oRebounds, dRebounds, rebounds, assists, \
                    steals, blocks, turnovers, PF, fgAttempted, \
                    fgMade, ftAttempted, ftMade, threeAttempted, sum(threeMade) as threeMade \
                    from players_teams where playerID = '" + player_id + "' and year < " + str(year) + ";"
        
            ## obtain the average stats for each team
            player_stats = pd.read_sql(query, connection)

            points = oRebounds = dRebounds = rebounds = assists = steals = blocks = turnovers = PF = fgAttempted = fgMade = ftAttempted = ftMade = threeAttempted = threeMade = 0
            sum_weight = 0
            # iterate player stats and make weighted average for each year
            for j, row2 in player_stats.iterrows():

                iteration_year = row2['year']

                weight = 1 / (year - iteration_year)

                sum_weight += weight

                points += row2['points'] * weight
                oRebounds += row2['oRebounds'] * weight
                dRebounds += row2['dRebounds'] * weight
                rebounds += row2['rebounds'] * weight
                assists += row2['assists'] * weight
                steals += row2['steals'] * weight
                blocks += row2['blocks'] * weight
                turnovers += row2['turnovers'] * weight
                PF += row2['PF'] * weight
                fgAttempted += row2['fgAttempted'] * weight
                fgMade += row2['fgMade'] * weight
                ftAttempted += row2['ftAttempted'] * weight
                ftMade += row2['ftMade'] * weight
                threeAttempted += row2['threeAttempted'] * weight
                threeMade += row2['threeMade'] * weight

            # add the player stats to the team stats
            team_stats['points'] += points / sum_weight
            team_stats['oRebounds'] += oRebounds / sum_weight
            team_stats['dRebounds'] += dRebounds / sum_weight
            team_stats['rebounds'] += rebounds / sum_weight
            team_stats['assists'] += assists / sum_weight
            team_stats['steals'] += steals / sum_weight
            team_stats['blocks'] += blocks / sum_weight
            team_stats['turnovers'] += turnovers / sum_weight
            team_stats['PF'] += PF / sum_weight
            team_stats['fgAttempted'] += fgAttempted / sum_weight
            team_stats['fgMade'] += fgMade / sum_weight
            team_stats['ftAttempted'] += ftAttempted / sum_weight
            team_stats['ftMade'] += ftMade / sum_weight
            team_stats['threeAttempted'] += threeAttempted / sum_weight
            team_stats['threeMade'] += threeMade / sum_weight
        
        num_years = max(num_years, 1)


        # get each player num awards
        query = "select count(award) as num_awards_player from awards_players ap join players_teams pt on ap.year = pt.year \
                and ap.playerID = pt.playerID where ap.playerID = '" + player_id + "' and ap.year < " + str(year) + ";"
        num_awards_player = pd.read_sql(query, connection).values[0][0]
        team_stats['player_awards'] += num_awards_player

        # get the number of times the player went to the playoffs
        query = "select count(*) as num_playoffs from teams join players_teams on teams.year = players_teams.year and teams.tmID = players_teams.tmID\
            where playerID = '" + player_id + "' and teams.year < " + str(year) + " and firstRound <> '';"
        num_playoffs = pd.read_sql(query, connection).values[0][0]
        team_stats['num_playoffs'] += num_playoffs

        # get the number of times the player went to the semis
        query = "select count(*) as num_playoffs from teams join players_teams on teams.year = players_teams.year and teams.tmID = players_teams.tmID\
            where playerID = '" + player_id + "' and teams.year < " + str(year) + " and semis <> '';"
        num_semis = pd.read_sql(query, connection).values[0][0]
        team_stats['num_semis'] += num_semis

        # get the number of times the player went to the finals
        query = "select count(*) as num_playoffs from teams join players_teams on teams.year = players_teams.year and teams.tmID = players_teams.tmID\
            where playerID = '" + player_id + "' and teams.year < " + str(year) + " and finals <> '';"
        num_finals = pd.read_sql(query, connection).values[0][0]
        team_stats['num_finals'] += num_finals
    
    
    query = "select avg(weight), avg(height) from players_teams join players on players.bioID = players_teams.playerID \
        where tmID = '" + team_id + "' and year = " + str(year) + ";"
        
    ## obtain the average weight and height for each team
    team_weight_height = pd.read_sql(query, connection)
    
    ## add the average weight and height to the team stats
    team_stats['weight'] = team_weight_height['avg(weight)'].values[0]
    team_stats['height'] = team_weight_height['avg(height)'].values[0]

    # get team coach
    query = "select coachID, stint from coaches where tmID = '" + team_id + "' and year = "+ str(year) +";"
    coach_id = pd.read_sql(query, connection).values[0][0]
    stint = pd.read_sql(query, connection).values[0][1]
    team_stats['coach_stint'] = stint
    
    
    query = "select count(award) as num_awards from coaches join awards_players \
              on coaches.coachID = awards_players.playerID and coaches.year = awards_players.year \
              where coachID = '" + coach_id + "' and coaches.year < " + str(year) + ";"
    coach_awards = pd.read_sql(query, connection).values[0][0]
    team_stats['coach_awards'] = coach_awards

    # get average wins and losses from coach
    query = "select avg(won), avg(lost) from coaches where coachID = '" + coach_id + "' and year < " + str(year) + ";"
    coach_wins_losses = pd.read_sql(query, connection)
    num_matches_coach = (coach_wins_losses['avg(won)'].values[0] or 0) + (coach_wins_losses['avg(lost)'].values[0] or 0)
    if num_matches_coach == 0:
        team_stats['coach_win_ratio'] = 0
    else:
        team_stats['coach_win_ratio'] = (coach_wins_losses['avg(won)'].values[0] or 0) / num_matches_coach
        
        
    query = "select playoff from teams where tmID = '" + team_id + "' and year = " + str(year) + ";"
    playoff = pd.read_sql(query, connection)
    
    # check if it's Y or N
    if playoff['playoff'][0] == 'Y':
        team_stats['playoff'] = 1
    else:
        team_stats['playoff'] = 0
        
    
    # append the stats to the dataframe
    df = df._append(team_stats, ignore_index=True)

    print("Team id: " + team_id)
    print(team_stats)

In [None]:
categorical_columns = ["confID"]
for col in categorical_columns:
    df[col] = df[col].astype('category')

df= pd.get_dummies(df, columns=categorical_columns)

In [None]:
target_year = 10

# # get all rows from df where year = target year
test_data = df.loc[df["year"] == target_year]

# # get all rows from df where year <> target_year and year <> 1
train_data = df.loc[df["year"] < target_year]
train_data = train_data.loc[train_data["year"] != 1]

labels = ['playoff']

inputs = []

for col in df.columns:
    if col not in labels:
        inputs.append(col)

train_inputs = train_data[inputs].values
train_labels = train_data[labels].values

test_inputs = test_data[inputs].values
test_labels = test_data[labels].values

print(train_inputs)
print(train_labels)

print(test_inputs)
print(test_labels)

In [None]:
# min max scaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(train_inputs)

train_inputs = scaler.transform(train_inputs)
test_inputs = scaler.transform(test_inputs)

In [None]:
# logistic regression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg.fit(train_inputs,train_labels)

y_pred = logreg.predict(test_inputs)
#print metrics
print("Accuracy:",metrics.accuracy_score(test_labels,y_pred))
print("Precision:",metrics.precision_score(test_labels, y_pred))
print("Recall:",metrics.recall_score(test_labels, y_pred))
print("F1:",metrics.f1_score(test_labels, y_pred))

feature_importance = abs(logreg.coef_[0])
feature_importance = sorted(zip(inputs, feature_importance), key=lambda x: x[1], reverse=True)

print("Feature importance:")
for i in range(len(feature_importance)):
    print(f"{feature_importance[i][0]}: {feature_importance[i][1]}")

# confusion matrix
confusion = confusion_matrix(test_labels, y_pred)
print(f"Confusion matrix:\n{confusion}")

#plot confusion matrix
plt.figure(figsize=(5,5))
sb.heatmap(confusion, annot=True, fmt="g", linewidths=.5, square = True, xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
all_sample_title = 'Passed to the playoffs?'
plt.title(all_sample_title, size = 10)
plt.show()

# roc curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

logit_roc_auc = roc_auc_score(test_labels, logreg.predict(test_inputs))
fpr, tpr, thresholds = roc_curve(test_labels, logreg.predict_proba(test_inputs)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()


query = "select tmId, confID from teams where year = " + str(target_year) + ";"
teams_conf_ids = pd.read_sql(query, connection)

query = "select tmID from teams where year = " + str(target_year) + ";"

# print(teams_conf_ids)

west_teams = teams_conf_ids[teams_conf_ids["confID"] == "WE"]
east_teams = teams_conf_ids[teams_conf_ids["confID"] == "EA"]

# print(west_teams)
# print(east_teams)

# print the probabilities for each class
probs = logreg.predict_proba(test_inputs)
probs_west = []
probs_east = []

query = "select tmID from teams where year = " + str(target_year) + ";"
team_ids = pd.read_sql(query, connection)
team_ids = team_ids["tmID"].values

for i in range(len(probs)):
    team_id = team_ids[i]
    
    if team_id in west_teams["tmID"].values:
        probs_west.append((team_id, probs[i][1]))
        
    if team_id in east_teams["tmID"].values:
        probs_east.append((team_id, probs[i][1]))
    
    # print(f"{team_name[0]}: {probs[i][1]}")

probs_west = sorted(probs_west, key=lambda x: x[1], reverse=True)
probs_east = sorted(probs_east, key=lambda x: x[1], reverse=True)

print(probs_west)
print(probs_east)

In [None]:
# neural network

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=2000, alpha=0.0001, solver='adam', verbose=0, random_state=21, tol=0.000000001)
mlp.fit(train_inputs,train_labels)

y_pred = mlp.predict(test_inputs)
#print metrics
print("Accuracy:",metrics.accuracy_score(test_labels,y_pred))
print("Precision:",metrics.precision_score(test_labels, y_pred))

print("Recall:",metrics.recall_score(test_labels, y_pred))
print("F1:",metrics.f1_score(test_labels, y_pred))

# confusion matrix
confusion = confusion_matrix(test_labels, y_pred)
print(f"Confusion matrix:\n{confusion}")

#plot confusion matrix
plt.figure(figsize=(5,5))
sb.heatmap(confusion, annot=True, fmt="g", linewidths=.5, square = True, xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
all_sample_title = 'Passed to the playoffs?'
plt.title(all_sample_title, size = 10)
plt.show()

# print the probabilities for each class
probs = mlp.predict_proba(test_inputs)
probs_west = []
probs_east = []

for i in range(len(probs)):
    team_id = team_ids[i]
    
    if team_id in west_teams["tmID"].values:
        probs_west.append((team_id, probs[i][1]))
        
    if team_id in east_teams["tmID"].values:
        probs_east.append((team_id, probs[i][1]))
    
    # print(f"{team_name[0]}: {probs[i][1]}")
    
probs_west = sorted(probs_west, key=lambda x: x[1], reverse=True)
probs_east = sorted(probs_east, key=lambda x: x[1], reverse=True)

print(probs_west)
print(probs_east)

In [None]:
#standardize data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_inputs)

train_inputs = scaler.transform(train_inputs)
test_inputs = scaler.transform(test_inputs)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
knn.fit(train_inputs,train_labels)

y_pred = knn.predict(test_inputs)

#print metrics
print("Accuracy:",metrics.accuracy_score(test_labels,y_pred))
print("Precision:",metrics.precision_score(test_labels, y_pred))
print("Recall:",metrics.recall_score(test_labels, y_pred))
print("F1:",metrics.f1_score(test_labels, y_pred))

# confusion matrix
confusion = confusion_matrix(test_labels, y_pred)
print(f"Confusion matrix:\n{confusion}")

#plot confusion matrix
plt.figure(figsize=(5,5))
sb.heatmap(confusion, annot=True, fmt="g", linewidths=.5, square = True, xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
all_sample_title = 'Passed to the playoffs?'

plt.title(all_sample_title, size = 10)
plt.show()

# print the probabilities for each class
probs = knn.predict_proba(test_inputs)
probs_west = []
probs_east = []

for i in range(len(probs)):
    
    team_id = team_ids[i]
    
    if team_id in west_teams["tmID"].values:
        probs_west.append((team_id, probs[i][1]))
        
    if team_id in east_teams["tmID"].values:
        probs_east.append((team_id, probs[i][1]))
    
    # print(f"{team_name[0]}: {probs[i][1]}")
    
probs_west = sorted(probs_west, key=lambda x: x[1], reverse=True)
probs_east = sorted(probs_east, key=lambda x: x[1], reverse=True)

print(probs_west)
print(probs_east)

In [None]:
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression

# remove confID and divID from teams using a query
# connection.execute("ALTER TABLE teams DROP COLUMN confID;")
# connection.execute("ALTER TABLE teams DROP COLUMN divID;")
# connection.commit()

# get data
df = pd.read_sql("select * from players join players_teams on players.bioID = players_teams.playerID;", connection)

columns = ['bioID', 'pos', 'height', 'weight', 'college', 'collegeOther',
       'birthDate', 'year', 'stINTEGER', 'tmID', 'points', 'oRebounds', 'dRebounds', 'rebounds',
       'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted',
       'fgMade', 'ftAttempted', 'ftMade', 'threeAttempted', 'threeMade']

df = df[columns]

# get bioID and year from the dataframe
bioID = df["bioID"].values
year = df["year"].values

iterable = zip(bioID, year)

# iterate through the (bioID, year) pairs

for bioID, year in iterable:
        # get number of awards for the player in the team in the year
        query = "select count(award) as num_awards_player from awards_players ap join players_teams pt on ap.year = pt.year \
                and ap.playerID = pt.playerID where ap.playerID = '" + bioID + "' and ap.year <= " + str(year) + ";"
                
        player_awards = pd.read_sql(query, connection)
        
        # if(player_awards["num_awards_player"].values[0] > 0):
        #         print(bioID, year, player_awards)
                
        # add number of awards to the dataframe
        df.loc[(df["bioID"] == bioID) & (df["year"] == year), "num_awards_player"] = player_awards["num_awards_player"].values[0]
     
# extract year from birthDate
df["birthDate"] = pd.to_datetime(df["birthDate"])
df["birthDate"] = df["birthDate"].dt.year

player_ids_10 = df[df["year"] == 10]["bioID"].values

In [38]:
# transform categorical data
categorical_columns = ['bioID', 'pos', 'college', 'collegeOther', 'tmID']
for col in categorical_columns:
    df[col] = df[col].astype('category')
    
df = pd.get_dummies(df, columns=categorical_columns)

print(df['birthDate'])

# # get all rows from df where year = 10
test_data = df.loc[df["year"] == 10]

# # get all rows from df where year <> 10
train_data = df.loc[df["year"] != 10]


labels = ['points', 'oRebounds', 'dRebounds', 'rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade', 'ftAttempted',
       'ftMade', 'threeAttempted', 'threeMade']

inputs = []

for col in train_data.columns:
    if col not in labels:
        inputs.append(col)

print(inputs)

train_inputs = train_data[inputs].values
train_labels = train_data[labels].values

test_inputs = test_data[inputs].values
test_labels = test_data[labels].values

# scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_inputs = scaler.fit_transform(train_inputs)
test_inputs = scaler.transform(test_inputs)

KeyError: 'bioID'

In [None]:
# train model
model = MultiOutputRegressor(MLPRegressor(hidden_layer_sizes=(100, 100, 100), max_iter=2000,batch_size=32, alpha=0.0001, solver='adam', verbose=10, random_state=21, tol=0.000000001))
model.fit(train_inputs, train_labels)

# test model
predictions = model.predict(test_inputs)
print(predictions)

In [None]:
# convert all predictions to integers
for i in range(len(predictions)):
    for j in range(len(predictions[i])):
        predictions[i][j] = int(round(predictions[i][j]))

for i in range(len(predictions)):
    # print("player: ", i)
    print("player: ", player_ids_10[i])
    for j in range(len(predictions[i])):
        print(labels[j], ":  predicted: ", predictions[i][j], " actual: ", test_labels[i][j])
    print("\n")


In [None]:
# try neuronal model 
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# remove confID and divID from teams using a query
connection.execute("ALTER TABLE teams DROP COLUMN confID;")
connection.execute("ALTER TABLE teams DROP COLUMN divID;")
connection.commit()

# get data
df = pd.read_sql("select * from teams;", connection)
print(df.columns)

# transform categorical data
categorical_columns = ["tmID", "firstRound", "semis", "finals","name","arena"]
for col in categorical_columns:
    df[col] = df[col].astype('category')
    
df = pd.get_dummies(df, columns=categorical_columns)

print(df.columns)

# get inputs and outputs
inputs = df.loc[:, df.columns != "playoff"].values
labels = df["playoff"].values

# split data
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, test_size=0.3, random_state=1)

# scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_inputs = scaler.fit_transform(train_inputs)
test_inputs = scaler.transform(test_inputs)

# create model
model = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=2000,batch_size=32, alpha=0.0001, solver='adam', verbose=10, random_state=21, tol=0.000000001)
model.fit(train_inputs, train_labels)

# predict
predictions = model.predict(test_inputs)

# print results
print("Accuracy:", accuracy_score(test_labels, predictions))
print("Confusion Matrix:\n", confusion_matrix(test_labels, predictions))
print("Classification Report:\n", classification_report(test_labels, predictions))


## Season 11 

The first step in this part is adding the season 11 data to the database

In [None]:
# add the season 11 data to the database

# get the data from the csv file
df = pd.read_csv("./season11/coaches.csv")

# iterate through each row and add the data to the database
for index, row in df.iterrows():
        
    # add the row to the database
    connection.execute("INSERT INTO coaches (coachID, year, tmID, stint) VALUES ('" + row[0] + "', '" + str(row[1]) + "', '" + row[2] +  "', '" + str(row[4]) + "');")
    connection.commit()


In [None]:
# add the season 11 data to the database

# get the data from the csv file
df = pd.read_csv("./season11/teams.csv")

# iterate through each row and add the data to the database
for index, row in df.iterrows():
        
    # add the row to the database
    connection.execute("INSERT INTO teams (year, tmID, confID, name, arena) VALUES ('" + str(row[0]) + "', '" + row[2] + "', '" + row[4] +  "', '" + row[5] + "', '" + row[6] + "');")
    connection.commit()

In [None]:
# add the season 11 data to the database

# get the data from the csv file
df = pd.read_csv("./season11/players_teams.csv")

# iterate through each row and add the data to the database
for index, row in df.iterrows():
        
    # add the row to the database
    connection.execute("INSERT INTO players_teams (playerID, year, stINTEGER, tmID) VALUES ('" + row[0] + "', '" + str(row[1]) + "', '" + str(row[2]) +  "', '" + row[3] + "');")
    connection.commit()

In [50]:
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression

# remove confID and divID from teams using a query
# connection.execute("ALTER TABLE teams DROP COLUMN confID;")
# connection.execute("ALTER TABLE teams DROP COLUMN divID;")
# connection.commit()

# get data
df = pd.read_sql("select * from players join players_teams on players.bioID = players_teams.playerID;", connection)

columns = ['bioID', 'pos', 'height', 'weight', 'college', 'collegeOther',
       'birthDate', 'year', 'stINTEGER', 'tmID', 'points', 'oRebounds', 'dRebounds', 'rebounds',
       'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted',
       'fgMade', 'ftAttempted', 'ftMade', 'threeAttempted', 'threeMade']

df = df[columns]

# get bioID and year from the dataframe
bioID = df["bioID"].values
year = df["year"].values

iterable = zip(bioID, year)

# iterate through the (bioID, year) pairs

for bioID, year in iterable:
        # get number of awards for the player in the team in the year
        query = "select count(award) as num_awards_player from awards_players ap join players_teams pt on ap.year = pt.year \
                and ap.playerID = pt.playerID where ap.playerID = '" + bioID + "' and ap.year <= " + str(year) + ";"
                
        player_awards = pd.read_sql(query, connection)
        
        # if(player_awards["num_awards_player"].values[0] > 0):
        #         print(bioID, year, player_awards)
                
        # add number of awards to the dataframe
        df.loc[(df["bioID"] == bioID) & (df["year"] == year), "num_awards_player"] = player_awards["num_awards_player"].values[0]
     
# extract year from birthDate
df["birthDate"] = pd.to_datetime(df["birthDate"])
df["birthDate"] = df["birthDate"].dt.year

player_ids_11 = df[df["year"] == 11]["bioID"].values

In [51]:
# transform categorical data
categorical_columns = ['bioID', 'pos', 'college', 'collegeOther', 'tmID']
for col in categorical_columns:
    df[col] = df[col].astype('category')
    
df = pd.get_dummies(df, columns=categorical_columns)

# print(df['birthDate'])

print(df)
# # get all rows from df where year = 11
test_data = df.loc[df["year"] == 11]

print(test_data)

# # get all rows from df where year <> 11
train_data = df.loc[df["year"] != 11]


labels = ['points', 'oRebounds', 'dRebounds', 'rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade', 'ftAttempted',
       'ftMade', 'threeAttempted', 'threeMade']

inputs = []

for col in train_data.columns:
    if col not in labels:
        inputs.append(col)

# print(inputs)

train_inputs = train_data[inputs].values
train_labels = train_data[labels].values

test_inputs = test_data[inputs].values
test_labels = test_data[labels].values

# scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_inputs = scaler.fit_transform(train_inputs)
test_inputs = scaler.transform(test_inputs)

      height  weight  birthDate  year  stINTEGER  points  oRebounds   
0       74.0   169.0       1980     2          0     343         43  \
1       74.0   169.0       1980     3          0     314         45   
2       74.0   169.0       1980     4          0     318         44   
3       74.0   169.0       1980     5          0     146         17   
4       74.0   169.0       1980     6          0     304         29   
...      ...     ...        ...   ...        ...     ...        ...   
1871    78.0   174.0       1977     3          2       6          0   
1872    70.0   146.0       1976     6          0      90         11   
1873    70.0   155.0       1986    10          0     406         25   
1874    69.0   145.0       1980     4          0      11          0   
1875    67.0   148.0       1986     9          0      10          1   

      dRebounds  rebounds  assists  ...  tmID_MIN  tmID_NYL  tmID_ORL   
0           131       174       53  ...      True     False     False  \
1

ValueError: Found array with 0 sample(s) (shape=(0, 720)) while a minimum of 1 is required by StandardScaler.