In [None]:
import numpy as np
import pandas as pd
import xlrd
import os
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,\
ExtraTreesClassifier, VotingClassifier, StackingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder,\
OrdinalEncoder, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score,\
accuracy_score, classification_report, r2_score, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from xgboost import XGBClassifier
import xgboost as xgb
from matplotlib import pyplot as plt
import warnings, itertools

In [None]:
# Load the NCAAW regular-season detailed results (Kaggle export) into a DataFrame.
# NOTE(review): the path is absolute and specific to one machine; it must exist for this cell to run.
season_df = pd.read_csv(
    '/Users/aheinke/Documents/Flatiron/NYC-DS-010923/Phase_3/Phase3_Proj/PROJ_CSVs/WRegularSeasonDetailedResults.csv'
)
season_df

In [None]:
# Load the NCAAW tournament detailed results into a DataFrame.
# NOTE(review): the path is absolute and specific to one machine; it must exist for this cell to run.
tourney_df = pd.read_csv(
    '/Users/aheinke/Documents/Flatiron/NYC-DS-010923/Phase_3/Phase3_Proj/PROJ_CSVs/WNCAATourneyDetailedResults.csv'
)
tourney_df

In [None]:
# Stack the regular-season and tournament games into one frame, then order the
# rows chronologically: by season first, then by day number within the season.
merged_df = pd.concat(objs=[season_df, tourney_df], axis=0)

# Both sort keys are ascending.
sorted_df = merged_df.sort_values(by=['Season', 'DayNum'], ascending=True)

# Discard the row labels carried over from the two source frames and renumber from 0.
games_df = sorted_df.reset_index(drop=True)
games_df

In [None]:
# Load the NCAAW tournament seeds into a DataFrame.
# NOTE(review): the path is absolute and specific to one machine; it must exist for this cell to run.
seeds_df = pd.read_csv(
    '/Users/aheinke/Documents/Flatiron/NYC-DS-010923/Phase_3/Phase3_Proj/PROJ_CSVs/WNCAATourneySeeds.csv'
)
seeds_df

In [None]:
# Pull the first run of digits out of the 'Seed' string and store it as an integer
# in 'SeedNum'. The pattern is a raw string so that `\d` reaches the regex engine
# unchanged instead of being treated as an (invalid) Python string escape.
# expand=False makes the single capture group come back as a Series.
seeds_df['SeedNum'] = seeds_df['Seed'].str.extract(r'(\d+)', expand=False).astype(int)
seeds_df

In [None]:
# Lookup table keyed by (TeamID, Season) whose value is that row's SeedNum.
# If a key occurs more than once, the last row wins.
team_season_seed_dict = {
    (team_id, season): seed_num
    for team_id, season, seed_num in zip(
        seeds_df['TeamID'], seeds_df['Season'], seeds_df['SeedNum']
    )
}
team_season_seed_dict

In [None]:
# Seed of the winning team for each game, looked up by (WTeamID, Season).
# Games whose (team, season) pair is absent from the seed table get 0.
# Iterating the two key columns directly avoids building a Series per row,
# which is what a row-wise DataFrame.apply(axis=1) does.
games_df['WSeed'] = [
    team_season_seed_dict.get(team_season_key, 0)
    for team_season_key in zip(games_df['WTeamID'], games_df['Season'])
]

In [None]:
# Seed of the losing team for each game, looked up by (LTeamID, Season).
# Games whose (team, season) pair is absent from the seed table get 0.
# Iterating the two key columns directly avoids building a Series per row,
# which is what a row-wise DataFrame.apply(axis=1) does.
games_df['LSeed'] = [
    team_season_seed_dict.get(team_season_key, 0)
    for team_season_key in zip(games_df['LTeamID'], games_df['Season'])
]

In [None]:
# Preview the first rows of games_df to check the WSeed / LSeed columns added above.
games_df.head()

In [None]:
# Count the games each team won per season: group on (Season, WTeamID) and count
# the 'DayNum' entries in each group, then expose the result as NumWins per TeamID.
total_season_wins = (
    games_df.groupby(['Season', 'WTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={'DayNum': 'NumWins', 'WTeamID': 'TeamID'})
)
total_season_wins.head()

In [None]:
# Count the games each team lost per season: group on (Season, LTeamID) and count
# the 'DayNum' entries in each group, then expose the result as NumLosses per TeamID.
total_season_losses = (
    games_df.groupby(['Season', 'LTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={'DayNum': 'NumLosses', 'LTeamID': 'TeamID'})
)
total_season_losses.head()

In [None]:
# One row per (Season, TeamID) for every team that appears as a winner ...
df_features_season_w = (
    games_df.groupby(['Season', 'WTeamID'])
    .size()
    .reset_index()[['Season', 'WTeamID']]
    .rename(columns={'WTeamID': 'TeamID'})
)
# ... and one row per (Season, TeamID) for every team that appears as a loser.
df_features_season_l = (
    games_df.groupby(['Season', 'LTeamID'])
    .size()
    .reset_index()[['Season', 'LTeamID']]
    .rename(columns={'LTeamID': 'TeamID'})
)

In [None]:
# Stack the winner-side and loser-side (Season, TeamID) frames, drop pairs that
# appear in both, and sort so there is one row per team per season.
# `axis` is passed by keyword: passing it positionally to pd.concat is deprecated
# and is rejected by newer pandas releases.
df_features_season = (
    pd.concat([df_features_season_w, df_features_season_l], axis=0)
    .drop_duplicates()
    .sort_values(['Season', 'TeamID'])
    .reset_index(drop=True)
)

In [None]:
# Attach the NumWins and NumLosses columns to each (Season, TeamID) row.
# Left joins keep every team-season row; a team missing from either count table
# gets a missing value in that column.
# NOTE(review): re-running this cell merges the count columns in again.
df_features_season = (
    df_features_season
    .merge(total_season_wins, how='left', on=['Season', 'TeamID'])
    .merge(total_season_losses, how='left', on=['Season', 'TeamID'])
)

In [None]:
# Per-game point differential from each side's perspective:
# WinMargin is WScore minus LScore, LossMargin is LScore minus WScore.
games_df['WinMargin'] = games_df['WScore'].sub(games_df['LScore'])
games_df['LossMargin'] = games_df['LScore'].sub(games_df['WScore'])
games_df

In [None]:
<blockquote class="tiktok-embed" cite="https://www.tiktok.com/@kiimmyyyyy__/video/7084699009390087466" data-video-id="7084699009390087466" style="max-width: 605px;min-width: 325px;" > <section> <a target="_blank" title="@kiimmyyyyy__" href="https://www.tiktok.com/@kiimmyyyyy__?refer=embed">@kiimmyyyyy__</a> i don’t even think he made the trash shot 😂 <a title="degrassi" target="_blank" href="https://www.tiktok.com/tag/degrassi?refer=embed">#degrassi</a> <a title="drake" target="_blank" href="https://www.tiktok.com/tag/drake?refer=embed">#drake</a> <a title="champagnepapi" target="_blank" href="https://www.tiktok.com/tag/champagnepapi?refer=embed">#champagnepapi</a> <a title="fyp" target="_blank" href="https://www.tiktok.com/tag/fyp?refer=embed">#fyp</a> <a target="_blank" title="♬ original sound - KIMMY 🦋" href="https://www.tiktok.com/music/original-sound-7084698985675377450?refer=embed">♬ original sound - KIMMY 🦋</a> </section> </blockquote> <script async src="https://www.tiktok.com/embed.js"></script>

In [None]:
Effective field goal percentage: eFG% = (FG + 0.5 * 3P) / FGA.

In [None]:
# True shooting percentage: 100 * (0.5 * Score) / (FGA + 0.475 * FTA).
# Column '1SP' is computed from the W-prefixed stats, '2SP' from the L-prefixed stats.
# NOTE(review): games_features_df is not created in the cells above this one;
# confirm it is defined before this cell runs.
games_features_df['1SP'] = (
    games_features_df['WScore'].mul(0.5)
    .div(games_features_df['WFGA'].add(games_features_df['WFTA'].mul(0.475)))
    .mul(100)
)
games_features_df['2SP'] = (
    games_features_df['LScore'].mul(0.5)
    .div(games_features_df['LFGA'].add(games_features_df['LFTA'].mul(0.475)))
    .mul(100)
)

In [None]:
# Number of Possessions
def _team_possessions(stats, team, opp):
    """Estimate one side's possessions for every game row.

    Evaluates FGA + 0.4 * FTA - 1.07 * (OR / (OR + opponent DR)) * (FGA - FGM) + TO
    using the `team`-prefixed columns of `stats` and the `opp`-prefixed 'DR' column.

    Parameters
    ----------
    stats : pandas.DataFrame
        Frame holding the prefixed box-score columns (FGA, FGM, FTA, OR, DR, TO).
    team : str
        Column prefix of the side being estimated ('W' or 'L').
    opp : str
        Column prefix of the other side.

    Returns
    -------
    pandas.Series
        One estimate per row of `stats`.
    """
    missed_field_goals = stats[team + 'FGA'] - stats[team + 'FGM']
    offensive_rebound_share = stats[team + 'OR'] / (stats[team + 'OR'] + stats[opp + 'DR'])
    return (
        stats[team + 'FGA']
        + 0.4 * stats[team + 'FTA']
        - 1.07 * offensive_rebound_share * missed_field_goals
        + stats[team + 'TO']
    )


# The formula has to be evaluated on the DataFrame columns; arithmetic on the bare
# column-name strings (e.g. 0.4 * 'WFTA') raises TypeError.
_w_side_estimate = _team_possessions(games_features_df, 'W', 'L')
_l_side_estimate = _team_possessions(games_features_df, 'L', 'W')

# Each column is the average of the two sides' estimates, so both hold the same value.
games_features_df['WPossessions'] = 0.5 * (_w_side_estimate + _l_side_estimate)
games_features_df['LPossessions'] = 0.5 * (_l_side_estimate + _w_side_estimate)

# Points Per Possession
games_features_df['WPtsPerPoss'] = games_features_df['WScore'] / games_features_df['WPossessions']
games_features_df['LPtsPerPoss'] = games_features_df['LScore'] / games_features_df['LPossessions']

# Per-game ratio metrics. Every metric is computed twice, once from the W-prefixed
# columns and once from the L-prefixed columns of `df_season_results`, and stored
# under a matching W/L-prefixed key in `sabermetrics`.
# NOTE(review): neither `sabermetrics` nor `df_season_results` is defined in the
# cells above this one; confirm both exist before this cell runs.

# Effective Field Goal Percentage
# (Score - FTM) / 2, divided by FGA.
sabermetrics['WEffectiveFGPct'] = ((df_season_results['WScore'] - df_season_results['WFTM']) / 2) / df_season_results['WFGA']
sabermetrics['LEffectiveFGPct'] = ((df_season_results['LScore'] - df_season_results['LFTM']) / 2) / df_season_results['LFGA']

# Percentage of Field Goals Assisted
# Ast divided by FGM.
sabermetrics['WAssistRate'] = df_season_results['WAst'] / df_season_results['WFGM']
sabermetrics['LAssistRate'] = df_season_results['LAst'] / df_season_results['LFGM']

# Rebound Percentage
# (OR + DR) divided by the same side's (FGA - FGM).
sabermetrics['WReboundPct'] = (df_season_results['WOR'] + df_season_results['WDR']) / (df_season_results['WFGA'] - df_season_results['WFGM'])
sabermetrics['LReboundPct'] = (df_season_results['LOR'] + df_season_results['LDR']) / (df_season_results['LFGA'] - df_season_results['LFGM'])

# Assist to Turnover Ratio
# Ast divided by TO.
sabermetrics['WATORatio'] = df_season_results['WAst'] / df_season_results['WTO']
sabermetrics['LATORatio'] = df_season_results['LAst'] / df_season_results['LTO']

# Turnover Rate
# TO divided by Possessions.
# NOTE(review): this reads 'WPossessions' / 'LPossessions' from `df_season_results`,
# but the possession columns computed earlier in this cell are written to
# `games_features_df`; confirm the two names refer to the same frame.
sabermetrics['WTORate'] = df_season_results['WTO'] / df_season_results['WPossessions']
sabermetrics['LTORate'] = df_season_results['LTO'] /  df_season_results['LPossessions']

# Percentage of Shots Beyond the Arc
# FGA3 divided by FGA.
sabermetrics['WBArcPct'] = df_season_results['WFGA3'] / df_season_results['WFGA']
sabermetrics['LBArcPct'] = df_season_results['LFGA3'] /  df_season_results['LFGA']

# Free Throw Rate
# FTA divided by FGA.
sabermetrics['WFTRate'] = df_season_results['WFTA'] / df_season_results['WFGA']
sabermetrics['LFTRate'] = df_season_results['LFTA'] /  df_season_results['LFGA']

# Block to Foul Percentage
# Blk divided by (PF + Blk).
sabermetrics['WBlockFoul'] = df_season_results['WBlk'] / (df_season_results['WPF'] + df_season_results['WBlk'])
sabermetrics['LBlockFoul'] = df_season_results['LBlk'] / (df_season_results['LPF'] + df_season_results['LBlk'])

# Steal to Foul Percentage
# Stl divided by (PF + Stl).
sabermetrics['WStealFoul'] = df_season_results['WStl'] / (df_season_results['WPF'] + df_season_results['WStl'])
sabermetrics['LStealFoul'] = df_season_results['LStl'] / (df_season_results['LPF'] + df_season_results['LStl'])