# Creating Features For My Training DataFrame 

In [4]:
import pandas as pd
import numpy as np
from functools import reduce

Years coached by the current coach(es) of each team

In [5]:
def coaches_years(file_name, season=2018):
    """Return the coaches of ``season`` together with their seasons of experience.

    Parameters
    ----------
    file_name : str
        Path to the coaches CSV, e.g. "./DataFiles_2018/TeamCoaches.csv".
        The columns ``Season``, ``TeamID``, ``FirstDayNum``, ``LastDayNum``
        and ``CoachName`` are required.
    season : int, default 2018
        Season whose coaching entries are returned. Experience is counted
        over all seasons up to and including this one.

    Returns
    -------
    pandas.DataFrame
        One row per coaching entry of ``season`` (a team appears more than
        once if the file lists several entries for it that season), with
        ``Season``, ``FirstDayNum`` and ``LastDayNum`` removed and a
        ``years_coached`` column added.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    coaches = pd.read_csv(file_name)

    # Count distinct seasons, not raw rows: a coach that is listed more than
    # once within a single season must still be credited one year for it.
    years_coached = (
        coaches[coaches["Season"] <= season]
        .groupby("CoachName")["Season"]
        .nunique()
        .rename("years_coached")
        .reset_index()
    )

    season_coaches = coaches[coaches["Season"] == season].reset_index(drop=True)
    with_experience = pd.merge(season_coaches, years_coached, on="CoachName", how="left")
    return with_experience.drop(columns=["Season", "FirstDayNum", "LastDayNum"])

In [6]:
# Build the coaching-experience table from the coaches file and write it to CSV.
coaches_years_2018 = coaches_years(
    "./DataFiles_2018/TeamCoaches.csv",
)
coaches_years_2018.to_csv(
    "coaches_years_2018.csv",
    index=False,
)


The Win/Loss record for each team in their conference tournament

In [133]:
def Conference_Tourny_WL(Conference_tourny_FP, Team_Conferences_FP, season=2018):
    """Return every team's conference-tournament win/loss counts for ``season``.

    Parameters
    ----------
    Conference_tourny_FP : str
        Path to the conference-tournament games CSV, e.g.
        "./DataFiles_2018/ConferenceTourneyGames.csv". Needs the columns
        ``Season``, ``WTeamID`` and ``LTeamID``.
    Team_Conferences_FP : str
        Path to the team/conference CSV, e.g.
        "./DataFiles_2018/TeamConferences.csv". Needs the columns ``Season``
        and ``TeamID``.
    season : int, default 2018
        Season to summarise.

    Returns
    -------
    pandas.DataFrame
        The ``season`` rows of the team/conference file with two extra
        columns, ``conference_tourny_wins`` and ``conference_tourny_losses``.
        Teams that do not appear as winner (or loser) get 0; every other
        missing value in the frame is also replaced by 0.
    """
    tourney_games = pd.read_csv(Conference_tourny_FP)
    team_conferences = pd.read_csv(Team_Conferences_FP)

    teams_in_season = team_conferences[team_conferences["Season"] == season]
    games_in_season = tourney_games[tourney_games["Season"] == season]

    def _tally(team_ids, label):
        # Appearances per team as a two-column frame (TeamID, <label>).
        # The columns are named explicitly so the result does not depend on
        # how value_counts() labels its output in a given pandas version.
        return team_ids.value_counts().rename_axis("TeamID").reset_index(name=label)

    wins = _tally(games_in_season["WTeamID"], "conference_tourny_wins")
    losses = _tally(games_in_season["LTeamID"], "conference_tourny_losses")

    # Left merges keep every team of the season, including those without a
    # tournament game; their missing counts become 0 below.
    record = teams_in_season.merge(wins, on="TeamID", how="left")
    record = record.merge(losses, on="TeamID", how="left")
    return record.fillna(0)

In [134]:
# Tally conference-tournament wins and losses per team, then write the table to CSV.
conferencest_WL = Conference_Tourny_WL(
    "./DataFiles_2018/ConferenceTourneyGames.csv",
    "./DataFiles_2018/TeamConferences.csv",
)
conferencest_WL.to_csv(
    "conference_tourny_WL.csv",
    index=False,
)

Creating a DataFrame that contains every game played in the season

In [135]:
def season_games(file_path, season=2018):
    """List every game of ``season`` twice, once from each team's point of view.

    Parameters
    ----------
    file_path : str
        Path to the compact results CSV, e.g.
        "./DataFiles_2018/RegularSeasonCompactResults.csv". Needs the columns
        ``Season``, ``WTeamID`` and ``LTeamID``.
    season : int, default 2018
        Season whose games are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, ``OppTeamID`` and ``Won`` (1 or 0). Each game
        contributes two consecutive rows: the winner's row first, then the
        loser's. A season without games yields an empty frame with the same
        three columns.
    """
    results = pd.read_csv(file_path)
    played = results[results["Season"] == season]

    winners = played["WTeamID"].to_numpy()
    losers = played["LTeamID"].to_numpy()

    # column_stack + ravel interleaves the two arrays (w0, l0, w1, l1, ...),
    # which puts the winner's row directly before the loser's row per game.
    return pd.DataFrame(
        {
            "TeamID": np.column_stack([winners, losers]).ravel(),
            "OppTeamID": np.column_stack([losers, winners]).ravel(),
            "Won": np.tile([1, 0], len(played)),
        },
        columns=["TeamID", "OppTeamID", "Won"],
    )

In [136]:
# Expand the regular-season results into one row per team per game and write them to CSV.
games_year = season_games(
    "./DataFiles_2018/RegularSeasonCompactResults.csv",
)
games_year.to_csv(
    "games2018.csv",
    index=False,
)

The season average box score stats for every team. 

In [137]:
def team_season_stats(file_path, season=2018):
    """Average every team's per-game box-score numbers over ``season``.

    Parameters
    ----------
    file_path : str
        Path to the detailed results CSV, e.g.
        "./DataFiles_2018/RegularSeasonDetailedResults.csv". Needs ``Season``,
        ``DayNum``, ``WTeamID`` and ``LTeamID``; every other column whose name
        starts with ``W`` or ``L`` (except ``WLoc``/``LLoc``) is treated as a
        numeric per-team statistic.
    season : int, default 2018
        Season to average over.

    Returns
    -------
    pandas.DataFrame
        One row per team, in order of first appearance in the file. Values are
        per-game means rounded to two decimals. ``Win%`` is the mean of a 1/0
        win flag, i.e. a fraction between 0 and 1. The fixed box-score columns
        come first (all-NaN when absent from the file), followed by any
        remaining averaged columns, which include ``DayNum`` and ``Season``.
        ``TeamID`` is returned as float, like the other averaged columns.
    """
    box_columns = ['Ast', 'Blk', 'DR', 'FGA', 'FGA3', 'FGM', 'FGM3', 'FTA', 'FTM', 'OR',
                   'PF', 'Score', 'Stl', 'TO', 'TeamID', "Win%"]

    box_scores = pd.read_csv(file_path)
    season_rows = box_scores[box_scores["Season"] == season]

    # One frame per side of the game: strip the W/L prefix so both sides share
    # column names, and tag the side with its win flag.
    sides = []
    for letter, won in (("W", 1), ("L", 0)):
        stat_columns = [
            name for name in season_rows.columns
            if name.startswith(letter) and name != letter + "Loc"  # location is not a stat
        ]
        side = season_rows[stat_columns].rename(columns=lambda name: name[1:])
        side = side.assign(
            **{"Win%": won},
            DayNum=season_rows["DayNum"],
            Season=season_rows["Season"],
        )
        sides.append(side)

    # The stable sort on the original row index puts each game's winner row
    # directly before its loser row, so the first-appearance order of teams
    # follows the order of games in the file.
    team_games = pd.concat(sides).sort_index(kind="mergesort")

    # A single groupby replaces growing a DataFrame with concat inside a loop.
    averages = (
        team_games.groupby("TeamID", sort=False)
        .mean()
        .round(2)
        .reset_index()
    )
    averages["TeamID"] = averages["TeamID"].astype(float)

    extra_columns = [name for name in averages.columns if name not in box_columns]
    return averages.reindex(columns=box_columns + extra_columns)

In [138]:
# Average the detailed box scores per team and write the result to CSV.
team_2018_stats = team_season_stats(
    "./DataFiles_2018/RegularSeasonDetailedResults.csv",
)
team_2018_stats.to_csv(
    "team_2018_detailed_stats.csv",
    index=False,
)

Getting the minimum, average and maximum rank of every team from selected ranking systems.

In [148]:
def get_ranks(file_path1, file_path2, season=2018,
              systems=("RPI", "ESR", "EBP", "AP", "USA"), unranked_fill=35):
    """Summarise each team's ordinal rank per ranking system for ``season``.

    Parameters
    ----------
    file_path1, file_path2 : str
        Paths to two ordinal-ranking CSVs that are stacked before processing,
        e.g. "./DataFiles_2018/MasseyOrdinals_thruSeason2018_Day128.csv" and
        "./DataFiles_2018/MasseyOrdinals_2018_133_only_43Systems.csv". Both
        need the columns ``Season``, ``SystemName``, ``TeamID`` and
        ``OrdinalRank``.
    season : int, default 2018
        Season to summarise.
    systems : sequence of str, default ("RPI", "ESR", "EBP", "AP", "USA")
        ``SystemName`` values to summarise, in output column order.
    unranked_fill : number, default 35
        Value written wherever a team has no rank from a system (applied to
        every missing value in the result).

    Returns
    -------
    pandas.DataFrame
        One row per team that has at least one ranking of any system in
        ``season``, in order of first appearance. Columns are ``TeamID``
        followed by ``<system>_mean`` (rounded to two decimals),
        ``<system>_min`` and ``<system>_max`` for each requested system.
    """
    rankings = pd.concat([pd.read_csv(file_path1), pd.read_csv(file_path2)])
    season_ranks = rankings[rankings["Season"] == season]
    teams = season_ranks["TeamID"].unique()

    columns = {"TeamID": teams}
    for system in systems:
        # Filter and group once per system instead of once per (team, system)
        # pair; reindex() then lines the results up with `teams` and leaves
        # NaN for teams this system never ranked.
        system_ranks = season_ranks.loc[season_ranks["SystemName"] == system]
        by_team = system_ranks.groupby("TeamID")["OrdinalRank"]
        columns[system + "_mean"] = by_team.mean().round(2).reindex(teams).to_numpy()
        columns[system + "_min"] = by_team.min().reindex(teams).to_numpy()
        columns[system + "_max"] = by_team.max().reindex(teams).to_numpy()

    return pd.DataFrame(columns).fillna(unranked_fill)

In [149]:
# Summarise the ordinal rankings per team from both ranking files and write them to CSV.
the_rankings = get_ranks(
    "./DataFiles_2018/MasseyOrdinals_thruSeason2018_Day128.csv",
    "./DataFiles_2018/MasseyOrdinals_2018_133_only_43Systems.csv",
)
the_rankings.to_csv(
    "the_rankings.csv",
    index=False,
)

Creating a DataFrame for every game played in the conference tournaments.

In [150]:
def conf_games(file_path, season=2018):
    """List every conference-tournament game of ``season`` from both teams' views.

    Parameters
    ----------
    file_path : str
        Path to the conference-tournament games CSV, e.g.
        "./DataFiles_2018/ConferenceTourneyGames.csv". Needs the columns
        ``Season``, ``WTeamID`` and ``LTeamID``.
    season : int, default 2018
        Season whose games are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID``, ``OppTeamID`` and ``Won`` (1 or 0). Each game
        contributes two consecutive rows: the winner's row first, then the
        loser's. A season without games yields an empty frame with the same
        three columns.
    """
    tourney = pd.read_csv(file_path)
    played = tourney[tourney["Season"] == season]

    winners = played["WTeamID"].to_numpy()
    losers = played["LTeamID"].to_numpy()

    # Interleave winners and losers (w0, l0, w1, l1, ...) so that the two
    # perspectives of a game sit next to each other, winner first.
    return pd.DataFrame(
        {
            "TeamID": np.column_stack([winners, losers]).ravel(),
            "OppTeamID": np.column_stack([losers, winners]).ravel(),
            "Won": np.tile([1, 0], len(played)),
        },
        columns=["TeamID", "OppTeamID", "Won"],
    )

In [151]:
# Expand the conference-tournament results into one row per team per game and write them to CSV.
conf_tourny_games = conf_games(
    "./DataFiles_2018/ConferenceTourneyGames.csv",
)
conf_tourny_games.to_csv(
    "conf_tourny_games.csv",
    index=False,
)

Getting odds into a usable DataFrame that is able to merge on TeamID. 

In [8]:
def get_odds(odds_win, odds_4, all_names_fp, teams_fp):
    """Attach tournament odds to every team so they can be merged on ``TeamID``.

    Parameters
    ----------
    odds_win : str
        Path to the odds-to-win CSV, e.g.
        "./DataFiles_2018/Odds_to_wins_tourny.csv". Needs a ``School`` column
        whose values contain a "+" separating the name from the odds.
    odds_4 : str
        Path to the odds-to-reach-the-final-four CSV, e.g.
        "./DataFiles_2018/Odd_to_final4.csv". Same ``School`` layout.
    all_names_fp : str
        Path to the team spellings CSV, e.g.
        "./DataFiles_2018/TeamSpellings.csv" (read as latin-1). Needs the
        columns ``TeamNameSpelling`` and ``TeamID``.
    teams_fp : str
        Path to the teams CSV, e.g. "./DataFiles_2018/Teams.csv". Needs
        ``TeamID``, ``FirstD1Season`` and ``LastD1Season``.

    Returns
    -------
    pandas.DataFrame
        The teams file without ``FirstD1Season``/``LastD1Season``, left-merged
        with ``odds_to_win`` and ``odds_to_final4``. Every missing value in
        the result is replaced by 100000.
    """

    team_names = pd.read_csv(all_names_fp, encoding="latin-1")
    odds_to_win = pd.read_csv(odds_win)
    odds_to_final4 = pd.read_csv(odds_4)
    all_teams = pd.read_csv(teams_fp)
    all_teams.drop(["FirstD1Season", "LastD1Season"], axis=1, inplace=True)
    
    # Split "name+odds": the text after the "+" becomes the odds column, the
    # text before it replaces School. A value without "+" raises IndexError.
    # NOTE(review): the odds stay strings here; together with the integer fill
    # at the end the odds columns look mixed-type in memory - confirm that
    # downstream code only consumes them after the CSV round trip.
    odds_to_win["odds_to_win"] = pd.DataFrame(odds_to_win["School"].map(lambda x:x.split("+")[1]))
    odds_to_win["School"] = pd.DataFrame(odds_to_win["School"].map(lambda x:x.split("+")[0]))
    odds_to_final4["odds_to_final4"] = pd.DataFrame(odds_to_final4["School"].map(lambda x:x.split("+")[1]))
    odds_to_final4["School"] = pd.DataFrame(odds_to_final4["School"].map(lambda x:x.split("+")[0]))
    
    # Combine both odds tables on School (pd.merge defaults to an inner join,
    # so a school missing from either file is dropped), then strip
    # non-breaking spaces and lowercase the names before the spelling lookup.
    Odds_tourny = pd.merge(odds_to_win,odds_to_final4,on="School")
    Odds_tourny["School"] = Odds_tourny["School"].map(lambda x:x.replace("\xa0", ""))
    Odds_tourny["School"]= Odds_tourny["School"].map(lambda x:x.lower())
    
    team_odds = pd.merge(Odds_tourny, team_names, how="left", left_on="School", right_on="TeamNameSpelling")
    # Schools without a matching spelling have no TeamID at this point; the
    # IDs below are assigned to those rows by position.
    # NOTE(review): this presumably relies on exactly four unmatched rows in
    # exactly this order for the 2018 files - any change to the input data or
    # to the name clean-up above would need this list revisited.
    filler = [1120,1274,1420,1158]
    team_odds.loc[team_odds["TeamID"].isnull(), "TeamID"] = filler
    team_odds.drop(["School", "TeamNameSpelling"], axis=1, inplace=True)
    
    # Keep every team from the teams file; teams without listed odds receive
    # the placeholder 100000 (fillna applies to all columns of the frame).
    all_team_odds = pd.merge(all_teams, team_odds, how="left", on="TeamID")
    all_team_odds.fillna(100000, inplace=True)
    
    return all_team_odds

In [9]:
# Map the tournament odds onto TeamID for every team and write the table to CSV.
team_odds = get_odds(
    "./DataFiles_2018/Odds_to_wins_tourny.csv",
    "./DataFiles_2018/Odd_to_final4.csv",
    "./DataFiles_2018/TeamSpellings.csv",
    "./DataFiles_2018/Teams.csv",
)
team_odds.to_csv(
    "team_odds.csv",
    index=False,
)