In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

In [11]:
## Load every CSV from the data directory into its own DataFrame
data_dir = './DataFiles/'


def _load_csv(filename):
    """Read a single CSV file that lives under ``data_dir``."""
    return pd.read_csv(data_dir + filename)


# Master list of cities that have been locations for games played.
df_cities = _load_csv('Cities.csv')
df_conferences = _load_csv('Conferences.csv')
df_conferencetourney = _load_csv('ConferenceTourneyGames.csv')
df_gamecities = _load_csv('GameCities.csv')

# NCAA tournament results, seeds and bracket slots
df_tourneycompact = _load_csv('NCAATourneyCompactResults.csv')
df_tourneydetailed = _load_csv('NCAATourneyDetailedResults.csv')
df_tourneyseedroundslots = _load_csv('NCAATourneySeedRoundSlots.csv')
df_tourneyseedrounds = _load_csv('NCAATourneySeeds.csv')
df_tourneyslots = _load_csv('NCAATourneySlots.csv')

# Regular-season results and season metadata
df_seasoncompact = _load_csv('RegularSeasonCompactResults.csv')
df_seasondetailed = _load_csv('RegularSeasonDetailedResults.csv')
df_seasons = _load_csv('Seasons.csv')
df_secondtourneycompact = _load_csv('SecondaryTourneyCompactResults.csv')

# Coaches, conference membership and teams
df_coaches = _load_csv('TeamCoaches.csv')
df_teamconferences = _load_csv('TeamConferences.csv')
df_teams = _load_csv('Teams.csv')
# 'TeamSpellings.csv' is intentionally not loaded: reading it hits a utf-8 encoding issue.

df_seasoncompact.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [None]:
def getSeasonData(team_id, year):
    """Build the feature vector describing one team's season.

    Game results come from the module-level ``df_seasoncompact`` frame
    (columns ``Season``, ``WTeamID``, ``WScore``, ``LTeamID``, ``LScore``).
    Season totals (3-pointers, turnovers, assists, rebounds, steals, SOS, SRS)
    are read from ``Data/MMStats/MMStats_<year>.csv``; the matching
    ``Data/RatingStats/RatingStats_<year>.csv`` file is only used to confirm
    that the school is present there as well.

    Parameters
    ----------
    team_id
        Team identifier, compared against ``WTeamID`` / ``LTeamID``.
    year
        Season, compared against the ``Season`` column and used to build the
        per-year CSV file names.

    Returns
    -------
    list
        16 entries, in this order: numWins, avgPointsScored, avgPointsAllowed,
        checkPower6Conference(team_id), avg3sMade, avgAssists, avgTurnovers,
        checkConferenceChamp(team_id, year),
        checkConferenceTourneyChamp(team_id, year), tournamentSeed, sos, srs,
        avgRebounds, avgSteals, getTourneyAppearances(team_id),
        getNumChampionships(team_id).
    """

    def _zero_if_nan(value):
        # pd.isna replaces math.isnan: `math` is never imported in this notebook.
        return 0 if pd.isna(value) else value

    # The data frame below holds stats for every single game in the given year
    df_year = df_seasoncompact[df_seasoncompact['Season'] == year]
    gamesWon = df_year[df_year['WTeamID'] == team_id]
    gamesLost = df_year[df_year['LTeamID'] == team_id]

    # Counting rows directly avoids DataFrame.append (removed in pandas 2.0);
    # the concatenated frame was only ever used for its length.
    numWins = len(gamesWon.index)
    numGames = numWins + len(gamesLost.index)

    # Points scored: winner's score in wins, loser's score in losses
    totalPointsScored = gamesWon['WScore'].sum() + gamesLost['LScore'].sum()
    # Points allowed: the opponent's score in each case
    totalPointsAllowed = gamesWon['LScore'].sum() + gamesLost['WScore'].sum()

    stats_SOS_pd = pd.read_csv('Data/MMStats/MMStats_'+str(year)+'.csv')
    stats_SOS_pd = handleDifferentCSV(stats_SOS_pd)
    ratings_pd = pd.read_csv('Data/RatingStats/RatingStats_'+str(year)+'.csv')
    ratings_pd = handleDifferentCSV(ratings_pd)

    name = getTeamName(team_id)
    team = stats_SOS_pd[stats_SOS_pd['School'] == name]
    team_rating = ratings_pd[ratings_pd['School'] == name]
    if len(team.index) == 0 or len(team_rating.index) == 0:
        # Can't find the team in one of the files: fall back to zeros
        total3sMade = 0
        totalTurnovers = 0
        totalAssists = 0
        sos = 0
        totalRebounds = 0
        srs = 0
        totalSteals = 0
    else:
        # 3-pointers, SOS and SRS are taken as-is (no NaN replacement),
        # the remaining totals have NaN replaced by 0.
        total3sMade = team['X3P'].values[0]
        totalTurnovers = _zero_if_nan(team['TOV'].values[0])
        totalAssists = _zero_if_nan(team['AST'].values[0])
        sos = team['SOS'].values[0]
        srs = team['SRS'].values[0]
        totalRebounds = _zero_if_nan(team['TRB'].values[0])
        totalSteals = _zero_if_nan(team['STL'].values[0])

    # Finding tournament seed for that year
    # NOTE(review): `tourney_seeds_pd` is not defined in the cells visible here;
    # the loading cell reads NCAATourneySeeds.csv into `df_tourneyseedrounds`.
    # Confirm which name is intended.
    tourneyYear = tourney_seeds_pd[tourney_seeds_pd['Season'] == year]
    # Accept either the 'TeamID' column name or the older 'Team' name.
    team_col = 'TeamID' if 'TeamID' in tourneyYear.columns else 'Team'
    seed_rows = tourneyYear[tourneyYear[team_col] == team_id]
    if len(seed_rows.index) != 0:
        # Look the seed up by column name when possible: a positional lookup
        # breaks as soon as the CSV carries an extra leading index column.
        if 'Seed' in seed_rows.columns:
            seed_label = seed_rows['Seed'].values[0]
        else:
            seed_label = seed_rows.values[0][1]
        # Characters 1-2 hold the numeric seed (presumably after a region
        # letter, e.g. 'W01' -- verify against the seeds file).
        tournamentSeed = int(seed_label[1:3])
    else:
        tournamentSeed = 25 #Not sure how to represent if a team didn't make the tourney

    def _per_game(total):
        # There are some teams who may have dropped to Division 2, so they
        # won't have games a certain year. In this case, we don't want to
        # divide by 0, so we'll just set the averages to 0 instead.
        return 0 if numGames == 0 else total/numGames

    avgPointsScored = _per_game(totalPointsScored)
    avgPointsAllowed = _per_game(totalPointsAllowed)
    avg3sMade = _per_game(total3sMade)
    avgTurnovers = _per_game(totalTurnovers)
    avgAssists = _per_game(totalAssists)
    avgRebounds = _per_game(totalRebounds)
    avgSteals = _per_game(totalSteals)

    return [numWins, avgPointsScored, avgPointsAllowed, checkPower6Conference(team_id), avg3sMade, avgAssists, avgTurnovers,
            checkConferenceChamp(team_id, year), checkConferenceTourneyChamp(team_id, year), tournamentSeed,
            sos, srs, avgRebounds, avgSteals, getTourneyAppearances(team_id), getNumChampionships(team_id)]
