# NCAA ML Competition - Women's 2018
## W207 Final Project
## Julia Buffinton, Charlene Chen, Arvindh Ganesan, Prashant Kumar Sahay
### Due: 4/17/18

## Import Relevant Libraries

In [36]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools as it

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

## Load regular season data

In [37]:
# Load the data
data_dir = './WStage2DataFiles/'


def _read_stage2_csv(filename):
    """Read one CSV file that lives in `data_dir`."""
    return pd.read_csv(data_dir + filename)


# Seeds, tournament results and regular-season results (compact and detailed)
df_seeds = _read_stage2_csv('WNCAATourneySeeds.csv')
df_tour_compact = _read_stage2_csv('WNCAATourneyCompactResults.csv')
df_tour_detail = _read_stage2_csv('WNCAATourneyDetailedResults.csv')
df_reg_compact = _read_stage2_csv('WRegularSeasonCompactResults.csv')
df_reg_detail = _read_stage2_csv('WRegularSeasonDetailedResults.csv')

# Teams, plus a left join (on TeamName) to the leagues file
df_teams = _read_stage2_csv('WTeams.csv')
df_teams_leagues = df_teams.merge(_read_stage2_csv('WLeagues.csv'), how='left', on=['TeamName'])

# 2018 tournament results are read from the working directory, not from data_dir
df_tour_18 = pd.read_csv('NCAA_Result_2018_sep.csv')

## Get just numeric seeds - we aren't interested in region

In [38]:
# get just integer value of seed (exclude region information)
def seed_to_int(seed):
    """Return the numeric part of a seed label as an int.

    The two characters at positions 1 and 2 of `seed` hold the digits;
    the leading character and anything after position 2 are ignored.
    """
    digits = seed[1:3]
    return int(digits)
# Add the integer seed, then discard the original string label
df_seeds['SeedInt'] = df_seeds['Seed'].apply(seed_to_int)
df_seeds.drop('Seed', axis=1, inplace=True)

## Load the league that each team belongs to

In [39]:
# Team -> league lookup with 'W'-prefixed column names, ready to join on a game's WTeamID.
df_winning_team_leagues = (
    df_teams_leagues
    .rename(columns={'TeamID': 'WTeamID', 'LeagueName': 'WLeagueName'})
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally
    .drop('TeamName', axis=1)
)
df_winning_team_leagues.head()

Unnamed: 0,WTeamID,WLeagueName
0,3101,Southland
1,3102,MWC
2,3103,MAC
3,3104,SEC
4,3105,SWAC


## Load and bin league performance (to dampen the effect of small variations)

In [40]:
## Bin league performance into 24 equal-width bins (labels 0-23)
# NOTE(review): unpickling can execute arbitrary code; only load a trusted 'league_performance' file
df_league_perf = pd.read_pickle('league_performance')
df_league_perf['Leaguebin'] = pd.cut(df_league_perf['PctWins mean'], 24, labels=False)

# Keep only the league -> bin lookup, then attach the bin of each team's league
df_league_perf = df_league_perf[['LeagueName', 'Leaguebin']]
df_team_league_perf = df_teams_leagues.merge(df_league_perf, how='left', on=['LeagueName'])

df_team_league_perf = df_team_league_perf[['TeamID', 'Leaguebin']]
df_team_league_perf.head()

Unnamed: 0,TeamID,Leaguebin
0,3101,1.0
1,3102,7.0
2,3103,2.0
3,3104,23.0
4,3105,0.0


In [41]:
## Inject winning and losing teams' leagues into df
# Total of 36 columns
# The losing-team lookup is the winning-team lookup with 'L'-prefixed column names
df_losing_team_leagues = df_winning_team_leagues.rename(
    columns={'WTeamID': 'LTeamID', 'WLeagueName': 'LLeagueName'})

# Left joins keep every game, adding WLeagueName and then LLeagueName
df_reg_detail = (
    df_reg_detail
    .merge(df_winning_team_leagues, how='left', on=['WTeamID'])
    .merge(df_losing_team_leagues, how='left', on=['LTeamID'])
)
df_reg_detail.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WLeagueName,LLeagueName
0,2010,11,3103,63,3237,49,H,0,23,54,...,10,11,27,11,23,7,6,19,MAC,Horizon
1,2010,11,3104,73,3399,68,N,0,26,62,...,27,14,26,7,20,4,2,27,SEC,OVC
2,2010,11,3110,71,3224,59,A,0,29,62,...,23,17,23,8,15,6,0,15,Patriot,MEAC
3,2010,11,3111,63,3267,58,A,0,27,52,...,25,22,22,15,11,14,5,14,Sun Belt,C-USA
4,2010,11,3119,74,3447,70,H,1,30,74,...,21,21,32,12,14,4,2,14,Patriot,NEC


## Feature Engineering
-  Operationalize the notion of Point Opportunities Developed (POD) and Opportunity Conversion Rate (OCR)
-  Demean and rescale the performance of teams by league and season
-  Bin performance to dampen small variations


In [42]:
## Add a few more columns
## Now have a total of 44 columns

# POD: points available from the shots taken -
# 3 per three-point attempt, 2 per other field-goal attempt, 1 per free-throw attempt
df_reg_detail['WPOD'] = df_reg_detail['WFGA3'] * 3 + (df_reg_detail['WFGA'] - df_reg_detail['WFGA3']) * 2 + df_reg_detail['WFTA'] * 1
df_reg_detail['LPOD'] = df_reg_detail['LFGA3'] * 3 + (df_reg_detail['LFGA'] - df_reg_detail['LFGA3']) * 2 + df_reg_detail['LFTA'] * 1

# OCR: share of those available points that was actually scored
df_reg_detail['WOCR'] = 1.0 * df_reg_detail['WScore'] / df_reg_detail['WPOD']
df_reg_detail['LOCR'] = 1.0 * df_reg_detail['LScore'] / df_reg_detail['LPOD']

df_reg_detail['WFGP'] = df_reg_detail['WFGM'] / df_reg_detail['WFGA'] ## Field goal percentage for the winning team
df_reg_detail['LFGP'] = df_reg_detail['LFGM'] / df_reg_detail['LFGA'] ## Field goal percentage for the losing team

# Free throws made / attempted; a game with zero attempts gives 0/0 = NaN
df_reg_detail['WFTP'] = df_reg_detail['WFTM'] / df_reg_detail['WFTA'] ## Free throw percentage for the winning team
df_reg_detail['LFTP'] = df_reg_detail['LFTM'] / df_reg_detail['LFTA'] ## Free throw percentage for the losing team

In [43]:
# Per-game statistics, named without their W/L prefix, to convert into binned team features
features_todo = [
    'FGP', 'FTP',   # shooting percentages
    'FGA', 'FTA',   # attempts
    'Ast', 'Blk',
    'OR', 'DR',
    'POD', 'OCR',   # engineered columns
    'Stl',
]

In [44]:
# take in a game-level dataframe and a list of feature names
def calc_feats(dat, feat_list, n_bins=24):
    """Build per-team, per-season features normalised within each league.

    Parameters
    ----------
    dat : DataFrame
        One row per game. Needs a 'Season' column plus 'W'- and 'L'-prefixed
        columns for 'TeamID', 'LeagueName' and every name in `feat_list`
        (e.g. 'WFGP' / 'LFGP' for the feature 'FGP').
    feat_list : list of str
        Feature names without the W/L prefix.
    n_bins : int, default 24
        Number of equal-width bins used to discretise each normalised feature.

    Returns
    -------
    DataFrame
        One row per distinct (TeamID, LeagueName, Season). For each feature
        `f` it holds: `f` (the team's mean over its games), `f+'mean'` and
        `f+'std'` (mean / standard deviation over all games of that league
        and season), `f+'norm'` (the team's z-score within the league) and
        `f+'bin'` (bin index of that z-score).
    """
    team_keys = ['Season', 'LeagueName', 'TeamID']
    league_keys = ['Season', 'LeagueName']

    def _one_side(prefix):
        # Columns of one side of each game, with the W/L prefix stripped
        cols = [col for col in dat if col.startswith(prefix)]
        side = dat[cols].copy()
        side.columns = [col[1:] for col in cols]
        side['Season'] = dat['Season']
        return side

    # every game contributes two rows: one per participating team
    all_teams = pd.concat([_one_side('W'), _one_side('L')])

    all_tm_lg_szn = all_teams[['TeamID', 'LeagueName', 'Season']].drop_duplicates()

    for f in feat_list:
        # team-level mean of the per-game values
        team_agg = all_teams.groupby(team_keys, as_index=False)[f].mean()

        # league-level mean and spread of the per-game values
        league_agg = (all_teams.groupby(league_keys)[f]
                      .agg(['mean', 'std'])
                      .rename(columns={'mean': f + 'mean', 'std': f + 'std'})
                      .reset_index())

        team_agg = pd.merge(left=team_agg, right=league_agg, how='left', on=league_keys)

        team_agg[f + 'norm'] = (team_agg[f] - team_agg[f + 'mean']) / team_agg[f + 'std']
        # bin edges span the min..max of this feature's z-scores across all rows
        team_agg[f + 'bin'] = pd.cut(team_agg[f + 'norm'], n_bins, labels=False)

        all_tm_lg_szn = pd.merge(left=all_tm_lg_szn, right=team_agg, how='left', on=team_keys)

    return all_tm_lg_szn

In [45]:
# note: this includes teams from 2018
# Engineer the binned features, then left-join each team's league-strength bin
df_engineered_features = (
    calc_feats(df_reg_detail, features_todo)
    .merge(df_team_league_perf, how='left', on='TeamID')
)

In [46]:
# Preview the engineered feature table (one row per team, league and season).
# This has 10 fewer columns than Prashant's because it doesn't have duplicate LeagueName columns
df_engineered_features.head()

Unnamed: 0,TeamID,LeagueName,Season,FGP,FGPmean,FGPstd,FGPnorm,FGPbin,FTP,FTPmean,...,OCRmean,OCRstd,OCRnorm,OCRbin,Stl,Stlmean,Stlstd,Stlnorm,Stlbin,Leaguebin
0,3103,MAC,2010,0.405346,0.402665,0.072138,0.037166,10,0.704329,0.698111,...,0.432482,0.068308,0.104973,11,8.033333,9.304709,3.193953,-0.398057,5,2.0
1,3104,SEC,2010,0.40077,0.415768,0.074202,-0.202128,8,0.621109,0.67691,...,0.43703,0.07129,-0.327385,7,7.37931,8.371495,3.85994,-0.257047,6,23.0
2,3110,Patriot,2010,0.39955,0.381652,0.067985,0.263277,12,0.740438,0.686326,...,0.410244,0.065445,0.343623,14,7.952381,7.6,3.381968,0.104194,9,0.0
3,3111,Sun Belt,2010,0.435525,0.394316,0.076786,0.53667,14,0.622631,0.671585,...,0.422639,0.073208,0.376765,14,9.935484,8.425287,3.595169,0.420063,11,1.0
4,3119,Patriot,2010,0.358097,0.381652,0.067985,-0.346465,7,0.673364,0.686326,...,0.410244,0.065445,-0.471252,6,6.458333,7.6,3.381968,-0.337575,5,0.0


In [47]:
# Create 'winning team' and 'losing team' versions of engineered features DF
# To align with Prashant, each needs to contain only the 'bin' features, Season, and TeamID


def _prefixed_bin_features(features, prefix):
    """Return the '*bin' columns of `features` renamed with `prefix`, plus Season and prefixed TeamID."""
    bin_cols = [col for col in features if col.endswith('bin')]
    prefixed = features[bin_cols].copy()
    prefixed.columns = [prefix + col for col in bin_cols]
    prefixed['Season'] = features['Season']
    prefixed[prefix + 'TeamID'] = features['TeamID']
    return prefixed


# '1' marks the first-listed team of a matchup, '2' the second
df_engineered_features_w = _prefixed_bin_features(df_engineered_features, '1')
df_engineered_features_l = _prefixed_bin_features(df_engineered_features, '2')

In [48]:
# Preview the '2'-prefixed (second team) version of the binned features
df_engineered_features_l.head()

Unnamed: 0,2FGPbin,2FTPbin,2FGAbin,2FTAbin,2Astbin,2Blkbin,2ORbin,2DRbin,2PODbin,2OCRbin,2Stlbin,2Leaguebin,Season,2TeamID
0,10,14,7,12,10,6,12,11,6,11,5,2.0,2010,3103
1,8,9,11,10,9,6,11,14,10,7,6,23.0,2010,3104
2,12,17,9,13,8,13,14,9,9,14,9,0.0,2010,3110
3,14,10,11,9,16,22,13,17,9,14,11,1.0,2010,3111
4,7,12,9,8,5,10,13,10,8,6,5,0.0,2010,3119


## Build Regular Season and Tournament Datasets

In [49]:
# Mirror every game so that it appears once from each team's point of view:
# winner listed first -> Result = 1, loser listed first -> Result = 0
def gen_neg_items(df):
    """Return win rows (Result = 1) followed by mirrored loss rows (Result = 0).

    `df` needs 'Season', 'WTeamID' and 'LTeamID' columns. The output has the
    columns Season, 1TeamID, 2TeamID, Result and keeps the row labels of `df`,
    so every label appears twice.
    """
    matchup_cols = ['Season', '1TeamID', '2TeamID']

    winner_first = df[['Season', 'WTeamID', 'LTeamID']].copy()
    winner_first.columns = matchup_cols
    winner_first['Result'] = 1

    loser_first = df[['Season', 'LTeamID', 'WTeamID']].copy()
    loser_first.columns = matchup_cols
    loser_first['Result'] = 0

    return pd.concat([winner_first, loser_first])

In [50]:
# Games from seasons after 2009, mirrored into Result = 1 and Result = 0 rows
df_regular_season_games = gen_neg_items(df_reg_detail.loc[df_reg_detail['Season'] > 2009])

In [51]:
# Attach team-1 features, then team-2 features, matching on season and team id.
# Both are inner joins, so games without features for either team drop out.
df_regular_season_games_with_features = (
    df_regular_season_games
    .merge(df_engineered_features_w, on=['Season', '1TeamID'])
    .merge(df_engineered_features_l, on=['Season', '2TeamID'])
)

In [52]:
# This is the training dataset for model 1
# Contains all matchups from regular season games 2010-18
# Each game appears twice: once with the winner as team 1 (Result = 1)
# and once with the loser as team 1 (Result = 0)
df_regular_season_games_with_features.head()

Unnamed: 0,Season,1TeamID,2TeamID,Result,1FGPbin,1FTPbin,1FGAbin,1FTAbin,1Astbin,1Blkbin,...,2FGAbin,2FTAbin,2Astbin,2Blkbin,2ORbin,2DRbin,2PODbin,2OCRbin,2Stlbin,2Leaguebin
0,2010,3103,3237,1,10,14,7,12,10,6,...,5,14,2,15,12,11,5,6,7,4.0
1,2010,3231,3237,1,6,12,14,14,6,2,...,5,14,2,15,12,11,5,6,7,4.0
2,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,5,6,7,4.0
3,2010,3282,3237,1,12,14,10,11,10,7,...,5,14,2,15,12,11,5,6,7,4.0
4,2010,3293,3237,1,12,19,13,12,12,13,...,5,14,2,15,12,11,5,6,7,4.0


In [53]:
# While we're at it, let's build our training dataset for our second model
# This is all actual tournament data from 2010-18

# Key difference: 2010-17 tourneys by win-lose team
# 2018 is organized by team #

# TODO: DOUBLE CHECK THIS
# Keep seasons after 2009 and only the columns gen_neg_items needs;
# the result also includes the mirrored 0-outcome rows
df_tour = gen_neg_items(
    df_tour_compact.loc[df_tour_compact['Season'] > 2009, ['Season', 'WTeamID', 'LTeamID']]
)
df_tour.shape

(1008, 4)

In [54]:
# Give the 2018 results the same '1TeamID' / '2TeamID' column names as df_tour
df_tour_18 = df_tour_18.rename(columns={'Team1': '1TeamID', 'Team2': '2TeamID'})

# Stack the 2018 rows beneath df_tour; row labels are kept as they are (not renumbered)
df_tour_matchups = pd.concat([df_tour, df_tour_18])
df_tour_matchups.shape

(1071, 4)

In [55]:
# Inject regular season stats to tournament games
# (inner joins on season + team id: team-1 features first, then team-2 features)
df_tour_matchups_with_features = (
    df_tour_matchups
    .merge(df_engineered_features_w, on=['Season', '1TeamID'])
    .merge(df_engineered_features_l, on=['Season', '2TeamID'])
)

## Train Model 1 on Regular Season Results

In [56]:
# For 8 teams, we are missing league information so we've had to drop those from our dataset
df_train_1 = df_regular_season_games_with_features.dropna(how='any')

## UPDATE THIS IF WE WANT TO DROP FEATURES (select a column subset of df_train_1 here)
# NOTE(review): only 'Result' is removed below, so Season, 1TeamID and 2TeamID
# are passed to the models as features - confirm that is intended.

################
# Separate training and test data based on season played
# Will eventually be predicting 2018 matchups, so that must be test
X_train_1 = df_train_1[df_train_1['Season'] < 2018]
X_test_1 = df_train_1[df_train_1['Season'] == 2018]

y_train_1 = X_train_1['Result'].values
y_test_1 = X_test_1['Result'].values

# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally
X_train_1 = X_train_1.drop('Result', axis=1)
X_test_1 = X_test_1.drop('Result', axis=1)

In [67]:
def eval(clf,train_x,train_y,test_x,test_y):
    """Fit `clf` on the training data and print two test-set metrics.

    Prints `clf.score(test_x, test_y)` (mean accuracy for scikit-learn
    classifiers) and the log loss of `clf.predict_proba(test_x)` against
    `test_y`. `clf` is fitted in place; nothing is returned.

    NOTE: this name shadows Python's built-in `eval`. It is kept because
    other cells call it by this name.
    """
    clf.fit(train_x, train_y)
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print("score = " + str(clf.score(test_x, test_y)))

    pred = clf.predict_proba(test_x)
    print("log loss = " + str(metrics.log_loss(test_y, pred)))

In [68]:
# Fit three candidate classifiers for model 1 and report their 2018 test metrics.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("fitting logistic regression model")
modelLR = LogisticRegression()
eval(modelLR, X_train_1, y_train_1, X_test_1, y_test_1)

print("\nfitting Naive Bayes model")
modelNB = GaussianNB()
eval(modelNB, X_train_1, y_train_1, X_test_1, y_test_1)

print("\nfitting Random Forest model")
modelRF = RandomForestClassifier(n_jobs=2, random_state=0)
eval(modelRF, X_train_1, y_train_1, X_test_1, y_test_1)

fitting logistic regression model
score = 0.73910539451
log loss = 0.511413498501

fitting Naive Bayes model
score = 0.706661547322
log loss = 0.704248870597

fitting Random Forest model
score = 0.696966788251
log loss = 0.630665296146


## Engineer Feature 'winprob' for tournament matchups based on regular season data

In [58]:
# Preview the tournament matchups with both teams' binned regular-season features attached
df_tour_matchups_with_features.head()

Unnamed: 0,Season,1TeamID,2TeamID,Result,1FGPbin,1FTPbin,1FGAbin,1FTAbin,1Astbin,1Blkbin,...,2FGAbin,2FTAbin,2Astbin,2Blkbin,2ORbin,2DRbin,2PODbin,2OCRbin,2Stlbin,2Leaguebin
0,2010,3124,3201,1,14,16,8,20,11,18,...,14,14,12,6,12,13,15,13,12,7.0
1,2010,3124,3207,1,14,16,8,20,11,18,...,12,16,13,6,16,2,13,10,17,15.0
2,2010,3265,3207,0,15,22,10,15,13,13,...,12,16,13,6,16,2,13,10,17,15.0
3,2010,3124,3397,1,14,16,8,20,11,18,...,11,11,12,17,13,18,9,16,5,23.0
4,2010,3173,3397,0,13,14,15,16,13,14,...,11,11,12,17,13,18,9,16,5,23.0


In [59]:
## UPDATE columns used as needed
df_train_2 = df_tour_matchups_with_features

# Need to make predictions for everything in tournament data
# Do not separate 2018 from everything else - need to generate predictions

y_train_2 = df_train_2['Result'].values
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally
X_train_2 = df_train_2.drop('Result', axis=1)

In [60]:
# Use Naive Bayes Probabilities
## WHY NB vs. LR?

# predict_proba returns one column per entry of modelNB.classes_, in that order.
# Verify the order is [0 (loss), 1 (win)] before naming the columns.
assert list(modelNB.classes_) == [0, 1]

# Reuse X_train_2's index so the column-wise concat pairs every probability
# with its own matchup row, whatever row labels X_train_2 carries.
df_tour_winprob = pd.DataFrame(modelNB.predict_proba(X_train_2),
                               columns=['LossProb', 'WinProb'],
                               index=X_train_2.index)

df_tour_winprob = pd.concat([X_train_2, df_tour_winprob], axis=1)

df_tour_winprob['Result'] = y_train_2
df_tour_winprob.tail(10)

Unnamed: 0,Season,1TeamID,2TeamID,1FGPbin,1FTPbin,1FGAbin,1FTAbin,1Astbin,1Blkbin,1ORbin,...,2Blkbin,2ORbin,2DRbin,2PODbin,2OCRbin,2Stlbin,2Leaguebin,LossProb,WinProb,Result
1061,2018,3268,3343,13,12,13,17,12,9,16,...,13,12,14,8,14,9,2.0,0.188065,0.811935,1
1062,2018,3276,3294,14,16,8,12,9,7,11,...,9,5,7,7,15,11,0.0,0.313545,0.686455,1
1063,2018,3278,3453,10,17,17,18,11,5,16,...,6,11,16,7,14,10,4.0,0.22036,0.77964,1
1064,2018,3280,3311,14,16,13,14,12,9,12,...,8,8,14,9,13,11,1.0,0.037444,0.962556,1
1065,2018,3280,3329,14,16,13,14,12,9,12,...,4,12,11,14,10,10,21.0,0.059939,0.940061,1
1066,2018,3323,3437,19,18,14,16,17,7,13,...,13,7,15,9,10,2,15.0,8.7e-05,0.999913,1
1067,2018,3355,3437,13,18,12,15,14,13,14,...,13,7,15,9,10,2,15.0,0.016872,0.983128,0
1068,2018,3329,3393,9,17,13,18,9,4,12,...,7,13,10,15,8,9,22.0,0.448743,0.551257,1
1069,2018,3332,3370,17,17,10,14,15,10,10,...,9,13,21,13,12,7,0.0,0.02166,0.97834,1
1070,2018,3333,3443,16,13,8,6,15,17,10,...,13,19,6,14,14,15,12.0,0.168957,0.831043,1


## Feature Engineering & Selection for Model 2

In [61]:
# Our baseline, which uses only seed-difference information, does pretty well.
# But we think a given seed difference doesn't mean the same thing at every seed level,
# so we also express it as a fraction of the two seeds' sum for each matchup.

In [62]:
# Calculate Seed Difference, and Weighted Seed Difference

# Seed lookups with column names matching team 1 and team 2 of a matchup
df_seeds_w = df_seeds.rename(columns={'TeamID': '1TeamID', 'SeedInt': '1Seed'})
df_seeds_l = df_seeds.rename(columns={'TeamID': '2TeamID', 'SeedInt': '2Seed'})

# Left joins: every matchup row is kept, with 1Seed and 2Seed added
df_tour_final_feats = (
    df_tour_winprob
    .merge(df_seeds_w, how='left', on=['Season', '1TeamID'])
    .merge(df_seeds_l, how='left', on=['Season', '2TeamID'])
)

## Calculate 'difference' statistics for teams in the matchup

# SeedDiff is positive when Team1's seed number is smaller than Team2's
df_tour_final_feats['SeedDiff'] = df_tour_final_feats['2Seed'] - df_tour_final_feats['1Seed']
# Scale the difference by the sum of the two seed numbers
df_tour_final_feats['SeedDiffPct'] = (
    df_tour_final_feats['SeedDiff']
    / (df_tour_final_feats['1Seed'] + df_tour_final_feats['2Seed'])
)

# We would also like to weight by relative league strength, so calculate that difference
df_tour_final_feats['LeaguebinDiff'] = df_tour_final_feats['1Leaguebin'] - df_tour_final_feats['2Leaguebin']

## this now contains all reg season features plus select 'difference' features
df_tour_final_feats.head()

Unnamed: 0,Season,1TeamID,2TeamID,1FGPbin,1FTPbin,1FGAbin,1FTAbin,1Astbin,1Blkbin,1ORbin,...,2Stlbin,2Leaguebin,LossProb,WinProb,Result,1Seed,2Seed,SeedDiff,SeedDiffPct,LeaguebinDiff
0,2010,3124,3201,14,16,8,20,11,18,9,...,12,7.0,0.076033,0.923967,1,4,13,9,0.529412,14.0
1,2010,3124,3207,14,16,8,20,11,18,9,...,17,15.0,0.059654,0.940346,1,4,5,1,0.111111,6.0
2,2010,3265,3207,15,22,10,15,13,13,7,...,17,15.0,0.088892,0.911108,0,12,5,-7,-0.411765,-12.0
3,2010,3124,3397,14,16,8,20,11,18,9,...,5,23.0,0.43867,0.56133,1,4,1,-3,-0.6,-2.0
4,2010,3173,3397,13,14,15,16,13,14,16,...,5,23.0,0.642625,0.357375,0,8,1,-7,-0.777778,-13.0


In [63]:
# Generate features for baseline data: SeedDiff is the only predictor.
# Seasons before 2018 are used for training, the 2018 season for testing.
X_train_baseline = df_tour_final_feats.loc[df_tour_final_feats['Season'] < 2018, ['SeedDiff']]
X_test_baseline = df_tour_final_feats.loc[df_tour_final_feats['Season'] == 2018, ['SeedDiff']]

# Output = 1 or 0
y_train_baseline = df_tour_final_feats.loc[df_tour_final_feats['Season'] < 2018, 'Result']
y_test_baseline = df_tour_final_feats.loc[df_tour_final_feats['Season'] == 2018, 'Result']

In [64]:
# Final features: SeedDiffPct, LeaguebinDiff, winprob
# Seasons before 2018 are used for training, the 2018 season for testing.
X_train = df_tour_final_feats.loc[df_tour_final_feats['Season'] < 2018,
                                  ['SeedDiffPct', 'LeaguebinDiff', 'WinProb']]
X_test = df_tour_final_feats.loc[df_tour_final_feats['Season'] == 2018,
                                 ['SeedDiffPct', 'LeaguebinDiff', 'WinProb']]

# Output = 1 or 0
y_train = df_tour_final_feats.loc[df_tour_final_feats['Season'] < 2018, 'Result']
y_test = df_tour_final_feats.loc[df_tour_final_feats['Season'] == 2018, 'Result']

## Fit Model 2 to Training Tourney Matchups and Predict Test Matchups

In [72]:
# Determine models we'd like to use
# model_names[i] is the display label for models[i]; keep both lists in the same order
model_names = [
    'Logistic Regression',
    'SVM',
    'Gaussian NB',
    'Random Forests',
    'Gradient Boosting',
]
models = [
    LogisticRegression(),
    svm.SVC(probability=True),
    GaussianNB(),
    RandomForestClassifier(n_jobs=2, random_state=0),
    GradientBoostingClassifier(n_estimators=10, learning_rate=1.0, max_depth=2, random_state=0),
]

In [79]:
## Reminder of baseline predictions

# Each estimator in `models` is fitted in place on the SeedDiff-only baseline features.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for name, model in zip(model_names, models):
    print("Performance on " + name)
    eval(model, X_train_baseline, y_train_baseline, X_test_baseline, y_test_baseline)
    print("")

Performance on Logistic Regression
score = 0.761904761905
log loss = 0.491646882578

Performance on SVM
score = 0.714285714286
log loss = 0.570466808634

Performance on Gaussian NB
score = 0.761904761905
log loss = 0.493318056692

Performance on Random Forests
score = 0.714285714286
log loss = 0.468333713454

Performance on Gradient Boosting
score = 0.761904761905
log loss = 0.466783386092



In [81]:
## Achieve better results using our model

# Each estimator in `models` is fitted in place on the final feature set.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for name, model in zip(model_names, models):
    print("Performance on " + name)
    eval(model, X_train, y_train, X_test, y_test)
    print("")

Performance on Logistic Regression
score = 0.761904761905
log loss = 0.432803111266

Performance on SVM
score = 0.793650793651
log loss = 0.482783377191

Performance on Gaussian NB
score = 0.777777777778
log loss = 0.498292613749

Performance on Random Forests
score = 0.777777777778
log loss = 0.418648024739

Performance on Gradient Boosting
score = 0.746031746032
log loss = 0.432034319899

