In [126]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
#import tensorflow as tf 


# Preprocess Each Dataset (Fighters and Bouts)

In [127]:
#just fighters
fighters_data = pd.read_csv('Fights_scraper/spiders/scraped_fighters.csv') 
#just bouts
bouts_data = pd.read_csv('Bouts_Scraper/bouts_scraped/bouts_scraped/spiders/scraped_bouts.csv') 
#combined dataset, created with SQL Query, joined on fighter1 and 2 names
fighter_bouts = pd.read_csv('fighters_bouts_joined.csv') 


In [128]:
#shuffles winners in the df to make the classes of winners and losers balanced
import math
negative_index = np.random.choice(len(fighter_bouts),
                                  size= math.ceil(len(fighter_bouts)/2),
                                  replace = False)

fighter_bouts.iloc[negative_index,[2,3]] = fighter_bouts.iloc[negative_index,[3,2]].values



In [129]:
#make winner column align correctly with negative index
fighter_bouts['winner'].iloc[negative_index] = fighter_bouts['fighter2'].iloc[negative_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [130]:
#creates two lists of the relevant columns stats for f1 and f2
f1_list = [col for col in fighter_bouts.columns if 'f1' in col]
f2_list = [col for col in fighter_bouts.columns if 'f2' in col]


In [131]:
f1_index = []
for col in fighter_bouts.columns:
    if col in f1_list:
        f1_index.append(fighter_bouts.columns.get_loc(col))

f2_index = []
for col in fighter_bouts.columns:
    if col in f2_list:
        f2_index.append(fighter_bouts.columns.get_loc(col))

In [132]:
fb_copy = fighter_bouts.copy()
#all values at the negative index of f1's become the values of negative index of f2's
fighter_bouts.iloc[negative_index,f1_index] = fighter_bouts.iloc[negative_index,f2_index].values
#all values at the negative index of f2's become the values of negative index of f1's using copy of fb_copy of fighter bouts
fighter_bouts.iloc[negative_index,f2_index] =fb_copy.iloc[negative_index,f1_index].values

In [133]:
#seperate df consisting only of categorical dtypes
categorical_fb = fighter_bouts.select_dtypes(include='object')
#drop those types for now from fighter bouts leaving only numeric values
fighter_bouts = fighter_bouts.drop(columns=categorical_fb.columns)


In [134]:
#some fighters have '--' as a null for their DOB and height
categorical_fb['f1_dob'] =  categorical_fb['f1_dob'].replace('--',None) 
categorical_fb['f2_dob'] =  categorical_fb['f2_dob'].replace('--',None)
#some fighters have '--' as a null for their DOB
categorical_fb['f1_height'] =  categorical_fb['f1_height'].replace('--',None) 
categorical_fb['f2_height'] =  categorical_fb['f2_height'].replace('--',None)

In [135]:
categorical_fb['event_date'] = pd.to_datetime(categorical_fb['event_date'])
categorical_fb['f1_dob'] = pd.to_datetime(categorical_fb['f1_dob'])
categorical_fb['f2_dob'] = pd.to_datetime(categorical_fb['f2_dob'])


In [136]:
#weights can be seen as an ordinal category value so this list declares that order
weights_ordered = ["Women's Strawweight","Women's Flyweight","Women's Bantamweight",
               "Women's Featherweight", "Flyweight", "Bantamweight", "Featherweight",
               "Lightweight","Welterweight", "Middleweight","Light Heavyweight",
               "Heavyweight", "Super Heavyweight", "Open Weight","Catch Weight"]



In [137]:
categorical_fb["weight_class"] = categorical_fb.weight_class.astype("category",
                                 ordered=True,
                                 categories= weights_ordered).cat.codes


  This is separate from the ipykernel package so we can avoid doing imports until


In [138]:
#Creating a new feature for fighter1 and fighter2: ageAtFight. It's ordinal so no need to one hot encode,
categorical_fb['f1_ageAtFight'] = (categorical_fb.event_date - categorical_fb.f1_dob).dt.days
categorical_fb['f2_ageAtFight'] = (categorical_fb.event_date - categorical_fb.f2_dob).dt.days

In [139]:
#starting to clean up the the fighters' records categories, first we replace text ('Record') with nothing
categorical_fb = categorical_fb.replace('Record: ',"",regex=True) 

In [140]:
#make these values all zero (just in case something goes wrong with record_split()
categorical_fb['f1_win'],categorical_fb['f1_loss'],categorical_fb['f1_draw'],categorical_fb['f1_nc'] = 0,0,0,0
categorical_fb['f2_win'],categorical_fb['f2_loss'],categorical_fb['f2_draw'],categorical_fb['f2_nc'] = 0,0,0,0


In [141]:
#a small function to handle the remainder of the record strings
#takes in a row split the values on '-'
#if 'NC' is contained the draw var(the only place it could be) then we split draw on brackets first
#draw is simply equal to the 0th element of that split
#nc is equal to the 1st element of that split with a regex to remove any remaining non numeric values
#cast all values to int and return all 4 
import re

def record_split(row):
        win,loss, draw = row.split('-')
        nc= 0
        if 'NC' in draw:       
            draw = draw.split('(')
            nc = re.sub('[^0-9]','', draw[1])
            draw = draw[0]
        else:
            nc = 0     
            
        win,loss, draw, nc = int(win),int(loss), int(draw), int(nc)
            
        return win,loss,draw,nc
        

In [142]:
#this is a messy way to do these assignments but works. 
#use the pd apply function to apply our record_split() function on each row in df
#zip(*...) unpacks our results nicely
categorical_fb['f1_win'],\
categorical_fb['f1_loss'],\
categorical_fb['f1_draw'],\
categorical_fb['f1_nc'] = zip(*categorical_fb.apply(lambda x: record_split(x['f1_record']), axis=1))

categorical_fb['f2_win'],\
categorical_fb['f2_loss'],\
categorical_fb['f2_draw'],\
categorical_fb['f2_nc'] = zip(*categorical_fb.apply(lambda x: record_split(x['f2_record']), axis=1))



In [143]:
categorical_fb = categorical_fb.drop(columns =['f1_record','f2_record'])

In [144]:

#adds winner to end of column for readability 
cols = list(categorical_fb.columns.values) 
cols.pop(cols.index('winner')) 
categorical_fb = categorical_fb[cols+['winner']] 



In [145]:
categorical_fb.head()

Unnamed: 0,event_date,figher1,fighter2,weight_class,win_method_finish,win_method_type,f1_dob,f1_height,f1_stance,f2_dob,...,f2_ageAtFight,f1_win,f1_loss,f1_draw,f1_nc,f2_win,f2_loss,f2_draw,f2_nc,winner
0,2018-01-14,Jeremy Stephens,Dooho Choi,6,Punch,KO/TKO,1986-05-25,5' 8,Orthodox,1991-04-10,...,9776,28,16,0,0,14,3,0,0,Jeremy Stephens
1,2018-01-14,Paige VanZant,Jessica-Rose Clark,1,,U-DEC,1994-03-26,5' 4,Orthodox,1987-11-28,...,11005,8,4,0,0,9,5,0,1,Jessica-Rose Clark
2,2018-01-14,Emil Meek,Kamaru Usman,8,,U-DEC,1988-08-20,5' 11,Switch,1987-05-11,...,11206,9,4,1,1,15,1,0,0,Kamaru Usman
3,2018-01-14,Darren Elkins,Michael Johnson,6,Rear Naked Choke,SUB,1984-05-16,5' 10,Orthodox,1986-06-04,...,11547,25,7,0,0,20,14,0,0,Darren Elkins
4,2018-01-14,James Krause,Alex White,7,,U-DEC,1986-06-04,6' 2,Orthodox,1988-10-22,...,10676,25,8,0,0,13,5,0,0,James Krause


In [146]:
def parse_height(height):
    #expected format is 5' 10, 6'3 etc
    ht = height.split("' ")
    ft = float(ht[0])
    inch = float(ht[1])
    return (12*ft) + inch
        

In [147]:
categorical_fb.f2_height = categorical_fb.f2_height.apply(lambda x: parse_height(x))
categorical_fb.f1_height = categorical_fb.f1_height.apply(lambda x: parse_height(x))

In [149]:
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.win_method_type,prefix= 'win_method'))
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.win_method_finish,prefix= 'win_finish'))
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.f1_stance,prefix= 'f1_stance'))
categorical_fb = categorical_fb.join(pd.get_dummies(categorical_fb.f2_stance,prefix= 'f2_stance'))

In [150]:
categorical_fb = categorical_fb.drop(columns=['win_method_type','win_method_finish','f1_stance','f2_stance','f1_dob','f2_dob','event_date'])

In [151]:

#adds winner to end of column for readability 
cols = list(categorical_fb.columns.values) 
cols.pop(cols.index('winner')) 
categorical_fb = categorical_fb[cols+['winner']] 



In [152]:
categorical_fb['winner'] = (categorical_fb['fighter2'] == categorical_fb['winner']).astype('int')
categorical_fb['figher1'] = 0
categorical_fb['fighter2'] = 1

In [155]:
fbs_joined = pd.concat([fighter_bouts,categorical_fb],axis=1)

In [156]:
targets = fbs_joined.winner

In [157]:
fbs_joined = fbs_joined.drop(columns=['winner'])

In [None]:
fbs_joined.head()

In [159]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy= 'median',copy= False)
scaler = StandardScaler()

In [160]:
scaled_fbs = scaler.fit_transform(fbs_joined)
imputed_fbs = imputer.fit_transform(scaled_fbs)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [161]:
scaled_fbs[0:10]

array([[-0.16311237, -0.54384707, -0.56070889, ..., -0.02872721,
        -0.49251571, -0.1742809 ],
       [-0.16311237, -0.54384707,  1.43232926, ..., -0.02872721,
        -0.49251571, -0.1742809 ],
       [-0.16311237, -0.54384707,  2.92710788, ..., -0.02872721,
        -0.49251571,  5.73786345],
       ...,
       [-0.16311237, -0.06233546, -0.56070889, ..., -0.02872721,
         2.0303921 , -0.1742809 ],
       [-0.16311237, -0.06233546,  0.93406973, ..., -0.02872721,
        -0.49251571, -0.1742809 ],
       [-0.16311237, -0.06233546, -0.56070889, ..., -0.02872721,
        -0.49251571, -0.1742809 ]])

In [162]:
from sklearn.model_selection import StratifiedShuffleSplit
sss= StratifiedShuffleSplit(n_splits=20,test_size=0.2,random_state=42)

for train_index, test_index in sss.split(imputed_fbs, targets):
    X_train, X_test = imputed_fbs[train_index], imputed_fbs[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

In [170]:
np.savetxt("model_data/X_train.csv", X_train, delimiter=",")
np.savetxt("model_data/y_train.csv", y_train, delimiter=",")
np.savetxt("model_data/X_test.csv", X_test, delimiter=",")
np.savetxt("model_data/y_test.csv", y_test, delimiter=",")

# Build Nueral Net

In [164]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras import optimizers, losses, models

In [165]:
print(X_train.shape)
print(y_train.shape)

(3880, 102)
(3880,)


In [None]:
model = tf.keras.Sequential()

model.add(Dense(100,activation='relu',input_dim= X_train.shape[1]))
model.add(Dense(1,activation ='sigmoid'))

model.compile(optimizer= 'adam',
              loss = 'binary_crossentropy',
              metrics =['accuracy'])

model.fit(x=X_train,
          y=y_train,
          epochs=2,batch_size=36)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/2


# Fight Stats Predictor Data Prep