Scripts del Proyecto (UFC WINNER PREDICTION)

Script 1: Preparación de datos para el entrenamiento

In [1]:
#Import Cell
#used to import all the libraries and functions used
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import *
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
import sys, warnings, os

In [2]:
# Silence warnings (e.g. max-iteration warnings raised while cross-validating)
# unless warning options were already supplied on the command line.
# The same setting is also exported through the PYTHONWARNINGS variable.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")

In [3]:
# Lift pandas' display limits so every column and every row is shown.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
# Read the raw training table.
# 'Winner' is deliberately kept as a regular column (not moved into the index):
# a later cell reads ufc_master_ds.Winner and re-assigns ufc_master_ds['Winner'],
# and attribute/column access does not see index levels.
ufc_master_ds = pd.read_csv("../data/raw/ufc-master_final.csv")

In [None]:
# Data transformation: for each paired statistic, add a Blue-minus-Red
# difference column. Entries are (new column, Blue column, Red column) and are
# processed in order, so the new columns are appended in this sequence.
diff_specs = [
    ('draw_diff', 'BlueDraws', 'RedDraws'),
    ('avg_sig_str_pct_diff', 'BlueAvgSigStrPct', 'RedAvgSigStrPct'),
    ('avg_TD_pct_diff', 'BlueAvgTDPct', 'RedAvgTDPct'),
    ('win_by_Decision_Majority_diff', 'BlueWinsByDecisionMajority', 'RedWinsByDecisionMajority'),
    ('win_by_Decision_Split_diff', 'BlueWinsByDecisionSplit', 'RedWinsByDecisionSplit'),
    ('win_by_Decision_Unanimous_diff', 'BlueWinsByDecisionUnanimous', 'RedWinsByDecisionUnanimous'),
    ('win_by_TKO_Doctor_Stoppage_diff', 'BlueWinsByTKODoctorStoppage', 'RedWinsByTKODoctorStoppage'),
    ('odds_diff', 'BlueOdds', 'RedOdds'),
    ('ev_diff', 'BlueExpectedValue', 'RedExpectedValue'),
]
for diff_col, blue_col, red_col in diff_specs:
    ufc_master_ds[diff_col] = ufc_master_ds[blue_col] - ufc_master_ds[red_col]

In [None]:
# The information needed from these per-corner variables has already been
# extracted, so the raw columns are no longer required and are removed.
var_drop = [
    # Blue/Red pairs
    'BlueOdds',
    'RedOdds',
    'BlueCurrentLoseStreak',
    'RedCurrentLoseStreak',
    'BlueCurrentWinStreak',
    'RedCurrentWinStreak',
    'BlueLongestWinStreak',
    'RedLongestWinStreak',
    'BlueWins',
    'RedWins',
    'BlueLosses',
    'RedLosses',
    'BlueTotalRoundsFought',
    'RedTotalRoundsFought',
    'BlueTotalTitleBouts',
    'RedTotalTitleBouts',
    'BlueWinsByKO',
    'RedWinsByKO',
    'BlueWinsBySubmission',
    'RedWinsBySubmission',
    'BlueHeightCms',
    'RedHeightCms',
    'BlueReachCms',
    'RedReachCms',
    'BlueAge',
    'RedAge',
    'BlueAvgSigStrLanded',
    'RedAvgSigStrLanded',
    'BlueAvgSubAtt',
    'RedAvgSubAtt',
    'BlueAvgTDLanded',
    'RedAvgTDLanded',
    # Blue-side columns
    'BlueDraws',
    'BlueAvgSigStrPct',
    'BlueAvgTDPct',
    'BlueWinsByDecisionMajority',
    'BlueWinsByDecisionSplit',
    'BlueWinsByDecisionUnanimous',
    'BlueWinsByTKODoctorStoppage',
    # Red-side columns
    'RedDraws',
    'RedAvgSigStrPct',
    'RedAvgTDPct',
    'RedWinsByDecisionMajority',
    'RedWinsByDecisionSplit',
    'RedWinsByDecisionUnanimous',
    'RedWinsByTKODoctorStoppage',
]
ufc_master_ds = ufc_master_ds.drop(columns=var_drop)

In [None]:
# Additional columns removed from the table before modelling.
comm_drop = [
    'Date',
    'Location',
    'Country',
    'WeightClass',
    'Gender',
    'NumberOfRounds',
    'EmptyArena',
    'Finish',
    'FinishDetails',
    'FinishRound',
    'FinishRoundTime',
    'TotalFightTimeSecs',
    'BlueWeightLbs',
    'RedWeightLbs',
]
ufc_master_ds = ufc_master_ds.drop(columns=comm_drop)

In [None]:
# Normalise the variant 'Switch ' (trailing space) to 'Switch' in BlueStance.
# A single .loc[row_mask, column] assignment writes into the frame itself;
# the chained form frame[col].loc[mask] = value may update a temporary copy
# and never reaches the frame when pandas Copy-on-Write is active.
ufc_master_ds.loc[ufc_master_ds['BlueStance'] == 'Switch ', 'BlueStance'] = 'Switch'

In [None]:
# Names of the two per-corner stance columns (Blue first, then Red).
stance = [
    'BlueStance',
    'RedStance',
]

In [None]:
# Stance -> integer code; any value not listed (including missing) becomes 1.
stance_codes = {'Orthodox': 4, 'Southpaw': 3, 'Switch': 2}
for stance_col in stance:
    ufc_master_ds[stance_col] = [stance_codes.get(st, 1) for st in ufc_master_ds[stance_col]]

# -1 and 1 are used for Red and Blue so that neither corner reads as "better"
# than the other; anything else is encoded as 0.
# NOTE(review): the final cell drops the column slice 'BMatchWCRank':'BetterRank',
# which includes BetterRank, so this encoding does not reach the saved dataset.
better_rank_codes = {'Red': -1, 'Blue': 1}
ufc_master_ds['BetterRank'] = [better_rank_codes.get(rank, 0) for rank in ufc_master_ds['BetterRank']]

# Title-bout flag as 1/0 (1 only where the value compares equal to True).
ufc_master_ds['TitleBout'] = [int(tb == True) for tb in ufc_master_ds['TitleBout']]

In [None]:
# Blue stance code minus Red stance code, after which the two per-corner
# stance columns are removed.
ufc_master_ds['Stance_diff'] = ufc_master_ds['BlueStance'].sub(ufc_master_ds['RedStance'])
ufc_master_ds = ufc_master_ds.drop(columns=stance)

In [None]:
# Encode the label numerically (1 when Red won, 0 otherwise) so that
# correlations with it are easier to compute.
ufc_master_ds['Winner'] = (ufc_master_ds.Winner == 'Red').astype('int64')

In [None]:
# Final dataset: remove the contiguous run of columns from 'BMatchWCRank'
# through 'BetterRank' (label slices include both ends), then save the table.
cols_to_remove = ufc_master_ds.loc[:, 'BMatchWCRank':'BetterRank'].columns
ufc_master_ds = ufc_master_ds.drop(columns=cols_to_remove)
# NOTE(review): to_csv also writes the frame's index as the first CSV column;
# confirm the loading code expects that.
ufc_master_ds.to_csv("../data/processed/ufc-master.csv")

Script 2: Código de Entrenamiento


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import *
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
import sys, warnings, os

In [None]:
# Load the transformed table.
# 'Winner' has to stay a regular column: the next cell reads df.Winner as the
# label and then drops that column, neither of which works on an index level.
df = pd.read_csv("../data/processed/ufc-master.csv")
# DataFrame.to_csv stores the row index as a leading column by default; when
# that index has no name, read_csv labels it 'Unnamed: 0'. Discard any such
# column so a row counter is never used as a model feature.
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]

In [None]:
# Separate the target from the features: pop() returns the 'Winner' column
# and removes it from df in the same step.
label = df.pop('Winner')

In [None]:
# Integer-encode the remaining categorical columns.
cat_col = ['RedFighter', 'BlueFighter']
enc = LabelEncoder()
# The same encoder object is re-fitted on each column in turn, so the codes of
# one column are independent of the other and enc ends up fitted on the last one.
df = df.assign(**{name: enc.fit_transform(df[name]) for name in cat_col})

In [None]:
# Hold out 30% of the rows for validation; the seed is fixed so the split is
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    df,
    label,
    random_state=2,
    test_size=0.3,
)

In [None]:
# Mean-impute missing values. The imputer is fitted on the training split only
# and then applied to both splits.
impute = SimpleImputer(strategy='mean').fit(X_train)
X_train, X_valid = impute.transform(X_train), impute.transform(X_valid)

In [None]:
# Fit a random forest (350 trees, depth capped at 12, fixed seed) on the
# imputed training split; fit() returns the estimator itself.
RF_model_1 = RandomForestClassifier(
    random_state=2,
    max_depth=12,
    n_estimators=350,
).fit(X_train, y_train)
RF_model_1

In [None]:
# Save the trained model so it can be used in production.
filename = '../models/best_model.pkl'
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, instead of leaving an open handle behind.
with open(filename, 'wb') as model_file:
    pickle.dump(RF_model_1, model_file)

Script 3: Código de Validación

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import *
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
import sys, warnings, os

In [None]:
# Load the transformed table.
# 'Winner' has to stay a regular column: the next cell reads df.Winner as the
# label and then drops that column, neither of which works on an index level.
df = pd.read_csv("../data/processed/ufc-master.csv")
# DataFrame.to_csv stores the row index as a leading column by default; when
# that index has no name, read_csv labels it 'Unnamed: 0'. Discard any such
# column so a row counter is never used as a model feature.
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]


In [None]:
# Separate the target from the features: pop() returns the 'Winner' column
# and removes it from df in the same step.
label = df.pop('Winner')

In [None]:
# Integer-encode the remaining categorical columns.
cat_col = ['RedFighter', 'BlueFighter']
enc = LabelEncoder()
# The same encoder object is re-fitted on each column in turn, so the codes of
# one column are independent of the other and enc ends up fitted on the last one.
df = df.assign(**{name: enc.fit_transform(df[name]) for name in cat_col})

In [None]:
# Recreate the 70/30 split with the same fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    df,
    label,
    random_state=2,
    test_size=0.3,
)

In [None]:
# Mean-impute missing values. The imputer is fitted on the training split only
# and then applied to both splits.
impute = SimpleImputer(strategy='mean').fit(X_train)
X_train, X_valid = impute.transform(X_train), impute.transform(X_valid)

In [None]:
# Load the trained model.
# NOTE: unpickling can execute arbitrary code contained in the file, so only
# load model files that come from a trusted source.
filename = '../models/best_model.pkl'
# The context manager closes the file handle once the model has been read.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict with the trained model on the held-out validation split.
# 'Winner' was already removed from df when the label was extracted above, so
# dropping it a second time raised KeyError. df also still holds the
# un-imputed full table, whereas X_valid is the imputed hold-out matrix
# produced by the split/imputation cells of this script.
y_pred_test = model.predict(X_valid)