<a href="https://colab.research.google.com/github/Davilirio/Neural-Nets/blob/master/IBM_HypothesisTest_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import  Path
from google.colab import drive, files 
from scipy.stats import binom
from scipy.stats import norm
import math

# The bare 'seaborn-*' style names were renamed to 'seaborn-v0_8-*' in
# matplotlib 3.6 and the old aliases were removed in later releases, so use
# whichever spelling this matplotlib installation actually ships.
_dark_style = ('seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available
               else 'seaborn-dark')
plt.style.use(_dark_style)

In [None]:
# Mount the Colab `drive` at /content/my_drive; the dataset path below lives under it.
drive.mount('/content/my_drive')

In [None]:
# Folder that holds the UFC dataset inside the mounted drive.
data_path = Path('/content/my_drive').joinpath(
    'MyDrive', 'data_science_stuff', 'datasets', 'UFC'
)

In [None]:
# Show the entries of the dataset folder (sanity check that the drive is mounted).
os.listdir(data_path)

In [None]:
# Load the raw fight table from the dataset folder.
df = pd.read_csv(data_path / 'ufc-master.csv')

# Lower-case every column name, then give the fighter / round-count columns
# friendlier names. `rename` returns a NEW frame, so the result must be
# assigned back -- otherwise the renaming is silently thrown away.
df.columns = df.columns.str.lower()
df = df.rename(columns={'r_fighter': 'red_corner', 'b_fighter': 'blue_corner',
                        'no_of_rounds': 'nbr_rounds'})
df['date'] = pd.to_datetime(df['date'])

# 0/1 outcome indicators per corner. Each one is derived from `winner`
# directly, so a row whose result is neither 'Red' nor 'Blue' is not
# counted as a Blue win (which `1 - red_wins` would do).
df['red_wins'] = np.where(df['winner'] == 'Red', 1, 0)
df['blue_wins'] = np.where(df['winner'] == 'Blue', 1, 0)
df.head()

In [None]:
# Summary statistics of `df` as returned by DataFrame.describe(); shown as the cell output.
df_stats = df.describe()
df_stats

In [None]:
# Overlay the odds histograms of both corners on one axis.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Odd Distribuition', size=14)

# (column, extra histplot keyword arguments) for each layer, drawn in order.
odds_layers = [
    ('b_odds', {'color': 'Blue', 'label': 'Blue Corner Odds'}),
    ('r_odds', {'label': 'Red Corner Odds'}),
]
for odds_col, layer_kwargs in odds_layers:
    sns.histplot(df[odds_col], alpha=0.5, ax=ax, **layer_kwargs)

ax.set_xlabel('Odds', size=12)
ax.legend(fontsize=12)
fig.show()

In [None]:
def _wins_per_streak(fights, streak_col, win_col, out_col):
    """Count the rows won by one corner, grouped by that corner's win streak.

    Parameters
    ----------
    fights : pd.DataFrame
        Table holding `streak_col` and the 0/1 indicator `win_col`.
    streak_col : str
        Column with the corner's current win streak.
    win_col : str
        0/1 column flagging rows won by that corner.
    out_col : str
        Name given to the resulting count column.

    Returns
    -------
    pd.DataFrame
        Columns ``['winstreak', out_col]``, one row per streak value that has
        at least one win.
    """
    return (
        fights.loc[fights[win_col] == 1]
        .groupby(streak_col)[win_col]
        .count()
        .rename(out_col)
        .rename_axis('winstreak')
        .reset_index()
    )


df_wstk_b = _wins_per_streak(df, 'b_current_win_streak', 'blue_wins', 'bwins')
df_wstk_r = _wins_per_streak(df, 'r_current_win_streak', 'red_wins', 'rwins')

# Outer merge + zero fill: a streak value that only one corner ever won with
# is kept with a count of 0 for the other corner. A left merge would drop the
# red-only streaks and leave NaN totals for the blue-only ones, and those NaNs
# would propagate into the correlation / regression computed from this table.
df_wstk = (
    df_wstk_b.merge(df_wstk_r, how='outer', on='winstreak')
    .sort_values('winstreak', ignore_index=True)
)
df_wstk[['bwins', 'rwins']] = df_wstk[['bwins', 'rwins']].fillna(0)

# Turn the two counts into each corner's share of the wins at that streak value.
df_wstk['total_wins'] = df_wstk['bwins'] + df_wstk['rwins']
df_wstk['bwins'] = df_wstk['bwins'] / df_wstk['total_wins']
df_wstk['rwins'] = df_wstk['rwins'] / df_wstk['total_wins']

In [None]:
# Rows of `df` per blue win-streak value, regardless of outcome; this is the
# denominator of the blue win rate.
total_df = (
    df.groupby('b_current_win_streak')['blue_wins']
    .count()
    .rename('total_wins')
    .rename_axis('winstreak')
    .reset_index()
)

# Merge onto the two base columns only, so re-running this cell does not stack
# `total_wins_x` / `total_wins_y` columns and then fail on the lookup below.
df_wstk_b = df_wstk_b[['winstreak', 'bwins']].merge(total_df, how='left', on='winstreak')
df_wstk_b['b_pct_win'] = df_wstk_b['bwins'] / df_wstk_b['total_wins'] * 100

In [None]:
# Rows of `df` per red win-streak value, regardless of outcome; this is the
# denominator of the red win rate.
total_df = (
    df.groupby('r_current_win_streak')['red_wins']
    .count()
    .rename('total_wins')
    .rename_axis('winstreak')
    .reset_index()
)

# Merge onto the two base columns only, so re-running this cell does not stack
# `total_wins_x` / `total_wins_y` columns and then fail on the lookup below.
df_wstk_r = df_wstk_r[['winstreak', 'rwins']].merge(total_df, how='left', on='winstreak')
df_wstk_r['r_pct_win'] = df_wstk_r['rwins'] / df_wstk_r['total_wins'] * 100

In [None]:
# Display the per-winstreak table (`bwins` / `rwins` hold each corner's share of `total_wins`).
df_wstk

In [None]:
# Correlation coefficient between the streak value and the red share of wins:
# take the off-diagonal entry of the 2x2 matrix returned by np.corrcoef.
corr_matrix = np.corrcoef(df_wstk['winstreak'], df_wstk['rwins'])
corr = corr_matrix[0, 1]

In [None]:
# Linear regression of the red share of wins on the streak value; the result object is the cell output.
scipy.stats.linregress(
    x=df_wstk['winstreak'],
    y=df_wstk['rwins'],
)

In [None]:
# Fail the cell unless the regression p-value is below 0.05.
streak_fit = scipy.stats.linregress(x=df_wstk['winstreak'], y=df_wstk['rwins'])
assert streak_fit.pvalue < 0.05

In [None]:
# One-sided test of H1: corr(winstreak, red share of wins) > 0 at the 5% level.
#
# A correlation coefficient lies in [-1, 1], so comparing it directly with a
# critical value of about 1.645 can never succeed. Compare the test statistic
# t = r * sqrt((n - 2) / (1 - r^2)) with the Student-t critical value instead.
# This uses `corr` from the cell above; `red_corr` is only defined much further
# down, so referencing it here fails on a fresh kernel.
n_streaks = len(df_wstk)
dof = n_streaks - 2
assert dof > 0, 'need at least three win-streak values to test the correlation'

ppf = scipy.stats.t.ppf(0.95, dof)
t_stat = corr * math.sqrt(dof / (1 - corr ** 2))

assert t_stat > ppf

In [None]:
# Scatter of the red share of wins against the streak value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_wstk, x='winstreak', y='rwins', ax=ax)
ax.set(ylabel='Red Win Prob.', title='Winstreak x Win Prob')

fig.show()

### Exploratory Data Analysis of the UFC Dataset

### We will check whether fighting from the red corner makes a fighter more likely to win (i.e. the favorite), whether win streaks directly affect win rates, and whether the total number of rounds fought affects the win ratio as well

In [None]:
# Use a red/blue palette from here on, then draw the density of each `winner` value.
corner_palette = sns.color_palette(['red', 'blue'])
sns.set_palette(corner_palette)
sns.histplot(data=df, x='winner', hue='winner', stat='density')
plt.title('Winner per Corner', size=14);

In [None]:
# Row counts: all rows, rows won by Red, rows won by Blue.
df_size = len(df)
red_wins = int((df['winner'] == 'Red').sum())
blue_wins = int((df['winner'] == 'Blue').sum())

In [None]:
# Display the number of rows in `df` (computed as len(df) in the previous cell).
df_size

In [None]:
# Share of Red wins among the rows won by either Red or Blue.
decided_fights = red_wins + blue_wins
prob_red_favorite = red_wins / decided_fights
prob_red_favorite

#### We suppose that red is the favorite, but to check this we will test the hypothesis that the probability of winning is 50/50, knowing that we have a significant sample of the population.
- H0: The corner does not matter (probability of red winning == 0.5)
- H1: The corner does matter and red is the favorite (probability of red winning > 0.5)

In [None]:
# One-sided binomial test at the 5% level under H0: P(red wins) = 0.5.
# binom.ppf(0.95, n, p) is the smallest k with P(X <= k) >= 0.95, so
# P(X >= k + 1) <= 0.05: `wins_to_favor` is the smallest win count that
# rejects H0. Reaching it is already enough, hence `>=` rather than `>`.
wins_to_favor = binom.ppf(0.95, df_size, 0.5) + 1
assert red_wins >= wins_to_favor

In [None]:
# p-value of the one-sided test: probability, under H0 (p = 0.5), of seeing
# at least `red_wins` wins. That is P(X >= red_wins) = sf(red_wins - 1);
# `1 - cdf(red_wins)` would leave out the observed count itself and also loses
# precision when the tail probability is tiny.
p_value_red = binom.sf(red_wins - 1, df_size, 0.5)
max_p = 0.05

# Report the comparison that actually holds instead of always claiming "smaller".
comparison = 'smaller' if p_value_red < max_p else 'not smaller'
print(f'We have that the probability of that amount of wins happening by chance '
      f'is {p_value_red}, which is {comparison} than our significance level of {max_p}')

In [None]:
# Blue corner: win percentage against streak value, with a fitted regression line.
blue_ax = sns.regplot(x='winstreak', y='b_pct_win', data=df_wstk_b, color='blue')
blue_ax.set_ylabel('Winrate', size=12)
blue_ax.set_title('Blue Winstreak - Winrate', size=14);

In [None]:
# Red corner: win percentage against streak value, with a fitted regression line.
red_ax = sns.regplot(x='winstreak', y='r_pct_win', data=df_wstk_r, color='red')
red_ax.set_ylabel('Winrate', size=12)
red_ax.set_title('Red Winstreak - Winrate', size=14);

In [None]:
# Per-weight-class mean of every numeric column. `numeric_only=True` is
# required on pandas >= 2.0, where averaging the text columns raises instead
# of silently dropping them.
weight_class_df = df.groupby('weight_class').mean(numeric_only=True)
weight_class_df.head()

In [None]:
# Mean fight time against mean red-corner weight, one point per weight class.
weight_ax = sns.regplot(
    data=weight_class_df,
    x='r_weight_lbs',
    y='total_fight_time_secs',
    color='purple',
)
weight_ax.set_xlabel('Weight')
weight_ax.set_ylabel('Mean Fight Time (s)')
weight_ax.set_title('Fight Time per Weight Class');

In [None]:
# Overlaid density histograms of the average significant strikes landed per corner.
strike_layers = (
    ('b_avg_sig_str_landed', 'blue', 'Blue Corner'),
    ('r_avg_sig_str_landed', 'red', 'Red Corner'),
)
for strike_col, corner_color, corner_label in strike_layers:
    sns.histplot(df[strike_col], color=corner_color, bins=15,
                 alpha=0.4, label=corner_label, stat='density')

plt.xlabel('Significant Strikes Landed')
plt.title('Distribuition of Significant Strikes', size=14)
plt.legend()
plt.show()

In [None]:
# Draw a fixed-seed sample so the pair plot below is reproducible across
# kernel restarts; cap the size so a table with fewer than 450 rows does not raise.
df_sample = df.sample(min(450, len(df)), random_state=42)
a = df_sample[['r_avg_sig_str_landed', 'b_avg_sig_str_landed', 'weight_class', 'winner']]

In [None]:
# Switch seaborn to its 'paper' context, then draw pairwise plots of the sampled columns coloured by winner.
sns.set_context('paper')
sns.pairplot(data=a, hue='winner')

In [None]:
# Red win rate in the stance subset vs. the overall red win rate.
# The second condition must index `df`: the bare list `['b_stance']` compared
# with a string is always False, which silently dropped that condition.
# NOTE(review): the conditions are OR-ed as in the original; if the intent is
# "Southpaw red vs. Orthodox blue", this should be `&` -- confirm.
red_stance_mask = (df['r_stance'] == 'Southpaw') | (df['b_stance'] == 'Orthodox')
df.loc[red_stance_mask, 'red_wins'].mean(), df['red_wins'].mean()

In [None]:
# Blue win rate in the stance subset vs. the overall blue win rate.
# The second condition must index `df`: the bare list `['r_stance']` compared
# with a string is always False, which silently dropped that condition.
# NOTE(review): the conditions are OR-ed as in the original; if the intent is
# "Southpaw blue vs. Orthodox red", this should be `&` -- confirm.
blue_stance_mask = (df['b_stance'] == 'Southpaw') | (df['r_stance'] == 'Orthodox')
df.loc[blue_stance_mask, 'blue_wins'].mean(), df['blue_wins'].mean()

#### Do more rounds fought lead to a lessened win chance?
- H0 - correlation between total rounds fought and wins == 0
- H1 - correlation between total rounds fought and wins < 0

In [None]:
# Slope of the regression of the red win indicator on red's total rounds fought.
rounds_fit = scipy.stats.linregress(x=df['r_total_rounds_fought'], y=df['red_wins'])
rounds_fit.slope

In [None]:
# p-value of the same regression; the cell fails unless it is below 0.05.
rounds_regression = scipy.stats.linregress(
    x=df['r_total_rounds_fought'],
    y=df['red_wins'],
)
p_v = rounds_regression.pvalue

assert p_v < 0.05

In [None]:
# One-sided test of H1: corr(total rounds fought, red win) < 0 at the 5% level.
red_corr = np.corrcoef(df['r_total_rounds_fought'], df['red_wins'])[0, 1]
ppf = -norm.ppf(0.95)

# A correlation coefficient lies in [-1, 1] and can never fall below the
# critical value of about -1.645, so compare the test statistic
# r * sqrt((n - 2) / (1 - r^2)) with it instead of the raw coefficient.
# NOTE(review): using the normal critical value assumes the sample is large
# enough for the t statistic to be approximately standard normal -- confirm,
# otherwise switch to scipy.stats.t.ppf(0.05, n - 2).
n_fights = len(df)
red_z = red_corr * math.sqrt((n_fights - 2) / (1 - red_corr ** 2))

assert red_z < ppf

In [None]:
# p-value for H1: correlation < 0, i.e. the LOWER tail of the test statistic
# r * sqrt((n - 2) / (1 - r^2)). The upper tail of the raw coefficient is not
# a p-value for this hypothesis.
# NOTE(review): normal approximation of the t statistic -- assumes a large sample; confirm.
red_z_stat = red_corr * math.sqrt((len(df) - 2) / (1 - red_corr ** 2))
norm.cdf(red_z_stat)

In [None]:
# Correlation coefficient between blue's total rounds fought and the blue win
# indicator (off-diagonal entry of the 2x2 matrix).
blue_corr_matrix = np.corrcoef(df['b_total_rounds_fought'], df['blue_wins'])
blue_corr_matrix[0, 1]

In [None]:
# Kernel density estimate of the current win streak for both corners on one axis.
fig, ax = plt.subplots(figsize=(6, 6))

# The plot shows both corners, so the title must not name only the blue one.
ax.set_title('Current Winstreaks by Corner')

# `fill=True` replaces the deprecated `shade=True`; the notebook already relies
# on seaborn >= 0.11 (sns.histplot), where `fill` is available.
sns.kdeplot(ax=ax, data=df['b_current_win_streak'], color='blue',
            label='Blue Corner', fill=True)
sns.kdeplot(ax=ax, data=df['r_current_win_streak'], color='red',
            label='Red Corner', fill=True)
ax.set_xlabel('Current winstreak')
ax.legend()

fig.show()