Basketball Birthday Paradox

In [111]:
#Birthday Paradox Function
from math import perm
def birthday_probability(n: int) -> float:
    return (1 - ((perm(365, n)) / 365 ** n))

In [112]:
birthday_probability(30)

0.7063162427192686

In [113]:
birthday_probability(10)

0.11694817771107768

In [114]:
birthday_probability(15)

0.25290131976368635

In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [116]:
#NBA Paradox Analysis
df = pd.read_csv('../Data/basketball_players.csv')
df.drop(columns = ['year_start', 'year_end', 'height', 'weight'], inplace = True)

In [117]:
df.head()

Unnamed: 0,name,position,birth_date,college
0,Alaa Abdelnaby,F-C,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,C-F,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,C,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,G,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,F,"November 3, 1974",San Jose State University


In [118]:
#Convert birth_date to datetime type
df['birth_date'] = pd.to_datetime(df['birth_date'], format = '%B %d, %Y')

In [119]:
df.loc[df['college'].isnull()]

Unnamed: 0,name,position,birth_date,college
9,Alex Abrines,G-F,1993-08-01,
32,Alexis Ajinca,C,1988-05-06,
38,Furkan Aldemir,F-C,1991-08-09,
74,David Andersen,C,1980-06-23,
100,Martynas Andriuskevicius,C,1986-03-12,
...,...,...,...,...
4529,Sun Yue,G,1985-11-06,
4540,Wang Zhizhi,C,1977-07-08,
4544,Paul Zipser,G-F,1994-02-18,
4545,Ante Zizic,F-C,1997-01-04,


In [120]:
df['college'].fillna('None')

0                             Duke University
1                       Iowa State University
2       University of California, Los Angeles
3                  Louisiana State University
4                   San Jose State University
                        ...                  
4545                                     None
4546                    Kent State University
4547                      Duquesne University
4548                                     None
4549             George Washington University
Name: college, Length: 4550, dtype: object

In [121]:
df.head()

Unnamed: 0,name,position,birth_date,college
0,Alaa Abdelnaby,F-C,1968-06-24,Duke University
1,Zaid Abdul-Aziz,C-F,1946-04-07,Iowa State University
2,Kareem Abdul-Jabbar,C,1947-04-16,"University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,G,1969-03-09,Louisiana State University
4,Tariq Abdul-Wahad,F,1974-11-03,San Jose State University


In [122]:
df.groupby('college')['college'].value_counts().sort_values(ascending = False)

college
University of Kentucky                   99
University of California, Los Angeles    91
University of North Carolina             87
University of Kansas                     72
Duke University                          71
                                         ..
Northwood Institute                       1
Ohio Wesleyan University                  1
Okaloosa-Walton Community College         1
Alabama - Huntsville                      1
Youngstown State University               1
Name: count, Length: 473, dtype: int64

In [123]:
#Create birth date column
df['birth_month_day'] = df['birth_date'].dt.strftime('%m-%d')

In [124]:
df.head()

Unnamed: 0,name,position,birth_date,college,birth_month_day
0,Alaa Abdelnaby,F-C,1968-06-24,Duke University,06-24
1,Zaid Abdul-Aziz,C-F,1946-04-07,Iowa State University,04-07
2,Kareem Abdul-Jabbar,C,1947-04-16,"University of California, Los Angeles",04-16
3,Mahmoud Abdul-Rauf,G,1969-03-09,Louisiana State University,03-09
4,Tariq Abdul-Wahad,F,1974-11-03,San Jose State University,11-03


In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4550 entries, 0 to 4549
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   name             4550 non-null   object        
 1   position         4549 non-null   object        
 2   birth_date       4519 non-null   datetime64[ns]
 3   college          4248 non-null   object        
 4   birth_month_day  4519 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 177.9+ KB


In [126]:
#Separate Analysis
from itertools import combinations

In [127]:
names = ['John', 'Mary', 'Rob', 'Susan', 'Violet']
birthdays = ['March 5th', 'Sept 20th', 'March 5th', 'July 20th', 'Sept 20th']

In [128]:
list(combinations(names, 2))

[('John', 'Mary'),
 ('John', 'Rob'),
 ('John', 'Susan'),
 ('John', 'Violet'),
 ('Mary', 'Rob'),
 ('Mary', 'Susan'),
 ('Mary', 'Violet'),
 ('Rob', 'Susan'),
 ('Rob', 'Violet'),
 ('Susan', 'Violet')]

In [129]:
list(combinations(birthdays, 2))

[('March 5th', 'Sept 20th'),
 ('March 5th', 'March 5th'),
 ('March 5th', 'July 20th'),
 ('March 5th', 'Sept 20th'),
 ('Sept 20th', 'March 5th'),
 ('Sept 20th', 'July 20th'),
 ('Sept 20th', 'Sept 20th'),
 ('March 5th', 'July 20th'),
 ('March 5th', 'Sept 20th'),
 ('July 20th', 'Sept 20th')]

In [130]:
names_df = pd.DataFrame(combinations(names, 2), columns = ['Person 1', 'Person 2'])
names_df

Unnamed: 0,Person 1,Person 2
0,John,Mary
1,John,Rob
2,John,Susan
3,John,Violet
4,Mary,Rob
5,Mary,Susan
6,Mary,Violet
7,Rob,Susan
8,Rob,Violet
9,Susan,Violet


In [131]:
birthdays_df = pd.DataFrame(combinations(birthdays, 2), columns = ['Birthday 1', 'Birthday 2'])
birthdays_df

Unnamed: 0,Birthday 1,Birthday 2
0,March 5th,Sept 20th
1,March 5th,March 5th
2,March 5th,July 20th
3,March 5th,Sept 20th
4,Sept 20th,March 5th
5,Sept 20th,July 20th
6,Sept 20th,Sept 20th
7,March 5th,July 20th
8,March 5th,Sept 20th
9,July 20th,Sept 20th


In [132]:
#Combine the dataframes
combined_df = pd.concat([names_df, birthdays_df], axis = 1)
combined_df

Unnamed: 0,Person 1,Person 2,Birthday 1,Birthday 2
0,John,Mary,March 5th,Sept 20th
1,John,Rob,March 5th,March 5th
2,John,Susan,March 5th,July 20th
3,John,Violet,March 5th,Sept 20th
4,Mary,Rob,Sept 20th,March 5th
5,Mary,Susan,Sept 20th,July 20th
6,Mary,Violet,Sept 20th,Sept 20th
7,Rob,Susan,March 5th,July 20th
8,Rob,Violet,March 5th,Sept 20th
9,Susan,Violet,July 20th,Sept 20th


In [133]:
combined_df['Birthday 1'] == combined_df['Birthday 2']

0    False
1     True
2    False
3    False
4    False
5    False
6     True
7    False
8    False
9    False
dtype: bool

In [134]:
df.groupby('college')['college'].value_counts().sort_values(ascending = False).head(10)

college
University of Kentucky                   99
University of California, Los Angeles    91
University of North Carolina             87
University of Kansas                     72
Duke University                          71
Indiana University                       60
University of Notre Dame                 56
University of Louisville                 55
St. John's University                    51
University of Arizona                    51
Name: count, dtype: int64

In [146]:
#Find same birthday at UArizona
arizona_df = df.loc[df['college'] == 'University of Arizona']
arizona_df

Unnamed: 0,name,position,birth_date,college,birth_month_day
18,Hassan Adams,G,1984-06-20,University of Arizona,06-20
53,Kadeem Allen,G,1993-01-15,University of Arizona,01-15
119,Gilbert Arenas,G,1982-01-06,University of Arizona,01-06
245,Jerryd Bayless,G,1988-08-20,University of Arizona,08-20
314,Mike Bibby,G,1978-05-13,University of Arizona,05-13
351,Leon Blevins,G,1926-06-25,University of Arizona,06-25
435,A.J. Bramlett,C,1977-01-10,University of Arizona,01-10
540,Chase Budinger,F,1988-05-22,University of Arizona,05-22
542,Jud Buechler,F-G,1968-06-19,University of Arizona,06-19
775,Anthony Cook,F-C,1967-03-19,University of Arizona,03-19


In [147]:
arizona_names_df = pd.DataFrame(combinations(arizona_df['name'], 2), columns = ['Player 1', 'Player 2'])
arizona_names_df.head()

Unnamed: 0,Player 1,Player 2
0,Hassan Adams,Kadeem Allen
1,Hassan Adams,Gilbert Arenas
2,Hassan Adams,Jerryd Bayless
3,Hassan Adams,Mike Bibby
4,Hassan Adams,Leon Blevins


In [148]:
arizona_bday_df = pd.DataFrame(combinations(arizona_df['birth_month_day'], 2), columns = ['Day 1', 'Day 2'])
arizona_bday_df.head()

Unnamed: 0,Day 1,Day 2
0,06-20,01-15
1,06-20,01-06
2,06-20,08-20
3,06-20,05-13
4,06-20,06-25


In [149]:
arizona_combined_df = pd.merge(arizona_names_df, arizona_bday_df, left_index = True, right_index = True, how = 'inner')

In [150]:
arizona_combined_df.head()

Unnamed: 0,Player 1,Player 2,Day 1,Day 2
0,Hassan Adams,Kadeem Allen,06-20,01-15
1,Hassan Adams,Gilbert Arenas,06-20,01-06
2,Hassan Adams,Jerryd Bayless,06-20,08-20
3,Hassan Adams,Mike Bibby,06-20,05-13
4,Hassan Adams,Leon Blevins,06-20,06-25


In [151]:
arizona_combined_df.loc[arizona_combined_df['Day 1'] == arizona_combined_df['Day 2']]

Unnamed: 0,Player 1,Player 2,Day 1,Day 2
247,Leon Blevins,Michael Dickerson,06-25,06-25
350,Chase Budinger,Lauri Markkanen,05-22,05-22
632,Bob Elliott,Mustafa Shakur,08-18,08-18
949,Richard Jefferson,Loren Woods,06-21,06-21
1230,Ed Stokes,Damon Stoudamire,09-03,09-03
