In [24]:
#importing the libraries 
import pandas as pd
import plotly.express as px
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random
from scipy.stats import t
import os 
import collections


In [2]:
seed = 2004

In [3]:
def uniform_distribution_simulator():
    student_counter = 0  # the number of students in the year
    collision = 0 # occurrence of collisions
    days_of_the_year_uniform = [0] * 366
    while collision==0:
        birth_day = random.randint(0, 365)
        if days_of_the_year_uniform[birth_day] == 0:  # if there are no students born in that day of the year
            days_of_the_year_uniform[birth_day] = 1  # sets that day to one since a student is born in that day of the year
            student_counter += 1
        elif days_of_the_year_uniform[birth_day] == 1:  # we exit the while loop the moment we find a collision
            collision=1

    return student_counter  # returns the number of student it took until a collision happens

In [4]:
# adding the realistic distribution file

In [5]:
path_to_file = './US_births_1994-2003_CDC_NCHS.csv'
if os.path.isfile(path_to_file):
    column_names = ['year', 'month', 'date_of_month', 'day_of_week,births', 'births'] # adding the column names
    df = pd.read_csv(path_to_file, header=None, sep = ',', skiprows=1, index_col=None) # adding the file
    df.columns = column_names
# grouping the columns 'month', 'date_of_month' to provide the average number of people born on a specific day of the year
    average_births  = df.copy()
    month_day_group = average_births.groupby(['month', 'date_of_month'], as_index=False).mean()
else:
    print('please add the file for the realistic distribution of birthdays ')

In [6]:
# computing the statistics of the birthday data
sum_births = month_day_group['births'].sum()
month_day_group["prob"] = month_day_group['births'] / sum_births # finding the probabilty of each day
month_day_group["cdf"] = month_day_group["prob"].cumsum() # creading the cumulative distribution function (cdf)

In [19]:
def realistic_distribution_simulator():
    # Create a list of 366 zeros to represent the days of the year
    days_of_the_year_realistic = [0] * 366
    number_of_students = 0  # number of students it took until a collision happened
    while True:
        # we randomly find days of the year bassed on the value for cdf 
        randomly_generated_day = random.choices(month_day_group.index, weights=month_day_group['cdf'])[0]
    # If there are no students born on this day, increment the number of students and set the corresponding element in the list to 1
        if days_of_the_year_realistic[randomly_generated_day] == 0:
            days_of_the_year_realistic[randomly_generated_day] = 1
            number_of_students += 1
        # If there is already a student born on this day, set the collision flag to 1 to exit the while loop
        elif days_of_the_year_realistic[randomly_generated_day] == 1:
            break # collision has happened 
    # Return the number of students it took until a collision occurred
    return number_of_students

# performing the simulation

In [21]:
#degree of freedom = n-1 in here it's 1999
np.random.seed(seed)
random.seed(seed) 
uniform_distro_conflicts=[]
realistic_distro_conflicts=[]
for i in range(0,1000):
    uniform_distro_conflicts.append(uniform_distribution_simulator())
    realistic_distro_conflicts.append(realistic_distribution_simulator())
#we get the average uniform conflicts on the left and the average real-life conflicts on the right
sum_of_conflicts={'Uniform distribution':sum(uniform_distro_conflicts)/len(uniform_distro_conflicts),'Realistic distribution':sum(realistic_distro_conflicts)/len(realistic_distro_conflicts)}


In [22]:
sum_of_conflicts 

{'Uniform distribution': 24.269, 'Realistic distribution': 21.042}

In [None]:
# here we can see the difference between the theoratical version oand the realistic distribution 

# evaluating the probabilites

In [23]:
#we have to consider that the number of students should be higher than 2 (so the conflict happens)
#and the number of trials should be higher than 32 (based on the centeral limit theorem)

In [None]:
num_students_list = [i for i in range(200)]
uniform_prob = {}
realistic_prob = {}
num_simulations = 40 # 40 is the size of the class
for number_of_sutdents in num_students_list:
    
    class_uniform = np.zeros(num_simulations)
    class_real_life = np.zeros(num_simulations)
    for i in range(num_simulations):  
#         uniform_bdays=[random.randint(0,365) for _ in range(0,number_of_sutdents)]
        #simulate the unifrom distro 
        uniform_bdays=random.sample(range(366), number_of_sutdents)
        if len(uniform_bdays) != len(set(uniform_bdays)):
    
            class_uniform[i] = 1
        
        # Simulate the real-life distribution
        real_life_bdays=[realistic_distribution_simulator() for _ in range(number_of_sutdents)] 
        if len(real_life_bdays) != len(set(real_life_bdays)): # checking for collision
            class_real_life[i] = 1

    uniform_prob[number_of_sutdents] = class_uniform.mean()
    realistic_prob[number_of_sutdents] = class_real_life.mean()


In [None]:
realistic_prob