In [1]:
import numpy as np
import pandas as pd
import random
from faker import Faker
import datetime

In [2]:
# Seed every random source this notebook draws from -- Faker, the stdlib
# `random` module and NumPy's legacy global generator -- so that
# Restart Kernel -> Run All regenerates the same synthetic values.
# (Timestamps are generated relative to 'now', so they still move with the clock.)
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Create a Faker instance to generate user names, birthdays and timestamps
fake = Faker()

# --- Sampling pools --------------------------------------------------------
# Each pool is declared once as {choice: probability}; the parallel
# `*_choices` / `*_weights` lists used by the sampling code are derived from
# it, so a choice and its weight can never drift out of alignment.

# school: possible schools in the data set and the chance of each being picked
_school_distribution = {
    'UST': 0.4,
    'AdU': 0.1,
    'FEU': 0.1,
    'TIP': 0.1,
    'OTHER': 0.3,
}
school_choices = list(_school_distribution)
school_weights = list(_school_distribution.values())

# employment: possible jobs and the chance of each being picked
_employment_distribution = {
    'BPO': 0.4,
    'teacher': 0.1,
    'software engineer': 0.1,
    'influencer': 0.1,
    'management': 0.1,
    'doctor': 0.05,
    'nurse': 0.05,
    'lawyer': 0.1,
}
employment_choices = list(_employment_distribution)
employment_weights = list(_employment_distribution.values())

# gender: both values equally likely
_gender_distribution = {'M': 0.50, 'F': 0.50}
gender_choices = list(_gender_distribution)
gender_weights = list(_gender_distribution.values())

# age buckets (currently disabled)
# age_choices = ['20-25', '15-20', '25-30', '40-above']
# age_weights = [0.30, 0.15, 0.30, 0.25]

# number of rows to generate
num_rows = 1000

# birthday
# No flat `birthday` list is built here. The unconditioned
# fake.date_of_birth() list that used to live at this point was never read:
# the DataFrame takes its 'Birthday' column from `birthdays`, which is
# generated further down with an age range that depends on the school/job
# category, and that loop rebinds the name `birthday` to a single date anyway.

# user IDs
# each ID is a random integer generated between 1000 and 9999 (inclusive);
# IDs are drawn independently, so the same ID can appear on more than one row
user_ids = [random.randint(1000, 9999) for _ in range(num_rows)]  # `_` is just a throwaway loop variable

# names are generated with the Faker library
first_names = [fake.first_name() for _ in range(num_rows)]
last_names = [fake.last_name() for _ in range(num_rows)]

# timestamps
# Session lengths are expressed in minutes. As with the school/employment
# pools, every allowed length has a relative weight (these sum to 100, so
# they read as percentages).
session_lengths = [60, 90, 120, 180, 240, 300, 360]
session_length_weights = [9, 15, 15, 20, 20, 15, 6]
durations = random.choices(session_lengths, weights=session_length_weights, k=num_rows)

# each row gets a random start time between one day ago and now
start_times = []
for _ in range(num_rows):
    start_times.append(fake.date_time_between(start_date='-1d', end_date='now'))

# the end time is the start time plus that row's duration
end_times = [
    began + datetime.timedelta(minutes=minutes)
    for began, minutes in zip(start_times, durations)
]

# Generate extension durations
def _draw_extension(session_minutes):
    """Pick an extension length, in minutes, for one session.

    The extension is a whole number of 30-minute slots, from one slot up to
    the length of the session itself. A k-slot extension is weighted 0.3 / k,
    so shorter extensions are more likely than longer ones.
    """
    slot_count = session_minutes // 30
    slot_options = range(1, slot_count + 1)
    slot_weights = [0.3 / slots for slots in slot_options]
    picked_slots = random.choices(slot_options, weights=slot_weights, k=1)[0]
    return picked_slots * 30


# one extension per row, in the same order as `durations`
extension_durations = [_draw_extension(minutes) for minutes in durations]

# null values
# Each row independently has a 5% chance of being "dirtied". A dirtied row
# loses its user ID, first name, last name and extension duration together
# (the four fields are blanked on the same rows, not independently).
null_chance = 0.05
nullable_columns = (user_ids, first_names, last_names, extension_durations)
for row in range(num_rows):
    if random.random() >= null_chance:
        continue  # row stays intact
    for column in nullable_columns:
        column[row] = None

# Every row is first labelled as 'school' (60%) or 'job' (40%).
employment_data = np.random.choice(['school', 'job'], size=num_rows, p=[0.6, 0.4])

# The label decides which pool the concrete 'Employment' value is drawn from:
# a school name for 'school' rows, a job title for every other row.
employment_list = [
    np.random.choice(school_choices, p=school_weights)
    if label == 'school'
    else np.random.choice(employment_choices, p=employment_weights)
    for label in employment_data
]

# Birthdays are tied to the school/job label of each row:
# 'school' rows are aged 15 to 24, all other rows are aged 25 to 50.
birthdays = [
    fake.date_of_birth(minimum_age=15, maximum_age=24)
    if label == 'school'
    else fake.date_of_birth(minimum_age=25, maximum_age=50)
    for label in employment_data
]

# Create a DataFrame

In [3]:
# Gender is the only column drawn at assembly time: one value per row.
gender_data = np.random.choice(gender_choices, size=num_rows, p=gender_weights)

# Map each output column name to the list generated for it above.
columns = {
    'User ID': user_ids,
    'First Name': first_names,
    'Last Name': last_names,
    'Employment': employment_list,
    'Gender': gender_data,
    'Birthday': birthdays,
    'Time In': start_times,
    'Time Out': end_times,
    'Extension Duration': extension_durations,
}
data = pd.DataFrame(columns)

# Summary statistics; the `count` row also shows how many values were nulled.
summary = data.describe()
print(summary)

           User ID                        Time In  \
count   963.000000                           1000   
mean   5635.925234  2023-10-04 02:10:51.184999936   
min    1000.000000            2023-10-03 14:10:54   
25%    3316.500000  2023-10-03 20:08:07.750000128   
50%    5791.000000            2023-10-04 02:17:29   
75%    7918.500000  2023-10-04 07:50:38.249999872   
max    9994.000000            2023-10-04 14:09:24   
std    2589.507498                            NaN   

                            Time Out  Extension Duration  
count                           1000          963.000000  
mean   2023-10-04 05:19:18.785000192           75.233645  
min              2023-10-03 15:33:37           30.000000  
25%              2023-10-03 23:30:22           30.000000  
50%       2023-10-04 05:33:47.500000           60.000000  
75%    2023-10-04 11:00:58.750000128           90.000000  
max              2023-10-04 20:03:46          360.000000  
std                              NaN           60.