In [1]:
# This notebook contains the code used to generate the students dataset. If needed, it can easily be translated into a script that takes parameters to generate different versions of the dataset.

In [2]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from random import randint

In [3]:
# Sampling weights for the generator: each dict maps a label to the probability
# of drawing it, and the values of each dict add up to 1.

# Four class years, equally likely.
# NOTE(review): 'Sophmore' is misspelled ('Sophomore') but is kept as-is because it is the
# label written into the generated data — confirm with consumers before renaming.
year_dict = dict.fromkeys(('Freshman', 'Sophmore', 'Junior', 'Senior'), 0.25)

# Ten schools, equally likely. Insertion order is the order the generator samples over.
school_dict = dict.fromkeys(
    (
        'Edward J. Bloustein School of Planning and Public Policy',
        'Graduate School of Applied and Professional Psychology',
        'Graduate School of Education',
        'Mason Gross School of the Arts',
        'Rutgers Business School–Newark and New Brunswick',
        'School of Arts and Sciences',
        'School of Communication and Information',
        'School of Engineering',
        'School of Environmental and Biological Sciences',
        'School of Graduate Studies',
    ),
    0.1,
)

# Empty placeholder; nothing in the visible cells reads it.
degree_dict = dict()

In [4]:
def generate_student_dataset(number_of_students, year_dict, school_dict, accessibility_needs_frac, low_income_frac, seed=4833):
    """Generate a reproducible synthetic student roster.

    Parameters
    ----------
    number_of_students : int
        Number of rows (students) to generate.
    year_dict : dict
        Maps each class-year label to the probability of drawing it. The values
        are forwarded to NumPy's ``choice`` as ``p`` (NumPy requires them to sum to 1).
    school_dict : dict
        Maps each school name to the probability of drawing it (same rules as ``year_dict``).
    accessibility_needs_frac : float
        Fraction of rows that get ``accessibility_need = 1``.
    low_income_frac : float
        Fraction of rows that get ``low_income_status = 1``.
    seed : int or None, default 4833
        Seed applied to Faker, to the RUID sampler and to the NumPy generator, so
        two calls with the same arguments return identical frames. Pass ``None``
        to draw fresh entropy instead.

    Returns
    -------
    pandas.DataFrame
        One row per student with the columns ``name``, ``sex``, ``RUID``,
        ``date_of_birth``, ``student_year``, ``student_school``,
        ``accessibility_need`` and ``low_income_status``.

    Raises
    ------
    ValueError
        If ``number_of_students`` is negative or exceeds the number of 9-digit RUIDs.
    """
    fake = Faker()
    Faker.seed(seed)
    rng = np.random.default_rng(seed)

    # Sampling without replacement guarantees unique 9-digit RUIDs; independent
    # randint draws can collide.
    ruids = random.Random(seed).sample(range(10**8, 10**9), number_of_students)
    years = rng.choice(list(year_dict.keys()), size=number_of_students, p=list(year_dict.values()))
    schools = rng.choice(list(school_dict.keys()), size=number_of_students, p=list(school_dict.values()))

    # Build flat records directly (profile keys first) instead of nesting the
    # profile dict and expanding it afterwards with the slow .apply(pd.Series).
    records = [
        {
            **fake.profile(fields=['name', 'sex']),
            'RUID': ruid,
            'date_of_birth': fake.date_of_birth(minimum_age=17, maximum_age=22),
            'student_year': year,
            'student_school': school,
        }
        for ruid, year, school in zip(ruids, years, schools)
    ]
    student_df = pd.DataFrame(
        records,
        columns=['name', 'sex', 'RUID', 'date_of_birth', 'student_year', 'student_school'],
    )

    # Flag a random subset of rows for each indicator. Every subset gets its own
    # derived seed: reusing one seed would make the smaller subset a slice of the
    # larger one instead of an independent draw.
    for column, frac in (('accessibility_need', accessibility_needs_frac),
                         ('low_income_status', low_income_frac)):
        student_df[column] = 0
        flagged = student_df.sample(frac=frac, random_state=int(rng.integers(2**32 - 1))).index
        student_df.loc[flagged, column] = 1
    return student_df

In [5]:
# Build the 5,000-student dummy dataset: 5% of rows flagged with an
# accessibility need and 1% flagged as low income.
dummy_data = generate_student_dataset(
    5000,
    year_dict=year_dict,
    school_dict=school_dict,
    accessibility_needs_frac=0.05,
    low_income_frac=0.01,
)

In [6]:
# Confirm that no row is an exact duplicate of an earlier row.
duplicate_row_count = dummy_data.duplicated().sum()
duplicate_row_count == 0

True

In [7]:
# Validate that RUIDs are unique: every row must carry a distinct identifier.
dummy_data['RUID'].is_unique

True

In [8]:
# Validate the distribution of date_of_birth.
# Ages are generated relative to the day the notebook runs, so compare against the
# current year rather than a hard-coded one (a fixed 2022 fails on any later re-run).
# The year difference can reach 23 (one above the generator's maximum age of 22) for
# students whose birthday has not yet occurred this year.
birth_years = pd.DatetimeIndex(dummy_data['date_of_birth']).year
years_since_birth = pd.Timestamp.today().year - birth_years
bool(((years_since_birth >= 17) & (years_since_birth <= 23)).all())

True

In [9]:
# Validate the distribution of student_year: show each class year's share of rows
# (year_dict weights every year at 0.25).
student_year_shares = dummy_data['student_year'].value_counts(normalize=True)
student_year_shares

Freshman    0.2550
Sophmore    0.2544
Junior      0.2496
Senior      0.2410
Name: student_year, dtype: float64

In [10]:
# Validate the distribution of student_school: show each school's share of rows
# (school_dict weights every school at 0.1).
student_school_shares = dummy_data['student_school'].value_counts(normalize=True)
student_school_shares

School of Engineering                                       0.1056
School of Environmental and Biological Sciences             0.1050
Graduate School of Education                                0.1038
Mason Gross School of the Arts                              0.1028
Edward J. Bloustein School of Planning and Public Policy    0.1016
School of Arts and Sciences                                 0.1006
School of Graduate Studies                                  0.0982
School of Communication and Information                     0.0972
Rutgers Business School–Newark and New Brunswick            0.0958
Graduate School of Applied and Professional Psychology      0.0894
Name: student_school, dtype: float64

In [11]:
# Validate the accessibility_need attribute.
# Compare integer counts rather than testing a float ratio for exact equality:
# sampling by fraction flags round(frac * n_rows) rows, so the ratio only equals
# 0.05 exactly when 0.05 * n_rows happens to be a whole number.
dummy_data['accessibility_need'].sum() == round(0.05 * dummy_data.shape[0])

True

In [12]:
# Validate the low_income_status attribute.
# Compare integer counts rather than testing a float ratio for exact equality:
# sampling by fraction flags round(frac * n_rows) rows, so the ratio only equals
# 0.01 exactly when 0.01 * n_rows happens to be a whole number.
dummy_data['low_income_status'].sum() == round(0.01 * dummy_data.shape[0])

True

In [13]:
# Persist the generated dataset next to the notebook; the row index is not written.
dummy_data.to_csv(path_or_buf='students_dummy_data.csv', index=False)