In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from random import randint

### Creating Student Dataset

In [2]:
# --- Undergraduate population parameters ---
# Age bounds handed to the student generator as min_age / max_age.
undergrad_min_age = 17
undergrad_max_age = 22
# Fractions of students flagged as low income / having an accessibility need.
undergrad_low_income_frac = 0.3
undergrad_accessibility_frac = 0.05

# Sampling probability for each class year (values sum to 1).
undergrad_year_dict = {
    'Freshman': 0.4,
    'Sophomore': 0.3,
    'Junior': 0.2,
    'Senior': 0.1,
}

# Sampling probability for each undergraduate school (values sum to 1).
undergrad_school_dict = {
    'Edward J. Bloustein School of Planning and Public Policy': 0.05,
    'Mason Gross School of the Arts': 0.1,
    'Rutgers Business School–Newark and New Brunswick': 0.1,
    'School of Arts and Sciences': 0.4,  # largest share of undergraduates
    'School of Communication and Information': 0.1,
    'School of Engineering': 0.1,
    'School of Environmental and Biological Sciences': 0.05,
    'School of Management and Labor Relations': 0.05,
    'School of Social Work': 0.05,
}

In [3]:
# --- Graduate population parameters ---
# Sampling probability for each graduate level (values sum to 1).
grad_year_dict = {
    'Masters': 0.5,
    'Doctorate': 0.4,
    'Postdoctoral studies': 0.1,
}

# Sampling probability for each graduate school (values sum to 1).
grad_school_dict = {
    'Graduate School of Applied and Professional Psychology': 0.1,
    'Graduate School of Education': 0.2,
    'School of Graduate Studies': 0.7,  # largest share of graduate students
}

# Age bounds handed to the student generator as min_age / max_age.
grad_min_age = 22
grad_max_age = 28
# Fractions of students flagged as low income / having an accessibility need.
grad_low_income_frac = 0.3
grad_accessibility_frac = 0.05

#### Analysis before creating the dataset

In [4]:
# Rooms CSV is read straight from the raw GitHub URL, so this cell needs network access.
url = 'https://raw.githubusercontent.com/EktaDhobley/Algorithms-in-the-wild/main/data/rooms_data/final_data.csv'
rooms_data = pd.read_csv(filepath_or_buffer=url, index_col=False)

In [5]:
# hall_id values whose rows are treated as graduate housing; every other hall counts as undergraduate.
grad_hall_ids = ["BA", "JA"]

In [6]:
# Split the rooms rows by hall id into graduate and undergraduate housing.
# The row count of each subset is reported as the number of students that group can house.
grad_rooms_dataset = rooms_data.loc[rooms_data['hall_id'].isin(grad_hall_ids)]
ug_rooms_dataset = rooms_data.loc[~rooms_data['hall_id'].isin(grad_hall_ids)]
print(f"The number of undergrad students we can house is {ug_rooms_dataset.shape[0]} \nThe number of graduate students we can house is {grad_rooms_dataset.shape[0]}")

The number of undergrad students we can house is 4746 
The number of graduate students we can house is 584


In [7]:
# Dump value counts (missing values included) for the housing attributes of
# interest into one text summary per population: graduate first, then undergraduate.
variables_of_interest = [
    'has_laundry',
    'floor_plan',
    'residence_type',
    'hall_id',
    'has_private_bathroom',
    'has_accessibility_ramps',
]

for summary_path, rooms_subset in (
    ('rooms_data/grad_room_summary.txt', grad_rooms_dataset),
    ('rooms_data/ug_room_summary.txt', ug_rooms_dataset),
):
    with open(summary_path, 'w') as summary_file:
        for column in variables_of_interest:
            # Banner line naming the column, followed by its value counts.
            print("-"*10, column, "-"*10, file=summary_file)
            print(rooms_subset[column].value_counts(dropna=False), file=summary_file)

In [8]:
def generate_student_dataset(number_of_students, year_dict, school_dict, accessibility_needs_frac, low_income_frac, min_age, max_age):
    """Build a synthetic student roster with Faker-generated identities.

    Parameters
    ----------
    number_of_students : int
        Number of student rows to generate.
    year_dict, school_dict : dict
        Map each label to its sampling probability. The values are passed to
        ``np.random.choice`` as ``p``, so they must sum to 1.
    accessibility_needs_frac, low_income_frac : float
        Fraction of rows whose ``accessibility_need`` / ``low_income_status``
        flag is set to 1; the rows are picked with ``DataFrame.sample(frac=...)``.
    min_age, max_age : int
        Forwarded to ``Faker.date_of_birth`` as ``minimum_age`` / ``maximum_age``.

    Returns
    -------
    pandas.DataFrame
        One row per student: the requested profile fields (``name``, ``sex``),
        then ``RUID``, ``date_of_birth``, ``student_year``, ``student_school``,
        ``accessibility_need`` and ``low_income_status``.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``number_of_students`` is negative or larger
        than the number of available 9-digit ids.
    """
    fake = Faker()

    # Loop-invariant label/probability lists, built once instead of per student.
    year_labels, year_probs = list(year_dict.keys()), list(year_dict.values())
    school_labels, school_probs = list(school_dict.keys()), list(school_dict.values())

    # Draw the 9-digit RUIDs without replacement so they are guaranteed unique;
    # independent randint() calls can collide, which breaks the uniqueness
    # validation and any later merge keyed on RUID.
    ruids = random.sample(range(10**(9-1), 10**9), number_of_students)

    fake_students = [{'RUID': ruid,
                      'student_profile': fake.profile(fields=['name', 'sex']),
                      'date_of_birth': fake.date_of_birth(minimum_age=min_age, maximum_age=max_age),
                      'student_year': np.random.choice(year_labels, p=year_probs),
                      'student_school': np.random.choice(school_labels, p=school_probs)
                      } for ruid in ruids]
    student_df = pd.DataFrame(fake_students)

    # Expand the profile dict into its own columns and put them first.
    profile_columns = student_df['student_profile'].apply(pd.Series)
    student_df = pd.concat([profile_columns, student_df.drop(['student_profile'], axis=1)], axis=1)

    # Flags default to 0; each one is switched on for an independently sampled subset of rows.
    student_df['accessibility_need'], student_df['low_income_status'] = 0, 0
    student_df.loc[student_df.sample(frac=accessibility_needs_frac).index, 'accessibility_need'] = 1
    student_df.loc[student_df.sample(frac=low_income_frac).index, 'low_income_status'] = 1
    return student_df

In [9]:
# Undergraduate roster. 4746 equals the undergraduate count printed when the rooms data was split.
undergrad_student_dataset = generate_student_dataset(
    number_of_students=4746,
    year_dict=undergrad_year_dict,
    school_dict=undergrad_school_dict,
    accessibility_needs_frac=undergrad_accessibility_frac,
    low_income_frac=undergrad_low_income_frac,
    min_age=undergrad_min_age,
    max_age=undergrad_max_age,
)

In [10]:
# Graduate roster. 584 equals the graduate count printed when the rooms data was split.
grad_student_dataset = generate_student_dataset(
    number_of_students=584,
    year_dict=grad_year_dict,
    school_dict=grad_school_dict,
    accessibility_needs_frac=grad_accessibility_frac,
    low_income_frac=grad_low_income_frac,
    min_age=grad_min_age,
    max_age=grad_max_age,
)

In [11]:
def check_valid_dataset(df, min_age, max_age, low_income_frac, accessibility_need_frac, reference_year=None):
    """Validate a generated student dataset and display its category distributions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``RUID``, ``date_of_birth``, ``accessibility_need``,
        ``low_income_status``, ``student_year`` and ``student_school``.
    min_age, max_age : int
        Age bounds the dataset was generated with.
    low_income_frac, accessibility_need_frac : float
        Fractions of rows expected to carry the respective flag.
    reference_year : int, optional
        Calendar year the ages are measured against. Defaults to the current
        year, so the check stays valid whenever the notebook is re-run.

    Returns
    -------
    bool
        True only when every check passes: no duplicate rows, unique RUIDs,
        every (reference_year - birth year) within [min_age, max_age + 1], and
        each flag count within half a row of ``frac * len(df)``.

    Notes
    -----
    The normalized value counts of ``student_year`` and ``student_school`` are
    shown with ``display`` for visual inspection; they do not affect the result.
    """
    from datetime import date  # local import keeps the cell self-contained

    if reference_year is None:
        reference_year = date.today().year

    n_rows = df.shape[0]

    no_duplicate_rows = df.duplicated().sum() == 0  # Check if there are any duplicate rows
    ruids_unique = df['RUID'].is_unique  # Validate that RUIDs are unique

    # Year difference rather than exact age, hence the +1 slack for students
    # whose birthday has not yet occurred in the reference year.
    year_gap = reference_year - pd.DatetimeIndex(df['date_of_birth']).year
    ages_in_range = ((year_gap >= min_age) & (year_gap <= max_age + 1)).all()

    # Only a whole number of rows can be flagged, so the observed share equals
    # the requested fraction exactly only when frac * n_rows is an integer.
    # Compare counts with a half-row tolerance instead of float equality.
    accessibility_ok = abs(df['accessibility_need'].sum() - accessibility_need_frac * n_rows) <= 0.5
    low_income_ok = abs(df['low_income_status'].sum() - low_income_frac * n_rows) <= 0.5

    display(df['student_year'].value_counts(normalize=True))  # Validate distribution of student_year
    display(df['student_school'].value_counts(normalize=True))  # Validate distribution of student_school

    return bool(no_duplicate_rows and ruids_unique and ages_in_range and accessibility_ok and low_income_ok)

In [12]:
# Persist the graduate roster only when it passes validation, and say so when it
# does not, so a skipped export is visible in the cell output instead of silent.
if check_valid_dataset(grad_student_dataset, grad_min_age, grad_max_age, grad_low_income_frac, grad_accessibility_frac):
    grad_student_dataset.to_csv('student_data/graduate_students_data.csv', index = False)
else:
    print("Graduate student dataset failed validation; 'student_data/graduate_students_data.csv' was not written.")

Masters                 0.465753
Doctorate               0.433219
Postdoctoral studies    0.101027
Name: student_year, dtype: float64

School of Graduate Studies                                0.702055
Graduate School of Education                              0.200342
Graduate School of Applied and Professional Psychology    0.097603
Name: student_school, dtype: float64

In [13]:
# Persist the undergraduate roster only when it passes validation, and say so when
# it does not, so a skipped export is visible in the cell output instead of silent.
if check_valid_dataset(undergrad_student_dataset, undergrad_min_age, undergrad_max_age, undergrad_low_income_frac, undergrad_accessibility_frac):
    undergrad_student_dataset.to_csv('student_data/undergrad_student_data.csv', index = False)
else:
    print("Undergraduate student dataset failed validation; 'student_data/undergrad_student_data.csv' was not written.")

Freshman     0.384324
Sophomore    0.308681
Junior       0.205436
Senior       0.101559
Name: student_year, dtype: float64

School of Arts and Sciences                                 0.410662
Rutgers Business School–Newark and New Brunswick            0.103245
Mason Gross School of the Arts                              0.103245
School of Communication and Information                     0.097345
School of Engineering                                       0.093131
School of Management and Labor Relations                    0.053519
School of Environmental and Biological Sciences             0.048251
Edward J. Bloustein School of Planning and Public Policy    0.046566
School of Social Work                                       0.044037
Name: student_school, dtype: float64

### Generate Choices Dataset

In [14]:
# Undergraduate preference distributions: hall and residence-type weights are the
# observed shares among rooms outside the graduate halls.
ug_hall_pref = (
    rooms_data.loc[~rooms_data['hall_id'].isin(grad_hall_ids), 'hall_id']
    .value_counts(normalize=True, dropna=False)
    .to_dict()
)
# ug_price_pref = rooms_data.loc[~rooms_data['hall_id'].isin(grad_hall_ids)]['price'].value_counts(bins = 3, dropna=False, normalize = True).to_dict()
ug_residence_pref = (
    rooms_data.loc[~rooms_data['hall_id'].isin(grad_hall_ids), 'residence_type']
    .value_counts(dropna=False, normalize=True)
    .to_dict()
)
# Price-range weights are set by hand; the binned value_counts version is kept above, commented out.
ug_price_pref = {
    '4000-6000': 0.5,
    '6000-8000': 0.20,
    '8000-11000': 0.3,
}

In [15]:
# Graduate preference distributions: hall and residence-type weights are the
# observed shares among rooms in the graduate halls.
grad_hall_pref = (
    rooms_data.loc[rooms_data['hall_id'].isin(grad_hall_ids), 'hall_id']
    .value_counts(normalize=True, dropna=False)
    .to_dict()
)
# grad_price_pref = rooms_data.loc[rooms_data['hall_id'].isin(grad_hall_ids)]['price'].value_counts(bins = 2, dropna=False, normalize=True).to_dict()
grad_residence_pref = (
    rooms_data.loc[rooms_data['hall_id'].isin(grad_hall_ids), 'residence_type']
    .value_counts(dropna=False, normalize=True)
    .to_dict()
)
# Price-range weights are set by hand; the binned value_counts version is kept above, commented out.
grad_price_pref = {
    '8000-10500': 0.6,
    '10500-12000': 0.4,
}

In [21]:
# Code to create housing choices for each student
def generate_dataset(df, hall_pref, price_pref, residence_pref, undergrad = False):
    """Attach randomly generated housing preferences to every student in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Student roster; must contain unique ``RUID`` values and a
        ``low_income_status`` column.
    hall_pref, price_pref, residence_pref : dict
        Map each option to its sampling probability (values must sum to 1, as
        they are passed to ``np.random.choice`` as ``p``). ``price_pref`` keys
        have the form ``'<low>-<high>'``. ``hall_pref`` needs at least as many
        non-zero entries as halls are drawn, because halls are sampled without
        replacement.
    undergrad : bool, default False
        Undergraduates get 3 preferred halls, everyone else 2.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined with the preference columns plus ``max_price``: the upper
        bound of the preferred price range, reduced by 1000 for rows whose
        ``low_income_status`` is 1.

    Raises
    ------
    pandas.errors.MergeError
        If ``RUID`` is not unique. Without this check the merge would silently
        multiply the rows of students sharing an id.
    """
    n_choices = 3 if undergrad else 2

    # Loop-invariant option/probability lists, built once instead of per student.
    hall_ids, hall_probs = list(hall_pref.keys()), list(hall_pref.values())
    residence_types, residence_probs = list(residence_pref.keys()), list(residence_pref.values())
    price_ranges, price_probs = list(price_pref.keys()), list(price_pref.values())

    fake_choices = [{
        'RUID': ruid,
        'preferred_hall_ids': np.random.choice(hall_ids, n_choices, p=hall_probs, replace=False),
        'preferred_residence_type': np.random.choice(residence_types, p=residence_probs),
        'is_single_preferred': np.random.choice([0,1], p=[0,1]),  # p=[0, 1]: always 1
        'is_private_bathroom_preferred': np.random.choice([0,1], p=[0.1,0.9]),
        'laundry_availibility': np.random.choice([0,1], p=[0.05,0.95]),
        'preferred_price_range': np.random.choice(price_ranges, p=price_probs),
        } for ruid in df['RUID']]
    student_choices = pd.DataFrame(fake_choices)

    # Exactly one preference row per student; fail loudly on duplicate RUIDs.
    student_data = df.merge(student_choices, on="RUID", validate="one_to_one")

    # Upper bound of the '<low>-<high>' range, computed column-wise rather than row by row.
    range_upper_bound = student_data['preferred_price_range'].str.split('-').str[1].astype(int)
    student_data['max_price'] = range_upper_bound - student_data['low_income_status'] * 1000
    return student_data

In [22]:
# Attach housing preferences to each roster; graduates use the default of 2 hall choices, undergraduates get 3.
grad_student_merged_data = generate_dataset(
    df=grad_student_dataset,
    hall_pref=grad_hall_pref,
    price_pref=grad_price_pref,
    residence_pref=grad_residence_pref,
)
ug_student_merged_data = generate_dataset(
    df=undergrad_student_dataset,
    hall_pref=ug_hall_pref,
    price_pref=ug_price_pref,
    residence_pref=ug_residence_pref,
    undergrad=True,
)

In [23]:
# Stack undergraduates on top of graduates and renumber the rows from 0.
final_data = pd.concat(
    [ug_student_merged_data, grad_student_merged_data],
    ignore_index=True,
)

In [27]:
# Export the combined dataset.
# NOTE(review): unlike the per-population exports, this call does not pass index=False,
# so the row index is written as an unnamed first column. Confirm that is intended.
final_data.to_csv(path_or_buf='student_data/final_student_merged_data.csv')