In [1]:
# This notebook contains the code used to generate the students dataset. If needed, it can easily be translated into a script whose parameters can be set to generate different versions of the datasets.

In [2]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from random import randint

### Creating Student Dataset

In [3]:
# Undergraduate cohort parameters.
# Age bounds (in years) handed to the generator as min_age / max_age.
undergrad_min_age = 17
undergrad_max_age = 22
# Fractions of the cohort flagged as low income / as having accessibility needs.
undergrad_low_income_frac = 0.3
undergrad_accessibility_frac = 0.05

# Sampling weights for class standing (used as probabilities, so they sum to 1).
undergrad_year_dict = dict(Freshman=0.4, Sophomore=0.3, Junior=0.2, Senior=0.1)

# Sampling weights per undergraduate school (used as probabilities, so they sum to 1).
undergrad_school_dict = {
    "Edward J. Bloustein School of Planning and Public Policy": 0.1,
    "Mason Gross School of the Arts": 0.1,
    "Rutgers Business School–Newark and New Brunswick": 0.1,
    "School of Arts and Sciences": 0.2,  # most students (undergrad)
    "School of Communication and Information": 0.1,
    "School of Engineering": 0.1,
    "School of Environmental and Biological Sciences": 0.1,
    "School of Management and Labor Relations": 0.1,
    "School of Social Work": 0.1,
}

In [4]:
# Graduate cohort parameters.
# Age bounds (in years) handed to the generator as min_age / max_age.
grad_min_age = 22
grad_max_age = 28
# Fractions of the cohort flagged as low income / as having accessibility needs.
grad_low_income_frac = 0.3
grad_accessibility_frac = 0.05

# Sampling weights for the level of study (used as probabilities, so they sum to 1).
grad_year_dict = {
    "Masters": 0.5,
    "Doctorate": 0.4,
    "Postdoctoral studies": 0.1,
}

# Sampling weights per graduate school (used as probabilities, so they sum to 1).
grad_school_dict = {
    "Graduate School of Applied and Professional Psychology": 0.1,  # grad school
    "Graduate School of Education": 0.2,  # grad school
    "School of Graduate Studies": 0.7,  # mostly grad
}

#### Analysis before creating the dataset

In [5]:
# Rooms dataset published in the project's GitHub repository.
url = (
    'https://raw.githubusercontent.com/EktaDhobley/Algorithms-in-the-wild'
    '/main/data/rooms_data/final_data.csv'
)
# index_col=False: do not use the first CSV column as the index.
rooms_data = pd.read_csv(url, index_col=False)

In [6]:
# Hall IDs used to separate graduate rooms from all other (undergraduate) rooms.
grad_hall_ids = ["BA", "JA"]

In [7]:
#### Student capacity per category: the row count of each subset is used as the
#### number of students of that category we can house
grad_rooms_dataset = rooms_data[rooms_data['hall_id'].isin(grad_hall_ids)]
ug_rooms_dataset = rooms_data[~rooms_data['hall_id'].isin(grad_hall_ids)]
print(
    f"The number of undergrad students we can house is {len(ug_rooms_dataset)} \n"
    f"The number of graduate students we can house is {len(grad_rooms_dataset)}"
)

The number of undergrad students we can house is 4746 
The number of graduate students we can house is 584


In [8]:
#### Looking at the variables of interest among undergrad and graduate housing
variables_of_interest = [
    'has_laundry',
    'floor_plan',
    'residence_type',
    'hall_id',
    'has_private_bathroom',
    'has_accessibility_ramps',
]
# Graduate housing: frequency of every value (missing values included) per attribute.
for column in variables_of_interest:
    print(f"{'-' * 10} {column} {'-' * 10}")
    print(grad_rooms_dataset[column].value_counts(dropna=False))

---------- has_laundry ----------
1    584
Name: has_laundry, dtype: int64
---------- floor_plan ----------
Traditional Single    584
Name: floor_plan, dtype: int64
---------- residence_type ----------
Apartment    584
Name: residence_type, dtype: int64
---------- hall_id ----------
BA    384
JA    200
Name: hall_id, dtype: int64
---------- has_private_bathroom ----------
1    584
Name: has_private_bathroom, dtype: int64
---------- has_accessibility_ramps ----------
0    584
Name: has_accessibility_ramps, dtype: int64


In [9]:
# Undergraduate housing: frequency of every value (missing values included) per attribute.
for column in variables_of_interest:
    print(f"{'-' * 10} {column} {'-' * 10}")
    print(ug_rooms_dataset[column].value_counts(dropna=False))

---------- has_laundry ----------
1    4523
0     223
Name: has_laundry, dtype: int64
---------- floor_plan ----------
Traditional Double    4590
Traditional Single     156
Name: floor_plan, dtype: int64
---------- residence_type ----------
Suite                         2158
Apartment                     1808
Traditional Residence Hall     780
Name: residence_type, dtype: int64
---------- hall_id ----------
BS     976
NCA    640
SA     640
RA     528
MS     282
MH     223
MZH    223
CS     180
JS     180
MRS    180
TS     180
WS     180
AH     167
BH     167
Name: hall_id, dtype: int64
---------- has_private_bathroom ----------
1    3966
0     780
Name: has_private_bathroom, dtype: int64
---------- has_accessibility_ramps ----------
0    3130
1    1616
Name: has_accessibility_ramps, dtype: int64


In [10]:
def generate_student_dataset(number_of_students, year_dict, school_dict, accessibility_needs_frac, low_income_frac, min_age, max_age):
    """Generate a synthetic student roster.

    Parameters
    ----------
    number_of_students : int
        Number of rows (students) to generate.
    year_dict : dict
        Maps each student-year label to its sampling probability.
    school_dict : dict
        Maps each school name to its sampling probability.
    accessibility_needs_frac : float
        Fraction of rows whose ``accessibility_need`` flag is set to 1.
    low_income_frac : float
        Fraction of rows whose ``low_income_status`` flag is set to 1.
    min_age, max_age : int
        Passed to Faker's ``date_of_birth`` as ``minimum_age`` / ``maximum_age``.

    Returns
    -------
    pandas.DataFrame
        One row per student: the Faker profile fields requested below
        (``name`` and ``sex``), followed by ``RUID``, ``date_of_birth``,
        ``student_year``, ``student_school``, ``accessibility_need`` and
        ``low_income_status``.

    Raises
    ------
    ValueError
        From ``random.sample`` if ``number_of_students`` is negative or larger
        than the number of available 9-digit IDs.
    """
    fake = Faker()

    # Draw the 9-digit RUIDs without replacement: independent randint() draws can
    # collide, which silently produces duplicate student IDs.
    ruids = random.sample(range(10**(9-1), 10**9), number_of_students)

    year_labels, year_weights = list(year_dict.keys()), list(year_dict.values())
    school_labels, school_weights = list(school_dict.keys()), list(school_dict.values())

    # The profile dict is unpacked straight into the record, so its fields become
    # the leading columns without a slow row-wise apply(pd.Series) afterwards.
    fake_students = [{
        **fake.profile(fields=['name', 'sex']),
        'RUID': ruid,
        'date_of_birth': fake.date_of_birth(minimum_age=min_age, maximum_age=max_age),
        'student_year': np.random.choice(year_labels, p=year_weights),
        'student_school': np.random.choice(school_labels, p=school_weights),
    } for ruid in ruids]
    student_df = pd.DataFrame(fake_students)

    # Both flags start at 0; a random sample of the requested fraction is switched
    # to 1. The two samples are drawn independently of each other.
    student_df['accessibility_need'] = 0
    student_df['low_income_status'] = 0
    student_df.loc[student_df.sample(frac=accessibility_needs_frac).index, 'accessibility_need'] = 1
    student_df.loc[student_df.sample(frac=low_income_frac).index, 'low_income_status'] = 1
    return student_df

In [11]:
# Generate one synthetic undergraduate per row of undergraduate housing. The count
# is taken from the rooms data rather than hard-coded, so it stays correct if the
# rooms dataset changes.
undergrad_student_dataset = generate_student_dataset(number_of_students=ug_rooms_dataset.shape[0],
                                                     year_dict=undergrad_year_dict,
                                                     school_dict=undergrad_school_dict,
                                                     accessibility_needs_frac=undergrad_accessibility_frac,
                                                     low_income_frac=undergrad_low_income_frac,
                                                     min_age=undergrad_min_age,
                                                     max_age=undergrad_max_age)

In [12]:
# Generate one synthetic graduate student per row of graduate housing. The count
# is taken from the rooms data rather than hard-coded, so it stays correct if the
# rooms dataset changes.
grad_student_dataset = generate_student_dataset(number_of_students=grad_rooms_dataset.shape[0],
                                                year_dict=grad_year_dict,
                                                school_dict=grad_school_dict,
                                                accessibility_needs_frac=grad_accessibility_frac,
                                                low_income_frac=grad_low_income_frac,
                                                min_age=grad_min_age,
                                                max_age=grad_max_age)

In [13]:
def check_valid_dataset(df, min_age, max_age, low_income_frac, accessibility_need_frac, reference_date=None):
    """Validate a generated student dataset and show its categorical distributions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``RUID``, ``date_of_birth``, ``student_year``,
        ``student_school``, ``accessibility_need`` and ``low_income_status``.
    min_age, max_age : int
        Inclusive bounds for each student's age in completed years.
    low_income_frac, accessibility_need_frac : float
        Expected fraction of rows with the respective flag set to 1.
    reference_date : date-like, optional
        Date on which ages are evaluated; defaults to today. Pass the generation
        date when validating a dataset that was created earlier.

    Returns
    -------
    bool
        True only if there are no duplicate rows, RUIDs are unique, every age
        lies in ``[min_age, max_age]`` and both flag counts are within one
        student of the requested fraction.
    """
    # `display` is available without import inside IPython/Jupyter; fall back to
    # print so the function also works when this code is run as a plain script.
    try:
        show = display
    except NameError:
        show = print

    row_count = df.shape[0]

    no_duplicate_rows = df.duplicated().sum() == 0  # Check if there are any duplicate rows
    ruids_unique = df['RUID'].is_unique  # Validate that RUIDs are unique

    # Age in completed years on the reference date: subtract one year when the
    # birthday has not yet occurred in the reference year.
    today = pd.Timestamp.today() if reference_date is None else pd.Timestamp(reference_date)
    dob = pd.to_datetime(df['date_of_birth'])
    birthday_pending = (dob.dt.month > today.month) | ((dob.dt.month == today.month) & (dob.dt.day > today.day))
    ages = today.year - dob.dt.year - birthday_pending.astype(int)
    ages_in_range = ages.between(min_age, max_age).all()

    # frac * row_count is usually not an integer (e.g. 0.05 * 584 = 29.2), so an
    # exact float comparison of the observed fraction can never succeed. Accept
    # counts that are within one student of the requested share instead.
    def flag_count_matches(column, expected_frac):
        return abs(int(df[column].sum()) - expected_frac * row_count) < 1

    accessibility_ok = flag_count_matches('accessibility_need', accessibility_need_frac)
    low_income_ok = flag_count_matches('low_income_status', low_income_frac)

    # Shown for visual inspection only; these do not affect the returned value.
    show(df['student_year'].value_counts(normalize=True))  # Validate distribution of student_year
    show(df['student_school'].value_counts(normalize=True))  # Validate distribution of student_school

    return bool(no_duplicate_rows and ruids_unique and ages_in_range and accessibility_ok and low_income_ok)

In [14]:
# Validate before persisting. The result is captured and printed because the call
# is not the last expression of the cell, so its return value would otherwise be
# silently discarded.
grad_dataset_is_valid = check_valid_dataset(grad_student_dataset, grad_min_age, grad_max_age, grad_low_income_frac, grad_accessibility_frac)
print(f"Graduate student dataset passed validation: {grad_dataset_is_valid}")
grad_student_dataset.to_csv('student_data/graduate_students_data.csv', index = False)

Masters                 0.520548
Doctorate               0.385274
Postdoctoral studies    0.094178
Name: student_year, dtype: float64

School of Graduate Studies                                0.720890
Graduate School of Education                              0.190068
Graduate School of Applied and Professional Psychology    0.089041
Name: student_school, dtype: float64

In [15]:
# Validate before persisting. The result is captured and printed because the call
# is not the last expression of the cell, so its return value would otherwise be
# silently discarded.
undergrad_dataset_is_valid = check_valid_dataset(undergrad_student_dataset, undergrad_min_age, undergrad_max_age, undergrad_low_income_frac, undergrad_accessibility_frac)
print(f"Undergraduate student dataset passed validation: {undergrad_dataset_is_valid}")
undergrad_student_dataset.to_csv('student_data/undergrad_student_dataset.csv', index = False)

Freshman     0.396334
Sophomore    0.302571
Junior       0.197640
Senior       0.103456
Name: student_year, dtype: float64

School of Arts and Sciences                                 0.204804
School of Social Work                                       0.105141
Rutgers Business School–Newark and New Brunswick            0.103666
School of Environmental and Biological Sciences             0.103034
Mason Gross School of the Arts                              0.101138
School of Engineering                                       0.099241
School of Management and Labor Relations                    0.096502
Edward J. Bloustein School of Planning and Public Policy    0.093342
School of Communication and Information                     0.093131
Name: student_school, dtype: float64

### Generate Choices Dataset

In [16]:
# Undergraduate preference weights, taken from the empirical distribution of the
# undergraduate rooms. `ug_rooms_dataset` is the same non-graduate subset of
# `rooms_data` computed earlier, so it is reused instead of re-filtering three times.
ug_hall_pref = ug_rooms_dataset['hall_id'].value_counts(normalize=True, dropna=False).to_dict()
# Prices are grouped into 3 equal-width bins; the dict keys are the bin intervals.
ug_price_pref = ug_rooms_dataset['price'].value_counts(bins=3, dropna=False, normalize=True).to_dict()
ug_residence_pref = ug_rooms_dataset['residence_type'].value_counts(dropna=False, normalize=True).to_dict()

In [17]:
# Graduate preference weights, taken from the empirical distribution of the
# graduate rooms. `grad_rooms_dataset` is the same graduate subset of
# `rooms_data` computed earlier, so it is reused instead of re-filtering three times.
grad_hall_pref = grad_rooms_dataset['hall_id'].value_counts(normalize=True, dropna=False).to_dict()
# Prices are grouped into 2 equal-width bins; the dict keys are the bin intervals.
grad_price_pref = grad_rooms_dataset['price'].value_counts(bins=2, dropna=False, normalize=True).to_dict()
grad_residence_pref = grad_rooms_dataset['residence_type'].value_counts(dropna=False, normalize=True).to_dict()

In [18]:
# Code to create the housing choices for each student
def generate_choices_dataset(df, hall_pref, price_pref, residence_pref):
    """Sample one row of housing preferences for every RUID in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Student dataset; only its ``RUID`` column is read.
    hall_pref, price_pref, residence_pref : dict
        Map each candidate hall ID / price range / residence type to its
        sampling probability.

    Returns
    -------
    pandas.DataFrame
        Columns ``RUID``, ``preferred_hall_id1``, ``preferred_hall_id2``,
        ``preferred_residence_type``, ``is_single_preferred``,
        ``is_private_bathroom_preferred``, ``laundry_availibility``,
        ``preferred_price_range`` (stored as a string) and ``max_price``.
    """
    hall_ids, hall_weights = list(hall_pref.keys()), list(hall_pref.values())
    price_ranges, price_weights = list(price_pref.keys()), list(price_pref.values())
    residence_types, residence_weights = list(residence_pref.keys()), list(residence_pref.values())

    rows = []
    for ruid in df['RUID']:
        row = {'RUID': ruid}
        row['laundry_availibility'] = np.random.choice([0, 1], p=[0.05, 0.95])
        # With p=[0, 1] this draw always yields 1.
        row['is_single_preferred'] = np.random.choice([0, 1], p=[0, 1])
        row['is_private_bathroom_preferred'] = np.random.choice([0, 1], p=[0.1, 0.9])
        # Two distinct halls per student (sampled without replacement).
        row['preferred_hall_id'] = np.random.choice(hall_ids, 2, p=hall_weights, replace=False)
        row['preferred_price_range'] = np.random.choice(price_ranges, p=price_weights)
        row['max_price'] = np.random.choice([8000, 9000, 10000, 11000], p=[0.1, 0.2, 0.3, 0.4])
        row['preferred_residence_type'] = np.random.choice(residence_types, p=residence_weights)
        rows.append(row)

    choices = pd.DataFrame(rows)
    choices['preferred_price_range'] = choices['preferred_price_range'].astype(str)

    # Split the pair of sampled halls into a first and a second choice column.
    hall_columns = pd.DataFrame(
        choices['preferred_hall_id'].to_list(),
        columns=['preferred_hall_id1', 'preferred_hall_id2'],
    )
    choices = pd.concat([choices.drop(columns=['preferred_hall_id']), hall_columns], axis=1)

    column_order = [
        'RUID',
        'preferred_hall_id1',
        'preferred_hall_id2',
        'preferred_residence_type',
        'is_single_preferred',
        'is_private_bathroom_preferred',
        'laundry_availibility',
        'preferred_price_range',
        'max_price',
    ]
    return choices[column_order]

In [19]:
# Sample housing preferences for every graduate student and write them to CSV.
grad_student_choices = generate_choices_dataset(
    df=grad_student_dataset,
    hall_pref=grad_hall_pref,
    price_pref=grad_price_pref,
    residence_pref=grad_residence_pref,
)
grad_student_choices.to_csv('student_data/grad_student_choices.csv', index=False)

In [20]:
# Sample housing preferences for every undergraduate student and write them to CSV.
ug_student_choices = generate_choices_dataset(
    df=undergrad_student_dataset,
    hall_pref=ug_hall_pref,
    price_pref=ug_price_pref,
    residence_pref=ug_residence_pref,
)
ug_student_choices.to_csv('student_data/ug_student_choices.csv', index=False)

In [21]:
# Preview the generated undergraduate choices (rendered as this cell's output).
ug_student_choices

Unnamed: 0,RUID,preferred_hall_id1,preferred_hall_id2,preferred_residence_type,is_single_preferred,is_private_bathroom_preferred,laundry_availibility,preferred_price_range,max_price
0,477565385,JS,NCA,Traditional Residence Hall,1,1,1,"(8093.0, 10072.0]",11000
1,703986029,SA,MS,Suite,1,1,1,"(8093.0, 10072.0]",10000
2,806948601,MRS,BH,Suite,1,1,1,"(8093.0, 10072.0]",10000
3,804671726,TS,NCA,Suite,1,1,1,"(4129.062, 6114.0]",11000
4,372981620,SA,RA,Apartment,1,1,1,"(6114.0, 8093.0]",11000
...,...,...,...,...,...,...,...,...,...
4741,815634994,NCA,JS,Suite,1,1,1,"(8093.0, 10072.0]",10000
4742,260420449,BH,NCA,Suite,1,1,1,"(6114.0, 8093.0]",11000
4743,130330020,MRS,AH,Suite,1,1,1,"(8093.0, 10072.0]",9000
4744,351248070,SA,CS,Apartment,1,1,1,"(8093.0, 10072.0]",10000


In [22]:
# Preview the generated graduate choices (rendered as this cell's output).
grad_student_choices

Unnamed: 0,RUID,preferred_hall_id1,preferred_hall_id2,preferred_residence_type,is_single_preferred,is_private_bathroom_preferred,laundry_availibility,preferred_price_range,max_price
0,625358388,BA,JA,Apartment,1,1,1,"(10789.0, 11618.0]",9000
1,370064933,JA,BA,Apartment,1,1,1,"(10789.0, 11618.0]",11000
2,392307531,JA,BA,Apartment,1,0,1,"(9958.341, 10789.0]",10000
3,867994550,JA,BA,Apartment,1,1,1,"(9958.341, 10789.0]",11000
4,890951652,JA,BA,Apartment,1,1,1,"(10789.0, 11618.0]",10000
...,...,...,...,...,...,...,...,...,...
579,737281961,BA,JA,Apartment,1,1,1,"(10789.0, 11618.0]",11000
580,985231634,BA,JA,Apartment,1,1,1,"(9958.341, 10789.0]",8000
581,452099900,JA,BA,Apartment,1,1,1,"(9958.341, 10789.0]",10000
582,355572388,JA,BA,Apartment,1,1,1,"(10789.0, 11618.0]",8000
