In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def print_columns(dfs):
    for key in dfs.keys():
        print(key)
        print(list(dfs[key].columns))

In [3]:
def fix_missing_columns(dfs, columns):
    '''
        Adds columns to each dataframe in dfs if the column is not already present.
    '''
    for key in dfs:
        for column in columns:
            if column not in list(dfs[key].columns):
                dfs[key][column] = np.nan
    return dfs

In [4]:
def reorder_columns(dfs, columns):
    '''
        Reorder columns so they match the order in the columns list
    '''
    for key in dfs:
        for column in columns:
            if column in list(dfs[key].columns):
                col = dfs[key][column]
                dfs[key].drop(columns=[column], inplace=True)
                dfs[key][column] = col
    return dfs

In [5]:
def create_df_from_files(dir):
    '''
        Reads all files from a directory and creates a dictionary of dataframes
    '''
    files = os.listdir(dir)

    dfs = {}
    for file in files:
        if file == '.DS_Store':
            continue
        path_to_file = dir + file
        try:
            df = pd.read_excel(path_to_file, sheet_name='Student Addresses')
        except:
            print('Could not read file: ' + path_to_file)
        dfs[file] = df

    return dfs

def prep_dfs(dir):
    '''
        Prepares the dataframes for the universities
    '''
    dir = '../data/universities/' + dir + '/'
    dfs = create_df_from_files(dir)
    
    return dfs

In [6]:
def polish_and_save(dfs, university_name, save_as):
    '''
        Adds year-range and university name to the dataframes and saves them to a file
    '''
    save_to = '../data/sorted/universities/' + save_as
    for key in dfs:
        dfs[key]['year_range'] = key.split('.')[0]
        dfs[key]['university'] = university_name

    df = pd.concat(dfs.values(), ignore_index=True)

    df.to_csv(save_to, index=False)

# Boston University

#### Polishing Columns

In [7]:
dfs = prep_dfs('BostonUniversity')
print_columns(dfs)

University Bldgs


  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Definitions
University Bldgs
2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet N

In [8]:
# 2022-2023.xlsx
# 2023-2024.xlsx
# they have at-home or not-at-home column
dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time', 'at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time', 'at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time', 'extra_large_unit']

In [9]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [10]:
polish_and_save(dfs, 'Boston University', 'BostonUniversity.csv')

# Baptist College

#### Polishing Columns

In [11]:
dfs = prep_dfs('BaptistCollege')
print_columns(dfs)

Could not read file: ../data/universities/BaptistCollege/~$2021-2022.xlsx
2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-

In [12]:
# remove the 2021-2022.xlsx file
dfs.pop('2021-2022.xlsx')

Unnamed: 0,"Instructions: Please complete the addresses for students residing off-campus in Boston. The totals should closely match row 17 or row 18 (excluding students with suppressed addresses). In Column 9, please provide a flag/label for which address are at-home (commuter) and not-at-home.",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,6a. \nStreet #,6b. \nStreet Name,6c. \nStreet Suffix,6d.\n Unit #,6e. \nZip,7. \nUndergraduate (U) or Graduate (G),8. \nFull-time (FT) or \nPart-time (PT),9. \nAt-Home or Not-at-Home
1,50,Rockwell,St.,,2124,U,PT,Not-at-Home


In [13]:
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time', 'at_home']

for key in dfs:
    if key in ['2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

In [14]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

Note:

The student sheet is locked so we couldn't fix the formatting, so we manually inserted 1 row ( since it was the only one)

In [15]:
# create 2021-2022 dataframe
dfs['2021-2022.xlsx'] = pd.DataFrame(columns=['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home','extra_large_unit'])

# street_number = 50
# street_name = Rockwell
# street suffix = St.
# unit_number = nan
# zip_code = 2124
# level_of_study = U
# full_time = PT
# at_home = not-at-home
# extra_large_unit = nan
dfs['2021-2022.xlsx'] = pd.concat([dfs['2021-2022.xlsx'], pd.DataFrame({'street_number':[50],'street_name':['Rockwell'],'street_suffix':['St.'],'unit_number':[np.nan],'zip_code':[2124],'level_of_study':['U'],'full_time':['PT'],'at_home':['not-at-home'],'extra_large_unit':[np.nan]})], ignore_index=True)

#### Save the Result

In [16]:
polish_and_save(dfs, 'Baptist College', 'BaptistCollege.csv')

# BayStateCollege

#### Polishing Columns

In [17]:
dfs = prep_dfs('BayStateCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', 'Unnamed: 2', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', 'Unnamed: 2', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)', '10. At Home or Not at Home']
2016-2017.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c.

In [18]:
# from 2016-2017 to 2019-2020 there is no at-home column
for key in dfs:
    if key in ['2016-2017.xlsx','2017-2018.xlsx','2018-2019.xlsx','2019-2020.xlsx','2021-2022.xlsx']:
        dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

# 2020-2021 has at-home column ( at the end )
dfs['2020-2021.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit','at_home']

# 2022-2023 has at-home column ( at the end )
dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

In [19]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2021-2022.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [20]:
polish_and_save(dfs, 'Bay State College', 'BayStateCollege.csv')

# BerkleeCollegeMusic

#### Polishing Columns

In [21]:
dfs = prep_dfs('BerkleeCollegeMusic')
print_columns(dfs)

  warn(msg)


2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [22]:
# up to 2022-2023, there is no at-home column
for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
    else:
        dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

In [23]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [24]:
polish_and_save(dfs, 'Berklee College of Music', 'BerkleeCollegeMusic.csv')

# Boston Architectural College

#### Polishing Columns

In [25]:
dfs = prep_dfs('BostonArchitecturalCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2016-2017.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [26]:
# 2022-2023.xlsx and 2023-2024.xlsx have at-home column
dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

In [27]:
# the rest of the files do not have at-home column
for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

In [28]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2021-2022.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [29]:
polish_and_save(dfs, 'Boston Architectural College', 'BostonArchitecturalCollege.csv')

# Boston College

#### Polishing Columns

In [30]:
dfs = prep_dfs('BostonCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)', 'Housing Type']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Nam

In [31]:
# 2016-2017 has expected graduation term and housing type columns
# from 2017-2018 to 2022-2023, there is no at-home column
# at 2022-2023, there is an housing type column
# 2023-2024 has at-home column
dfs['2016-2017.xlsx'].columns = ['street_number','street_name','unit_number','zip_code','level_of_study','full_time','expected_graduation_term','housing_type']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit','housing_type']

for key in dfs:
    if key in ['2016-2017.xlsx','2023-2024.xlsx','2022-2023.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']


# fill 2016-2017 na columns in stree_name with empty string
dfs['2016-2017.xlsx']['street_name'] = dfs['2016-2017.xlsx']['street_name'].fillna('')

# print the columns of each df
# for 2016-2017, street name also has a suffix, let's fix that
street_suffix = dfs['2016-2017.xlsx']['street_name'].apply(lambda name: name.split(' ')[-1] if len(name.split(' ')) > 1 else '')
dfs['2016-2017.xlsx']['street_suffix'] = street_suffix
dfs['2016-2017.xlsx']['street_name'] = dfs['2016-2017.xlsx']['street_name'].apply(lambda name: ' '.join(name.split(' ')[:-1]))

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'housing_type']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2016-2017.xlsx
['street_number', 'street_name', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'expected_graduation_term', 'housing_type', 'street_suffix']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', '

In [32]:
unique_columns = []
for key in dfs:
    for column in list(dfs[key].columns):
        if column not in unique_columns:
            unique_columns.append(column)

unique_columns

['street_number',
 'street_name',
 'street_suffix',
 'unit_number',
 'zip_code',
 'level_of_study',
 'full_time',
 'extra_large_unit',
 'housing_type',
 'at_home',
 'expected_graduation_term']

In [33]:
# now, make sure all of these columns are in each df
dfs = fix_missing_columns(dfs, unique_columns)
# now, reorder the columns
dfs = reorder_columns(dfs, ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit','expected_graduation_term','housing_type','at_home'])
# print the columns of each df
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'expected_graduation_term', 'housing_type', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'expected_graduation_term', 'housing_type', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'expected_graduation_term', 'housing_type', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'expected_graduation_term', 'housing_type', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'expected_graduation_term', 'housing_type', 'at_home']
2016-2017.

#### Save the Result

In [34]:
polish_and_save(dfs, 'Boston College', 'BostonCollege.csv')

# Boston Conservatory Berklee

#### Polishing Columns

In [35]:
dfs = prep_dfs('BostonConservatoryBerklee')
print_columns(dfs)

2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2016-2017.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']


In [36]:
for key in dfs:
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

In [37]:
print_columns(dfs)

2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']


#### Save the Result

In [38]:
polish_and_save(dfs, 'Boston Conservatory Berklee', 'BostonConservatoryBerklee.csv')

# Emerson College

#### Polishing Columns

In [39]:
dfs = prep_dfs('EmersonCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [40]:
# 2021-2022 has at-home column and extra large unit column
# 2022-2023 has at-home column
# 2023-2024 has at-home column
# the rest do not have at-home column

dfs['2021-2022.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit','at_home']
dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2021-2022.xlsx','2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

In [41]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [42]:
polish_and_save(dfs, 'Emerson College', 'EmersonCollege.csv')

# Emmanuel College

#### Polishing Columns

In [43]:
dfs = prep_dfs('EmmanuelCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [44]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column
# the rest do not have at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [45]:
polish_and_save(dfs, 'Emmanuel College', 'EmmanuelCollege.csv')

# Fisher College

#### Polishing Columns

In [46]:
dfs = prep_dfs('FisherCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '

In [47]:
# all of them do not have at-home column
for key in dfs:
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

# add at-home column
dfs = fix_missing_columns(dfs, ['at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [48]:
polish_and_save(dfs, 'Fisher College', 'FisherCollege.csv')

# Franklin Institute of Technology

#### Polishing Columns

In [49]:
dfs = prep_dfs('FranklinInstituteTechnology')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStree

In [50]:
# 2022-2023 has at-home column
# others do not have at-home column
dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
for key in dfs:
    if key in ['2022-2023.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2021-202

In [51]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [52]:
polish_and_save(dfs, 'Franklin Institute of Technology', 'FranklinInstituteTechnology.csv')

# Harvard University

#### Polishing Columns

In [53]:
dfs = prep_dfs('HarvardUniversity')
print_columns(dfs)

Student Addresses
2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. 

In [54]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column
# others do not have at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2021-2022.xlsx
['

In [55]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [56]:
polish_and_save(dfs, 'Harvard University', 'HarvardUniversity.csv')

# Massachusetts College of Pharamacy and Health Sciences

#### Polishing Columns

In [57]:
dfs = prep_dfs('MassachusettsCollegePharmacyHealthSciences')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUnderGaduate (U) or Gaduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', 

In [58]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column
# others do not have at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2021-2022.xlsx
['

In [59]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [60]:
polish_and_save(dfs, 'Massachusetts College of Pharmacy and Health Sciences', 'MassachusettsCollegePharmacyHealthSciences.csv')

# MIT

#### Polishing Columns

In [61]:
dfs = prep_dfs('MassachusettsInstituteTechnology')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [62]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column
# others do not have at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2021-2022.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']


In [63]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2021-2022.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [64]:
polish_and_save(dfs, 'Massachusetts Institute of Technology', 'MassachusettsInstituteTechnology.csv')

# MGH Institute of Health Professions

#### Polishing Columns

In [65]:
dfs = prep_dfs('MGHInstituteHealthProfessions')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [66]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']


dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [67]:
polish_and_save(dfs, 'MGH Institute of Health Professions', 'MGHInstituteHealthProfessions.csv')

# New England College of Optometry

#### Polishing Columns

In [68]:
dfs = prep_dfs('NewEnglandCollegeOptometry')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['Street Number', 'Street Name', 'Street Suffix', 'Unit Number', 'ZIP', 'Undergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZi

In [69]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [70]:
polish_and_save(dfs, 'New England College of Optometry', 'NewEnglandCollegeOptometry.csv')

# New England Conservatory

#### Polishing Columns

In [71]:
dfs = prep_dfs('NewEnglandConservatory')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFT-time (FT) or \nPT-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6

In [72]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [73]:
polish_and_save(dfs, 'New England Conservatory', 'NewEnglandConservatory.csv')

# New England Law

#### Polishing Columns

In [74]:
dfs = prep_dfs('NewEnglandLaw')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [75]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [76]:
polish_and_save(dfs, 'New England Law', 'NewEnglandLaw.csv')

# Sattler College

#### Polishing Columns

In [77]:
dfs = prep_dfs('SattlerCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2021-2022.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']


In [78]:
# 2022-2023 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2021-2022.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']


#### Save the Result

In [79]:
polish_and_save(dfs, 'Sattler College', 'SattlerCollege.csv')

# SHOWA Boston Institute

#### Polishing Columns

In [80]:
dfs = prep_dfs('SHOWABostonInstitute')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [81]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [82]:
polish_and_save(dfs, 'SHOWA Boston Institute', 'SHOWABostonInstitute.csv')

# Simmons College

#### Polishing Columns

In [83]:
dfs = prep_dfs('SimmonsCollege')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nSt Su

  warn(msg)


In [84]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2021-2022.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [85]:
polish_and_save(dfs, 'Simmons College', 'SimmonsCollege.csv')

# St. John's Seminary

#### Polishing Columns

In [86]:
dfs = prep_dfs('StJohnSeminary')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [87]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [88]:
polish_and_save(dfs, 'St John Seminary', 'StJohnSeminary.csv')

# Suffolk University

#### Polishing Columns

In [89]:
dfs = prep_dfs('SuffolkUniversity')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [90]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])


print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [91]:
polish_and_save(dfs, 'Suffolk University', 'SuffolkUniversity.csv')

# Tufts University

#### Polishing Columns

In [92]:
dfs = prep_dfs('TuftsUniversity')
print_columns(dfs)


2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [93]:
# in 2016-2017, let's remove the columns that do not belong there
# remove:  'Unnamed: 8', 'Home address', 'Current address same as home address', 'SMFA student'
dfs['2016-2017.xlsx'].drop(columns=['Unnamed: 8', 'Home address', 'Current address same as home address', 'SMFA student'], inplace=True)
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [94]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)


2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [95]:
polish_and_save(dfs, 'Tufts University', 'TuftsUniversity.csv')

# University of Massachusetts Boston

#### Polishing Columns

In [96]:
dfs = prep_dfs('UniversityMassachusettsBoston')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  '

In [97]:
# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [98]:
polish_and_save(dfs, 'University of Massachusetts Boston', 'UniversityMassachusettsBoston.csv')

# Urban College Boston

#### Polishing Columns

In [99]:
dfs = prep_dfs('UrbanCollegeBoston')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStree

In [100]:
# 2023-2024 has at-home column

dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])

dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])

print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [101]:
polish_and_save(dfs, 'Urban College of Boston', 'UrbanCollegeBoston.csv')

# Wentworth Institute of Technology

#### Polishing Columns

In [102]:
dfs = prep_dfs('WentworthInstituteTechnology')
print_columns(dfs)

2022-2023.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2019-2020.xlsx
['Street Number', 'Street Name', 'Unit Number', 'Degree Level', 'Full Part Time', 'Local Zip', 'Yes or No 5 or more']
2023-2024.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \nAt-Home or Not-at-Home']
2020-2021.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart

In [103]:
# 2019-2020 doesn't have suffix column

street_names = dfs['2019-2020.xlsx']['Street Name']

# fills nulls with empty string
street_names = street_names.fillna('')

# splits the street name by space
street_suffix = street_names.apply(lambda name: name.split(' ')[-1] if len(name.split(' ')) > 1 else '')

# splits the street name by space
street_name = street_names.apply(lambda name: ' '.join(name.split(' ')[:-1]))

dfs['2019-2020.xlsx']['Street Suffix'] = street_suffix
dfs['2019-2020.xlsx']['Street Name'] = street_name

In [104]:
dfs['2019-2020.xlsx'].columns

Index(['Street Number', 'Street Name', 'Unit Number', 'Degree Level',
       'Full Part Time', 'Local Zip', 'Yes or No 5 or more', 'Street Suffix'],
      dtype='object')

In [105]:
# reorder the columns for 2019-2020
# street_num - street_name - street_suffix - unit_num - zip_code - level_of_study - full_time - extra_large_unit
dfs['2019-2020.xlsx'] = dfs['2019-2020.xlsx'][['Street Number','Street Name','Street Suffix','Unit Number','Local Zip','Degree Level','Full Part Time','Yes or No 5 or more']]

# 2022-2023 has at-home column
# 2023-2024 has at-home column

dfs['2022-2023.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']
dfs['2023-2024.xlsx'].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','at_home']

for key in dfs:
    if key in ['2022-2023.xlsx','2023-2024.xlsx']:
        continue
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

In [106]:
dfs = fix_missing_columns(dfs, ['at_home','extra_large_unit'])
dfs = reorder_columns(dfs, ['extra_large_unit','at_home'])
print_columns(dfs)

2022-2023.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2019-2020.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2023-2024.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2020-2021.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit', 'at_home']
2018-2019.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_numbe

#### Save the Result

In [107]:
polish_and_save(dfs, 'Wentworth Institute of Technology', 'WentworthInstituteTechnology.csv')

# Wheelock College

#### Polishing Columns

In [108]:
dfs = prep_dfs('WheelockCollege')
print_columns(dfs)

2017-2018.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
2016-2017.xlsx
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']


In [109]:
for key in dfs:
    dfs[key].columns = ['street_number','street_name','street_suffix','unit_number','zip_code','level_of_study','full_time','extra_large_unit']

print_columns(dfs)

2017-2018.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']
2016-2017.xlsx
['street_number', 'street_name', 'street_suffix', 'unit_number', 'zip_code', 'level_of_study', 'full_time', 'extra_large_unit']


#### Save the Result

In [110]:
polish_and_save(dfs, 'Wheelock College', 'WheelockCollege.csv')