In [20]:
import pandas as pd
import numpy as np
import re

Let's set up a universal column format for a better experience.

In [21]:
# Canonical column layout that every cleaned yearly file should end up with.
columns = [
    'street_number',
    'street_name',
    'street_suffix',
    'unit_number',
    'zip_code',
    'level_of_study',
    'full_time',
    'extra_large_unit',
    'at_home',
]

In [22]:
# Folder holding the raw workbooks, one file per academic year.
# NOTE: `dir` shadows the built-in of the same name; the name is kept as-is.
dir = 'data/universities/NortheasternUniversity/'
files = [
    '2016-2017.xlsx',
    '2017-2018.xlsx',
    '2018-2019.xlsx',
    '2019-2020.xlsx',
    '2020-2021.xlsx',
    '2021-2022.xlsx',
    '2022-2023.xlsx',
    '2023-2024.xlsx',
]

# Read the 'Student Addresses' sheet of each workbook into a dict keyed by
# the workbook's file name.
dfs = {}
for file in files:
    dfs[file] = pd.read_excel(
        dir + file,
        sheet_name='Student Addresses',
    )


  warn(msg)
  warn(msg)


Let's see what the columns are.

In [23]:
# Print the header of every loaded workbook so differing layouts stand out.
for file_name, frame in dfs.items():
    print(list(frame.columns))

['Address', 'Neighborhood', 'Zipcode', 'Level', 'Time Status', 'Year', 'Professional School', 'Home or Private?']
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ', '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)', '8. \nFull-time (FT) or \nPart-time (PT)', '9. \n 5 or More Undergrads/Unit\n(Y/N)']
['Address', 'Neighborhood', 'Zipcode', 'Level', 'Time Status', 'Year', 'Professional School', 'Home or Private?']
['Address', 'Neighborhood', 'Zipcode', 'Level', 'Time Status', 'Year', 'Professional School', 'Home or Privat

There are 2 distinct groups of columns. Let's group the files by their columns and print the groups, so we know which files to focus on and how to fix them.

In [24]:
# Bucket the file names by header signature: the column names joined with
# single spaces, with every newline character removed. Files that share a
# signature land in the same bucket.
column_grouping = {}

for file_name, frame in dfs.items():
    signature = re.sub(r'\n', '', ' '.join(frame.columns))
    column_grouping.setdefault(signature, []).append(file_name)

# Show each signature followed by the list of files that use it.
for signature, file_names in column_grouping.items():
    print(signature)
    print(file_names)


Address Neighborhood Zipcode Level Time Status Year Professional School Home or Private?
['2016-2017.xlsx', '2020-2021.xlsx', '2021-2022.xlsx', '2022-2023.xlsx', '2023-2024.xlsx']
6a. Street # 6b. Street Name 6c. Street Suffix   6d. Unit # 6e. Zip 7. Undergraduate (U) or Graduate (G) 8. Full-time (FT) or Part-time (PT) 9.  5 or More Undergrads/Unit(Y/N)
['2017-2018.xlsx', '2018-2019.xlsx', '2019-2020.xlsx']


To explain the next cell, looking over the dataframes, some of the columns could simply be renamed and it would suffice.

The only issue is that the Address column holds the full address, and we have to separate it into "street_number", "street_name", "street_suffix", and "unit_number".

However, first let's fix the different columns

In [25]:
# Source header -> canonical column name. Columns that do not appear as a key
# here are dropped from the affected files.
mapping = {
    "Address" : "street_name",
    "Zipcode" : "zip_code",
    "Level" : "level_of_study",
    "Time Status" : "full_time",
    "Home or Private?" : "at_home",
}

# These are the files that use the 'Address / Neighborhood / Zipcode / ...'
# header layout. For each one, discard the columns the mapping does not
# mention, then rename the remaining columns to their canonical names.
for file_name in (
    '2016-2017.xlsx',
    '2020-2021.xlsx',
    '2021-2022.xlsx',
    '2022-2023.xlsx',
    '2023-2024.xlsx',
):
    frame = dfs[file_name]
    unmapped = [c for c in frame.columns if c not in mapping]
    dfs[file_name] = frame.drop(columns=unmapped).rename(columns=mapping)

Now we have to fix the rest of the dataframes. The column order looks fine, so we only need to change the names. (We are not using the universal column list above because these files do not have the at_home column yet.)

In [26]:
# Canonical names for the files with the numbered '6a. ... 9.' headers,
# listed in the same order as those headers appear in the files.
# This rebinds `columns` to an eight-name list that has no 'at_home' entry.
columns = [
    'street_number',
    'street_name',
    'street_suffix',
    'unit_number',
    'zip_code',
    'level_of_study',
    'full_time',
    'extra_large_unit',
]

# Positional rename: the i-th existing column receives the i-th name above.
for file_name in ('2017-2018.xlsx', '2018-2019.xlsx', '2019-2020.xlsx'):
    dfs[file_name].columns = columns

Add the missing columns to the first group

In [27]:
# now fix the missing from the 2016-2017.xlsx, 2020-2021.xlsx, 2021-2022.xlsx, 2022-2023.xlsx, 2023-2024.xlsx
def fix_columns(df, columns):
    """Add every column named in ``columns`` that ``df`` does not have yet.

    Each added column is filled with NaN. Columns already present are left
    untouched. The frame is modified in place.

    Args:
        df: DataFrame to complete.
        columns: Iterable of column names that must exist afterwards.

    Returns:
        The same ``df`` object that was passed in.
    """
    absent = [name for name in columns if name not in df.columns]
    for name in absent:
        df[name] = np.nan
    return df

# Give each 'Address'-layout file the columns from `columns` it still lacks.
for file_name in (
    '2016-2017.xlsx',
    '2020-2021.xlsx',
    '2021-2022.xlsx',
    '2022-2023.xlsx',
    '2023-2024.xlsx',
):
    dfs[file_name] = fix_columns(dfs[file_name], columns)

Reorder the columns so that every dataframe is in the same format.

In [28]:
def reorder(df, columns):
    """Return ``df`` restricted to ``columns``, in the order they are given.

    Columns of ``df`` that are not named in ``columns`` are not part of the
    result.

    Args:
        df: DataFrame to select from.
        columns: Column names in the desired order.

    Returns:
        The selection ``df[columns]``.
    """
    ordered = df[columns]
    return ordered

# Bring every yearly frame into the shared column order given by `columns`.
# A plain `reorder(frame, columns)` discards any column not listed in
# `columns`, which would lose the 'at_home' values renamed from
# 'Home or Private?'. Frames that already have 'at_home' therefore keep it
# as their last column.
for file_name in list(dfs):
    frame = dfs[file_name]
    keep_at_home = 'at_home' in frame.columns and 'at_home' not in columns
    ordered_columns = list(columns) + (['at_home'] if keep_at_home else [])
    dfs[file_name] = reorder(frame, ordered_columns)

Now we can add the at_home column

In [11]:
# Add an empty at_home column to the dataframes that do not have one yet.
# Frames that already carry an at_home column are skipped, so existing
# values are never overwritten with NaN.
for file_name in list(dfs):
    if 'at_home' not in dfs[file_name].columns:
        dfs[file_name]['at_home'] = np.nan

Do what we did for every other dataset: add the university and year range so we know exactly where the data came from when working with the final dataset.

In [12]:
# Tag every frame with where its rows came from: the year range (the file
# name up to its first '.') and the university name.
for file_name, frame in dfs.items():
    frame['year_range'] = file_name.split('.')[0]
    frame['university'] = 'Northeastern University'

In [13]:
# Stack the yearly frames in chronological file order into a single frame;
# ignore_index=True gives the result a fresh running index.
df = pd.concat(
    [
        dfs[file_name]
        for file_name in (
            '2016-2017.xlsx',
            '2017-2018.xlsx',
            '2018-2019.xlsx',
            '2019-2020.xlsx',
            '2020-2021.xlsx',
            '2021-2022.xlsx',
            '2022-2023.xlsx',
            '2023-2024.xlsx',
        )
    ],
    ignore_index=True,
)

# Save the result

In [14]:
# Write the combined frame to the cleaned-data folder, leaving out the index.
df.to_csv(
    path_or_buf='./data/clean/NortheasternUniversity.csv',
    index=False,
)