In [22]:
import pandas as pd

# Function to parse the address into street number, name, suffix, and apartment/unit number
def parse_address(address):
    """Split a free-form street address into its component parts.

    Commas are treated as whitespace and the address is tokenised on
    whitespace. The parts are then peeled off in this order:

    1. A leading integer token becomes the street number.
    2. The first unit marker ("Unit", "Apt", "Apt." or any token starting
       with "#", matched case-insensitively) and everything after it
       becomes the unit.
    3. The last street-suffix token that precedes the unit (e.g. "St",
       "Ave.", "ROAD") becomes the suffix.
    4. Whatever tokens remain are joined with single spaces to form the
       street name.

    Args:
        address: The raw address value. Anything that is not missing is
            converted with ``str()`` before parsing.

    Returns:
        A 4-tuple ``(street_number, street_name, suffix, unit)``.
        All four are ``None`` when ``address`` is missing (``pd.isna``).
        Otherwise ``street_number`` is an ``int`` or ``None``,
        ``street_name`` is a (possibly empty) ``str``, and ``suffix`` and
        ``unit`` are the matched text as written in the input, or ``None``.
    """
    if pd.isna(address):
        return None, None, None, None

    tokens = str(address).replace(",", " ").split()

    # Only an empty address (IndexError) or a non-numeric first token
    # (ValueError) is expected here; anything else should surface.
    try:
        stno = int(tokens[0])
    except (IndexError, ValueError):
        stno = None
    else:
        tokens.pop(0)

    # Pull the unit off first so that the suffix search below cannot pick
    # up a token that belongs to the unit description.
    unit_markers = {"unit", "apt"}
    apt = None
    for i, token in enumerate(tokens):
        # "#3" (no space) is as common as "# 3", so match on the prefix.
        if token.startswith("#") or token.rstrip(".").casefold() in unit_markers:
            apt = " ".join(tokens[i:])
            tokens = tokens[:i]
            break

    # Scan from the right: in names such as "St Botolph St" the leading
    # "St" is part of the name and the trailing one is the real suffix.
    suffixes = {"st", "street", "ave", "avenue", "blvd", "boulevard", "rd", "road"}
    suff = None
    for i in range(len(tokens) - 1, -1, -1):
        if tokens[i].rstrip(".").casefold() in suffixes:
            suff = tokens.pop(i)
            break

    st_name = " ".join(tokens)

    return stno, st_name, suff, apt

# Shared pipeline for every academic-year workbook: the steps were previously
# copy-pasted once per year, so a fix in one copy could silently miss the other.
def _process_student_addresses(input_path, output_path):
    """Split the 'Address' column of one workbook and save the result.

    Reads the 'Student Addresses' sheet of ``input_path``, runs
    ``parse_address`` on every value of the 'Address' column, stores the
    four resulting parts in the '6a'-'6d' columns, drops the 'Address' and
    'Neighborhood' columns, renames 'Zipcode' to '6e. Zip', moves the
    '6a'-'6d' columns to the front and writes the frame to ``output_path``
    without the index.

    Args:
        input_path: Path of the Excel workbook to read.
        output_path: Path of the Excel workbook to write.

    Returns:
        The processed DataFrame that was written to ``output_path``.
    """
    part_columns = ['6a. Street #', '6b. Street Name', '6c. Street Suffix', '6d. Unit #']

    df = pd.read_excel(input_path, sheet_name='Student Addresses', skiprows=0)

    # parse_address returns one 4-tuple per row; zip(*) transposes those
    # rows into four column-wise tuples.
    street_no, street_name, suffix, unit = zip(*df['Address'].apply(parse_address))
    df['6a. Street #'] = street_no
    df['6b. Street Name'] = street_name
    df['6c. Street Suffix'] = suffix
    df['6d. Unit #'] = unit

    # Drop the 'Address' and 'Neighborhood' columns
    df = df.drop(columns=['Address', 'Neighborhood'])

    # Rename the 'Zipcode' column to '6e. Zip'
    df = df.rename(columns={'Zipcode': '6e. Zip'})

    # Reorder the columns to have 6a, 6b, 6c, and 6d as the first four columns
    remaining = [col for col in df.columns if col not in part_columns]
    df = df[part_columns + remaining]

    # Save the processed file
    df.to_excel(output_path, index=False)
    return df


# Northeastern 2022-23, 'Student Addresses' sheet
file_path_2022_23 = r'Northeastern University 2022-23.xlsx'
df_2022_23 = _process_student_addresses(file_path_2022_23, 'Northeastern_2022_23_updated.xlsx')

# Northeastern 2023-24, 'Student Addresses' sheet
file_path_2023_24 = r'Northeastern University 2023-24.xlsx'
df_2023_24 = _process_student_addresses(file_path_2023_24, 'Northeastern_2023_24_updated.xlsx')

print("Files processed and saved.")


Files processed and saved.
