In [1]:
import pandas as pd
import re

In [4]:
df = pd.read_csv("2016-2018.csv",low_memory=False) # We use low memory=False because each column has multiple data types

In [7]:
df.columns

Index(['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ',
       '6d.\n Unit #', '6e. \nZip', '7. \nUndergraduate (U) or Graduate (G)',
       '8. \nFull-time (FT) or \nPart-time (PT)',
       '9. \n 5 or More Undergrads/Unit\n(Y/N)', 'University', 'Year',
       'Unnamed: 8', 'Home address', 'Current address same as home address',
       'SMFA student', '7. \nUnderGaduate (U) or Gaduate (G)',
       '8. \nFT-time (FT) or \nPT-time (PT)'],
      dtype='object')

#### Problem universities were identified in the Data Concat scripts, this script tackles those problem universities, since different years have different issues, there is no set format for this code, modifications for each year were made to ensure that all the data shared matches the standard format. This script is closer to a template that was used.

In [75]:
temp = df[df["University"]=="Fisher College"].copy(deep=True)

In [76]:
temp.head()

Unnamed: 0,6a. \nStreet #,6b. \nStreet Name,6c. \nStreet Suffix,6d.\n Unit #,6e. \nZip,7. \nUndergraduate (U) or Graduate (G),8. \nFull-time (FT) or \nPart-time (PT),9. \n 5 or More Undergrads/Unit\n(Y/N),University,Year,Unnamed: 8,Home address,Current address same as home address,SMFA student,7. \nUnderGaduate (U) or Gaduate (G),8. \nFT-time (FT) or \nPT-time (PT)
13229,,8 Westminster Ave Apt 1,,,2134.0,UG,FT,,Fisher College,2016-2017,,,,,,
13230,,"17 Cazenove Street, Apt. BW",,,2199.0,UG,FT,,Fisher College,2016-2017,,,,,,
13231,,1156 Commonwealth Ave #49,,,2116.0,UG,FT,,Fisher College,2016-2017,,,,,,
13232,,"770 Boylston Street, Apt. 3J",,Apt. 18,2134.0,UG,PT,,Fisher College,2016-2017,,,,,,
13233,,"45 Stuart Street, #2804",,,2111.0,UG,FT,,Fisher College,2016-2017,,,,,,


The below functions takes in a complete address and breaks it down into the street number, street name, street suffix, unit number (if available) and zip code.

In [77]:
def parse_address(address):
    address = address.replace(","," ")
    print(address)
    add_split = address.split()
    try:
        stno=int(add_split[0])
        add_split.pop(0)
    except:
        # add_split=["N/A stno"]+add_split
        stno=None
    y= set(add_split).intersection(set(['St','Street','Ave','Avenue','Blvd','Boulevard','Rd','Road','Steet']))
    if len(y)==1:
        i=add_split.index(list(y)[0])
        j=add_split.pop(i)
        suff = j
    else:
        suff=None
    if "Unit" in add_split:
        i=add_split.index("Unit")
        apt = " ".join(add_split[i:])
        add_split = add_split[:i]
    elif "Apt" in add_split:
        i=add_split.index("Apt")
        apt = " ".join(add_split[i:])   
        add_split = add_split[:i]
    elif "Apt." in add_split:
        i=add_split.index("Apt.")
        apt = " ".join(add_split[i:])
        add_split = add_split[:i]
    elif "#" in add_split:
        i=add_split.index("#")
        apt = " ".join(add_split[i:])
        add_split = add_split[:i]
    else:
        if len(add_split)>=1:
            x=add_split[len(add_split)-1]
            if "#" in x:
                apt=x
                add_split.pop(-1)
            else:
                apt=None
        else:
            apt=None

    st_name = " ".join(add_split)

    return stno, st_name, suff, apt
        
        


In [78]:
temp[['6a. \nStreet #', '6b. \nStreet Name', '6c. \nStreet Suffix  ','6d.\n Unit #']] = temp['6b. \nStreet Name'].apply(parse_address).apply(pd.Series)

8 Westminster Ave Apt 1
17 Cazenove Street  Apt. BW
1156 Commonwealth Ave #49
770 Boylston Street  Apt. 3J
45 Stuart Street  #2804
1152 Commonwalth Avenue
630 Washington Street #405
27 Topliff St
170 Tremont Street
504 Beacon St
580 Washington St Unit 312
580 Washington St.
29 Saint Margret St.
401 Mount Vernon Street  Apt. 724
580 Washington St.
45 Fairbanks St
368 Riverway
1079 Commonweather Ave #327
1079 Commonwealth Ave  Apt. 505
101 Canal St
196 Hillside Street
79 Howland St
12 Kevin Rd
50 Harbor point blvd
1540 Tremont St
89 E Squantum Street
12 Hancock Street
12 Kevin rd
62 Boylston St Apt. 915
506 Beacon Street
660 Washington St
10 Parkerhill Ave
4 Fountain Place Apt. 5
217 Kelton Street  Unit 14
277 Marlborough st
276 Corey Rd  Apt #23
77 Walk Hill St
660 Washington Street
858 Huntington Ave Apt 1
32 Gallivan Blvd
50 Lewis St.
323 Beacon Street
660 Washington St.
9 Crestnay
66 Beach Point Place
12 Kevin Rd
61 South Huntington Ave  #203
1 Nassau St Apt. 907
630 Washington St
15

In [79]:
temp.head()

Unnamed: 0,6a. \nStreet #,6b. \nStreet Name,6c. \nStreet Suffix,6d.\n Unit #,6e. \nZip,7. \nUndergraduate (U) or Graduate (G),8. \nFull-time (FT) or \nPart-time (PT),9. \n 5 or More Undergrads/Unit\n(Y/N),University,Year,Unnamed: 8,Home address,Current address same as home address,SMFA student,7. \nUnderGaduate (U) or Gaduate (G),8. \nFT-time (FT) or \nPT-time (PT)
13229,8.0,Westminster,Ave,Apt 1,2134.0,UG,FT,,Fisher College,2016-2017,,,,,,
13230,17.0,Cazenove,Street,Apt. BW,2199.0,UG,FT,,Fisher College,2016-2017,,,,,,
13231,1156.0,Commonwealth,Ave,#49,2116.0,UG,FT,,Fisher College,2016-2017,,,,,,
13232,770.0,Boylston,Street,Apt. 3J,2134.0,UG,PT,,Fisher College,2016-2017,,,,,,
13233,45.0,Stuart,Street,#2804,2111.0,UG,FT,,Fisher College,2016-2017,,,,,,


In [80]:
temp.to_csv("Fisher College.csv")