# Exercise 6-2: Do more cleaning on the Awards data

## Read the data

In [56]:
import pandas as pd

In [57]:
# Read the cleaned NSF awards DataFrame from its pickle file.
# NOTE: If the read fails, run the notebook from exercise 6.1 first; it creates this file.
ex61_pickle_path = "nsf_awards_ex61_cleaned.pkl"
awards_df = pd.read_pickle(ex61_pickle_path)

In [58]:
# Preview the top five rows of the DataFrame
awards_df.head(5)

Unnamed: 0,awd_id,agcy_id,title,cfda_num,org_code,po_phone,po_email,po_sign_block_name,awd_eff_date,awd_exp_date,...,div_abbr,division,inst_name,inst_street_address,inst_street_address_2,inst_city_name,inst_state_code,inst_phone_num,inst_zip_code,inst_country_name
31,855351,NSF,Modeling Cloud Responses to Global Atmospheric...,47.05,6020105,,,edward bensman,2009-12-01,2014-11-30,...,AGS,Division of Atmospheric and Geospace Sciences,University of Texas at Dallas,800 WEST CAMPBELL RD.,SP2.25,RICHARDSON,TX,9728832000.0,750803021,United States
32,856009,NSF,Materials World Network: Local Surface Chemist...,47.049,3070010,,,Diana Farkas,2009-12-15,2013-11-30,...,DMR,Division Of Materials Research,University of Houston,4300 MARTIN LUTHER KING BLVD,,HOUSTON,TX,7137436000.0,772043067,United States
58,908968,NSF,RET Site: Nanotechnology Research Experience f...,47.041,7050000,,,Mary Poats,2010-04-01,2014-09-30,...,EEC,Division of Engineering Education and Centers,William Marsh Rice University,6100 MAIN ST,,Houston,TX,7133485000.0,770051827,United States
68,910527,NSF,Study of Theoretically Interesting Molecules a...,47.049,3090003,,,Tyrone Mitchell,2009-10-15,2013-09-30,...,CHE,Division Of Chemistry,University of North Texas,1112 DALLAS DR STE 4000,,DENTON,TX,9405654000.0,762051132,United States
75,911317,NSF,Collaborative Research: Impact of Spatial and ...,47.05,6030109,,,Thomas Torgersen,2010-04-01,2015-03-31,...,EAR,Division Of Earth Sciences,Texas A&M Research Foundation,400 HARVEY MITCHELL PKWY S STE 300,,COLLEGE STATION,TX,9798627000.0,778454375,United States


## Examine the data

In [59]:
# Summarize awards_df: index range, plus each column's name, non-null count and dtype.
# Columns whose non-null count is below the number of entries contain missing values.
awards_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11040 entries, 31 to 187543
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   awd_id                 11040 non-null  int64         
 1   agcy_id                11040 non-null  object        
 2   title                  11040 non-null  object        
 3   cfda_num               11040 non-null  object        
 4   org_code               11040 non-null  int64         
 5   po_phone               7058 non-null   float64       
 6   po_email               7058 non-null   object        
 7   po_sign_block_name     11040 non-null  object        
 8   awd_eff_date           11040 non-null  datetime64[ns]
 9   awd_exp_date           11040 non-null  datetime64[ns]
 10  tot_intn_awd_amt       11040 non-null  float64       
 11  awd_amount             11040 non-null  float64       
 12  dir_abbr               11040 non-null  category      
 13  dire

In [60]:
# Tally how many rows carry each distinct 'directorate' value
awards_df['directorate'].value_counts()

directorate
Directorate for Mathematical and Physical Sciences                  2384
Directorate for Engineering                                         2129
Directorate for Computer and Information Science and Engineering    1963
Directorate for Geosciences                                         1088
Directorate for STEM Education                                      1009
Directorate for Biological Sciences                                  791
Directorate for Social, Behavioral and Economic Sciences             741
Directorate for Technology, Innovation, and Partnerships             725
Office Of The Director                                               210
Name: count, dtype: int64

In [61]:
# List the distinct values found in the 'inst_city_name' column
awards_df['inst_city_name'].unique()

array(['RICHARDSON', 'HOUSTON', 'Houston', 'DENTON', 'COLLEGE STATION',
       'AUSTIN', 'EDINBURG', 'WACO', 'DALLAS', 'SAN ANTONIO',
       'KINGSVILLE', 'EL PASO', 'Dallas', 'FULSHEAR', 'College Station',
       'SAN MARCOS', 'Austin', 'ARLINGTON', 'LUBBOCK', 'FORT WORTH',
       'LAREDO', 'CORPUS CHRISTI', 'Beaumont', 'HUNTSVILLE', 'LONGVIEW',
       'KERRVILLE', 'Arlington', 'UNIVERSAL CITY', 'Brownsville',
       'AMARILLO', 'BAYTOWN', 'THE WOODLANDS', 'Prairie View', 'Spring',
       'Manchaca', 'Fort Worth', 'Richardson', 'El Paso', 'Lubbock',
       'West Lake Hills', 'austin', 'Bryan', 'NACOGDOCHES', 'PLANO',
       'Irving', 'TYLER', 'MARSHALL', 'PRAIRIE VIEW', 'ABILENE',
       'COMMERCE', 'GALVESTON', 'GEORGETOWN', 'GRAPEVINE',
       'Grand Prairie', 'CEDAR HILL', 'Waco', 'Laredo', 'Pearland',
       'SAN ANGELO', 'San Antonio', 'Georgetown', 'MCKINNEY', 'COPPELL',
       'Galveston', 'Flower Mound', 'Little Elm', 'FRISCO', 'Plano',
       'CANYON', 'Round Rock', 'Elgin', 

## Fix spelling and capitalization problems in the data

In [62]:
# Set the contents of all text columns with the 'inst_' prefix to title case.
# inst_state_code is skipped in the loop and upper-cased once afterwards,
# instead of being title-cased first and then converted back.
inst_columns = [col for col in awards_df.columns if col.startswith('inst_')]
for col in inst_columns:
    if col == 'inst_state_code':
        continue
    col_dtype = awards_df[col].dtype
    # Accept pandas' dedicated string dtype as well as plain object columns;
    # checking only for 'object' would silently skip string-dtype columns.
    if col_dtype == 'object' or pd.api.types.is_string_dtype(col_dtype):
        awards_df[col] = (
            awards_df[col]
            .str.title()
            # str.title() treats an apostrophe as a word boundary, so a
            # possessive or contraction comes out as "Woman'S" / "It'S".
            # Put that trailing "s" back to lower case. Names such as
            # "O'Connor" are unaffected because the letter before the
            # apostrophe is upper case there.
            .str.replace(r"([a-z]['’])S\b", r"\1s", regex=True)
        )

# State codes are kept in upper case
awards_df['inst_state_code'] = awards_df['inst_state_code'].str.upper()

In [63]:
# Re-inspect the distinct 'inst_city_name' values after the case clean-up
awards_df['inst_city_name'].unique()

array(['Richardson', 'Houston', 'Denton', 'College Station', 'Austin',
       'Edinburg', 'Waco', 'Dallas', 'San Antonio', 'Kingsville',
       'El Paso', 'Fulshear', 'San Marcos', 'Arlington', 'Lubbock',
       'Fort Worth', 'Laredo', 'Corpus Christi', 'Beaumont', 'Huntsville',
       'Longview', 'Kerrville', 'Universal City', 'Brownsville',
       'Amarillo', 'Baytown', 'The Woodlands', 'Prairie View', 'Spring',
       'Manchaca', 'West Lake Hills', 'Bryan', 'Nacogdoches', 'Plano',
       'Irving', 'Tyler', 'Marshall', 'Abilene', 'Commerce', 'Galveston',
       'Georgetown', 'Grapevine', 'Grand Prairie', 'Cedar Hill',
       'Pearland', 'San Angelo', 'Mckinney', 'Coppell', 'Flower Mound',
       'Little Elm', 'Frisco', 'Canyon', 'Round Rock', 'Elgin',
       'Mansfield', 'Pasadena', 'Missouri City', 'Harker Heights',
       'Wichita Falls', 'Seguin', 'Lancaster', 'Sugar Land',
       'Farmers Branch', 'Hutto', 'Sherman', 'La Jolla', 'Port Aransas',
       'Brenham', 'Leander', 'Steph

In [64]:
# List the distinct values found in the 'inst_state_code' column
awards_df['inst_state_code'].unique()

array(['TX'], dtype=object)

In [65]:
# List the distinct values found in the 'inst_street_address' column
awards_df['inst_street_address'].unique()

array(['800 West Campbell Rd.', '4300 Martin Luther King Blvd',
       '6100 Main St', '1112 Dallas Dr Ste 4000',
       '400 Harvey Mitchell Pkwy S Ste 300', '110 Inner Campus Dr',
       '1201 W University Dr', '700 S University Parks Dr', '3124 Tamu',
       '2700 Bay Area Blvd # M105', '6425 Boaz St Rm 130', '2147 Tamu',
       '1 Utsa Cir', '700 N University Blvd', '1 Trinity Pl',
       '500 W University Ave', '2515 Mckinney Ave',
       '8285 El Rio St Ste 150', '11525 Stonehollow Dr Ste A135',
       '7920 Belt Line Road', '8500 Shoal Creek Blvd',
       '3815 Walker Falls Ln', '10410 Miller Road',
       '11142 Hopes Creek Rd', '6611 Morningside Dr', '601 University Dr',
       nan, '701 S Nedderman Dr', '2500 Broadway',
       '3101 Bellaire Drive North', '1700 University Dr', '1 Baylor Plz',
       '5201 University Blvd', '6300 Ocean Dr Unit 5739',
       '4400 S M L King Jr Pkwy', '1806 Ave J', '6220 Culebra Rd',
       '2100 S Mobberly Ave', '2100 Memorial Blvd', '7000 Fan

In [66]:
# Strip the trailing period from common street-type abbreviations in the
# 'inst_street_address' column, e.g. 'St.' becomes 'St' and 'Rd.' becomes 'Rd'.
# NOTE: This is a simplified list; adjust the entries to suit your data.
abbreviations = {
    'St.': 'St',
    'Rd.': 'Rd',
    'Ave.': 'Ave',
    'Blvd.': 'Blvd',
    'Dr.': 'Dr',
    'Pkwy.': 'Pkwy',
    'Cir.': 'Cir',
    'Ct.': 'Ct',
    'Pl.': 'Pl'
}
street_address = awards_df['inst_street_address']
for dotted, undotted in abbreviations.items():
    # regex=False: match the text literally, so '.' is not a wildcard
    street_address = street_address.str.replace(dotted, undotted, regex=False)
awards_df['inst_street_address'] = street_address


In [67]:
# Build the 'inst_full_address' column in the form
# "<street address>, <city>, <state code> <zip code>".
# NOTE: a row with a missing value in any of the four source columns gets a
# missing full address, because the concatenation propagates NaN.
street_part = awards_df['inst_street_address'] + ', '
city_part = awards_df['inst_city_name'] + ', '
state_zip_part = awards_df['inst_state_code'] + ' ' + awards_df['inst_zip_code']
awards_df['inst_full_address'] = street_part + city_part + state_zip_part


In [69]:
# Preview the top five rows of the awards DataFrame after cleaning
awards_df.head(5)

Unnamed: 0,awd_id,agcy_id,title,cfda_num,org_code,po_phone,po_email,po_sign_block_name,awd_eff_date,awd_exp_date,...,division,inst_name,inst_street_address,inst_street_address_2,inst_city_name,inst_state_code,inst_phone_num,inst_zip_code,inst_country_name,inst_full_address
31,855351,NSF,Modeling Cloud Responses to Global Atmospheric...,47.05,6020105,,,edward bensman,2009-12-01,2014-11-30,...,Division of Atmospheric and Geospace Sciences,University Of Texas At Dallas,800 West Campbell Rd,Sp2.25,Richardson,TX,9728832000.0,750803021,United States,"800 West Campbell Rd, Richardson, TX 750803021"
32,856009,NSF,Materials World Network: Local Surface Chemist...,47.049,3070010,,,Diana Farkas,2009-12-15,2013-11-30,...,Division Of Materials Research,University Of Houston,4300 Martin Luther King Blvd,,Houston,TX,7137436000.0,772043067,United States,"4300 Martin Luther King Blvd, Houston, TX 7720..."
58,908968,NSF,RET Site: Nanotechnology Research Experience f...,47.041,7050000,,,Mary Poats,2010-04-01,2014-09-30,...,Division of Engineering Education and Centers,William Marsh Rice University,6100 Main St,,Houston,TX,7133485000.0,770051827,United States,"6100 Main St, Houston, TX 770051827"
68,910527,NSF,Study of Theoretically Interesting Molecules a...,47.049,3090003,,,Tyrone Mitchell,2009-10-15,2013-09-30,...,Division Of Chemistry,University Of North Texas,1112 Dallas Dr Ste 4000,,Denton,TX,9405654000.0,762051132,United States,"1112 Dallas Dr Ste 4000, Denton, TX 762051132"
75,911317,NSF,Collaborative Research: Impact of Spatial and ...,47.05,6030109,,,Thomas Torgersen,2010-04-01,2015-03-31,...,Division Of Earth Sciences,Texas A&M Research Foundation,400 Harvey Mitchell Pkwy S Ste 300,,College Station,TX,9798627000.0,778454375,United States,"400 Harvey Mitchell Pkwy S Ste 300, College St..."


In [70]:
# Save the cleaned DataFrame as a new pickle file
ex62_pickle_path = "nsf_awards_ex62_cleaned.pkl"
awards_df.to_pickle(ex62_pickle_path)