In [98]:
import pandas as pd
from pathlib import Path
import re
from rapidfuzz import fuzz, process
from geopy.geocoders import Nominatim
import us
from datetime import datetime

BASE_DIR = Path("..")
ORIGINAL_DATA_DIR = BASE_DIR / "original_data"
CLEAN_DATA_DIR = BASE_DIR / "clean_data"
FILE_PATH = ORIGINAL_DATA_DIR / "nsf_awards_us_2019_2024.csv"
TARGET_PATH = CLEAN_DATA_DIR / "nsf_awards_us_2019_2024.csv"
def describe_dataset(df):
    summary = []
    for col in df.columns:
        col_data = df[col]
        summary.append({
            'Column': col,
            'Type': col_data.dtype,
            'Nulls': col_data.isnull().sum(),
            'Null %': round(col_data.isnull().mean() * 100, 2),
            'Unique': col_data.nunique(dropna=True),
            'Min': col_data.min() if pd.api.types.is_numeric_dtype(col_data) else None,
            'Max': col_data.max() if pd.api.types.is_numeric_dtype(col_data) else None,
            'Mean': round(col_data.mean(), 3) if pd.api.types.is_numeric_dtype(col_data) else None,
            'Std': round(col_data.std(), 3) if pd.api.types.is_numeric_dtype(col_data) else None,
            'Example values': col_data.dropna().unique()[:3].tolist()  # first 3 unique examples
        })
    
    summary_df = pd.DataFrame(summary)
    return summary_df



In [99]:
data = pd.read_csv(FILE_PATH)



In [100]:
describe_dataset(data)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,abstractText,object,246,0.36,43878,,,,,[This project aims to serve the national inter...
1,agency,object,0,0.0,2,,,,,"[NSF, US]"
2,awardeeCountryCode,object,0,0.0,232,,,,,"[US, Naval Postgraduate School, ADIALANTE L.L.C.]"
3,awardeeName,object,0,0.0,4507,,,,,"[Loyola University Maryland, Inc., University ..."
4,awardeeStateCode,object,0,0.0,295,,,,,"[MD, WY, AZ]"
5,date,object,0,0.0,1753,,,,,"[08/20/2025, 07/21/2025, 08/28/2025]"
6,startDate,object,0,0.0,344,,,,,"[12/15/2025, 12/01/2025, 11/15/2025]"
7,expDate,object,0,0.0,596,,,,,"[11/30/2028, 11/30/2029, 11/30/2027]"
8,title,object,29,0.04,43957,,,,,[Cornerstones for an Undergraduate Quantum Com...


In [101]:
import pandas as pd
from datetime import datetime

def normalize_cols(cols):
    fixed = []
    for c in cols:
        c_clean = c.strip()
        if c_clean.lower().endswith("bstracttext"):
            c_clean = "abstractText"
        fixed.append(c_clean)
    return fixed

data.columns = normalize_cols(data.columns)

for col in ["startDate", "expDate", "date"]:
    if col in data.columns:
        parsed = pd.to_datetime(data[col], format="%m/%d/%Y", errors="coerce")
        if parsed.isna().mean() > 0.5:
            parsed = pd.to_datetime(data[col], errors="coerce", infer_datetime_format=True)
        data[col] = parsed
target = datetime(2025, 4, 15)
data = data[
    (data["startDate"].notna()) &
    (data["expDate"].notna()) &
    (data["startDate"] <= target) &
    (data["expDate"] >= target)
].copy()

keep = ["id", "awardeeName", "awardeeStateCode", "startDate", "expDate", "title", "abstractText", "nsf_url"]
keep_existing = [c for c in keep if c in data.columns]
data = data[keep_existing].reset_index(drop=True)

print(f"✅ Active on 2024-12-31: {len(data)} rows; columns kept: {keep_existing}")


✅ Active on 2024-12-31: 38572 rows; columns kept: ['awardeeName', 'awardeeStateCode', 'startDate', 'expDate', 'title', 'abstractText']


In [102]:
string_cols = ["title", "abstractText", "awardeeName","awardeeStateCode"]

for col in string_cols:
    if col in ["title", "abstractText"]:
        def clean_to_string(text):
            if pd.isna(text):
                return ""
            text = text.lower()
            text = re.sub(r"[^a-z\s]", " ", text)
            text = re.sub(r"\s+", " ", text)
            return text.strip()

        data[col] = data[col].astype("string").apply(clean_to_string)
    else:
        data[col] = data[col].astype("string").str.strip()

In [103]:

us_state_abbrev = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'District of Columbia',
    'PR': 'Puerto Rico', 'GU': 'Guam', 'VI': 'Virgin Islands', 'AS': 'American Samoa'
}

data["org_state_full"] = data["awardeeStateCode"].map(us_state_abbrev)

print(f"Unmatched rows: {data[data['org_state_full'].isna()]}")

print(len(us_state_abbrev.values()))

Unmatched rows:                      awardeeName awardeeStateCode  startDate    expDate  \
12086  NORTHERN MARIANAS COLLEGE               MP 2023-10-01 2026-09-30   
12713  NORTHERN MARIANAS COLLEGE               MP 2023-10-01 2026-09-30   

                                                   title  \
12086  collaborative research epiic cultivating innov...   
12713  collaborative research epiic cultivating innov...   

                                            abstractText org_state_full  
12086  this is a collaborative project between the fo...            NaN  
12713  this is a collaborative project between the fo...            NaN  
55


In [104]:
data = data.dropna()

In [105]:
valid_states_and_dc = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN',
    'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV',
    'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN',
    'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC'
]
data = data[data["awardeeStateCode"].isin(valid_states_and_dc)].copy()


In [106]:


BASE_DIR = Path("..")
CLEAN_DATA_DIR = BASE_DIR / "clean_data"
FILE_PATH_TERM = CLEAN_DATA_DIR / "nsf_terminations_airtable.csv"
terminations = pd.read_csv(FILE_PATH_TERM)
total_dataset = data.copy()


terminations["org_name_norm"] = terminations["org_name"]
total_dataset["awardeeName_norm"] = total_dataset["awardeeName"]

term_names = set(terminations["org_name_norm"].unique())
total_names = set(total_dataset["awardeeName_norm"].unique())

# Check overlap
missing_in_total = term_names - total_names
missing_in_term = total_names - term_names

print(f"Universities in both datasets: {len(term_names & total_names)}")
print(f"Universities in terminations but not in total: {len(missing_in_total)}")
print(f"Universities in total but not in terminations: {len(missing_in_term)}")


Universities in both datasets: 475
Universities in terminations but not in total: 24
Universities in total but not in terminations: 1990


In [107]:
import difflib

print("\nFinding closest matches for universities in terminations not found in total_dataset:\n")
print(len(missing_in_total))
for name in list(missing_in_total):
    matches = difflib.get_close_matches(name, total_names, n=10, cutoff=0.8)

    original_term = terminations.loc[
        terminations["org_name_norm"] == name, "org_name"
    ].iloc[0]

    if matches:
        print(f"\n'{original_term}' → possible matches:")
        for i, m in enumerate(matches, 1):
            original_total = total_dataset.loc[
                total_dataset["awardeeName_norm"] == m, "awardeeName"
            ].iloc[0]
            print(f"  {i}. {original_total}")
    else:
        print(f"\n'{original_term}' → no close matches found")



Finding closest matches for universities in terminations not found in total_dataset:

24

'Coastal and Estuarine Research Federation' → no close matches found

'LEARNING NETWORK, LLC, THE' → no close matches found

'INSTITUTE FOR NATIVE PACIFIC EDUCATION AND CULTURE' → no close matches found

'Manhattanville College' → possible matches:
  1. Manhattan College

'Regis College' → possible matches:
  1. Rhodes College
  2. Reed College

'NEVADA STATE UNIVERSITY' → no close matches found

'Society of Women Engineers' → no close matches found

'Crow Canyon Archaeological Center' → no close matches found

'CLAUDIUS LEGAL INTELLIGENCE INC' → no close matches found

'Wepan Inc' → no close matches found

'Inclusive Engineering Consortium, Inc.' → no close matches found

'American Society For Cell Biology' → possible matches:
  1. American Society For Microbiology

'Allegheny College' → no close matches found

'GPRA Strategic Management, Inc.' → no close matches found

'American Chemical Societ

In [108]:
manual_replacements = {
    "University of Colorado at Boulder": "University of Colorado Boulder",
}

total_dataset["awardeeName"] = total_dataset["awardeeName"].replace(manual_replacements)
total_dataset["awardeeName_norm"] = total_dataset["awardeeName"]


In [109]:
data.to_csv(TARGET_PATH, index=False)
