In [2]:
import pandas as pd
from pathlib import Path
import re
from rapidfuzz import fuzz, process
from geopy.geocoders import Nominatim
import us

BASE_DIR = Path("..")
ORIGINAL_DATA_DIR = BASE_DIR / "original_data"
CLEAN_DATA_DIR = BASE_DIR / "clean_data"
FILE_PATH = ORIGINAL_DATA_DIR / "nsf_terminations_airtable.csv"
TARGET_PATH = CLEAN_DATA_DIR / "nsf_terminations_airtable.csv"
def describe_dataset(df):
    summary = []
    for col in df.columns:
        col_data = df[col]
        summary.append({
            'Column': col,
            'Type': col_data.dtype,
            'Nulls': col_data.isnull().sum(),
            'Null %': round(col_data.isnull().mean() * 100, 2),
            'Unique': col_data.nunique(dropna=True),
            'Min': col_data.min() if pd.api.types.is_numeric_dtype(col_data) else None,
            'Max': col_data.max() if pd.api.types.is_numeric_dtype(col_data) else None,
            'Mean': round(col_data.mean(), 3) if pd.api.types.is_numeric_dtype(col_data) else None,
            'Std': round(col_data.std(), 3) if pd.api.types.is_numeric_dtype(col_data) else None,
            'Example values': col_data.dropna().unique()[:3].tolist()  # first 3 unique examples
        })
    
    summary_df = pd.DataFrame(summary)
    return summary_df



In [3]:
data = pd.read_csv(FILE_PATH)

display(data.head(5))

Unnamed: 0,grant_id,status,terminated,suspended,termination_date,reinstated,reinstatement_date,reinstatement_indicator,nsf_url,usaspending_url,...,usaspending_obligated,usaspending_outlaid,estimated_budget,estimated_outlays,estimated_remaining,division,directorate,div,dir,record_sha1
0,2201796,❌ Terminated,True,False,2025-04-18,False,,,https://www.nsf.gov/awardsearch/showAward?AWD_...,https://www.usaspending.gov/award/ASST_NON_220...,...,1083218.0,901981.05,1083218,901981.1,181236.9,Research on Learning in Formal and Informal Se...,STEM Education,DRL,EDU,5866299f5d416a23e01823f57f857aad2950cd4d
1,2315095,❌ Terminated,True,False,2025-04-18,False,,,https://www.nsf.gov/awardsearch/showAward?AWD_...,https://www.usaspending.gov/award/ASST_NON_231...,...,,,3641116,3641116.0,0.0,,,,,136bbe81751eb4f4ae26f6dceb68ee1339877daa
2,2236163,❌ Terminated,True,False,2025-04-18,False,,,https://www.nsf.gov/awardsearch/showAward?AWD_...,https://www.usaspending.gov/award/ASST_NON_223...,...,609429.0,348712.21,609429,348712.2,260716.8,Undergraduate Education,STEM Education,DUE,EDU,075e2156a2996eba5595f0095b20da6da75c06e7
3,2243822,🔄 Possibly Reinstated,True,False,2025-04-18,True,2025-06-30,Thakur v. Trump (Jun 2025),https://www.nsf.gov/awardsearch/showAward?AWD_...,https://www.usaspending.gov/award/ASST_NON_224...,...,,,3000000,1079409.0,1920591.0,,,,,f9a85b61f807953ecdb2a77c92362c042dcfb494
4,2216648,❌ Terminated,True,False,2025-04-18,False,,,https://www.nsf.gov/awardsearch/showAward?AWD_...,https://www.usaspending.gov/award/ASST_NON_221...,...,2719700.0,1547300.07,2719700,1547300.0,1172400.0,Biological Infrastructure,Biological Sciences,DBI,BIO,f35dec9f84e3160f0ff2eee79200f25ff5471f67


In [4]:
describe_dataset(data)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,grant_id,int64,0,0.0,1970,1231319,2531008,2243661.047,148987.307,"[2201796, 2315095, 2236163]"
1,status,object,0,0.0,2,,,,,"[❌ Terminated, 🔄 Possibly Reinstated]"
2,terminated,bool,0,0.0,1,True,True,1.0,0.0,[True]
3,suspended,bool,0,0.0,1,False,False,0.0,0.0,[False]
4,termination_date,object,0,0.0,12,,,,,"[2025-04-18, 2025-04-21, 2025-04-22]"
5,reinstated,bool,0,0.0,2,False,True,0.213,0.41,"[False, True]"
6,reinstatement_date,object,1550,78.68,4,,,,,"[2025-06-30, 2025-08-20, 2025-08-13]"
7,reinstatement_indicator,object,1554,78.88,2,,,,,"[Thakur v. Trump (Jun 2025), Thakur v. Trump (..."
8,nsf_url,object,0,0.0,1970,,,,,[https://www.nsf.gov/awardsearch/showAward?AWD...
9,usaspending_url,object,0,0.0,1970,,,,,[https://www.usaspending.gov/award/ASST_NON_22...


NULLS VALUES

Our hypothesis is that the 711 columns that have null values in usa_* are all the same for all variables that have 711 null. Let's check this.

In [5]:
usa_cols = [
    "usa_start_date", "usa_end_date", "usa_nsf_office",
    "usaspending_obligated", "usaspending_outlaid",
    "division", "directorate", "div", "dir"
]

mask = data[usa_cols].isna().all(axis=1)

print("Rows with all USA-related columns null:", mask.sum())

Rows with all USA-related columns null: 711


This code confirms this. This means that there are 711 entries that are on the nsf_* and not on the usa_*. This is not a problem, this means that we should use the 
"nsf_total_budget" variable and ignore the ones that have null for the 711 entries. The other null values make sense since they are reinstated dates of rows that have not been reinstated.
There is one more NULL value to deal with, there is one row with null on the abstract. Let's check what this is.

In [6]:
row_null_abstract = data[data["abstract"].isna()]

This seems like a normal row, for the moment wi will set the abstract to " " and keep with it.

In [7]:
data.loc[data["abstract"].isna(), "abstract"] = " "

We will first get the only columns that interest  us, we have noticed that terminated and suspended are always True so we can eliminate those columns also

In [8]:
selected_cols = [
    "grant_id",
    "status",
    "reinstated",
    "project_title",  
    "abstract",
    "org_name",
    "org_state",
    "nsf_total_budget"
]

data = data[[col for col in selected_cols if col in data.columns]]

We check for duplicades, which we realise that there are none.

In [9]:
data[data.duplicated(subset=["grant_id"], keep=False)]

Unnamed: 0,grant_id,status,reinstated,project_title,abstract,org_name,org_state,nsf_total_budget


In [10]:
convert_types = {
    "grant_id": "Int64",                    
    "status": "category",                    
    "reinstated": "boolean",              
    "project_title": "string",                
    "abstract": "string",                    
    "org_name": "string",                     
    "org_state": "category",                  
    "usaspending_obligated": "float64"        
}


for col, dtype in convert_types.items():
    if col in data.columns:
        if dtype == "category":
            data[col] = data[col].astype("category")

        elif dtype == "boolean":
            data[col] = data[col].astype("boolean")

        elif dtype in ["Int64", "float64"]:
            data[col] = pd.to_numeric(data[col]).astype(dtype)

        elif dtype == "string":
            # Clean and normalize text fields
            if col in ["project_title", "abstract"]:
                def clean_to_string(text):
                    if pd.isna(text):
                        return ""
                    text = text.lower()
                    text = re.sub(r"[^a-z\s]", " ", text)
                    text = re.sub(r"\s+", " ", text)
                    return text.strip()

                data[col] = data[col].astype("string").apply(clean_to_string)

            else:
                data[col] = data[col].astype("string").str.strip()

In [11]:
if "status" in data.columns:
    data["status"] = (
        data["status"]
        .astype("string")
        .str.replace("❌", "", regex=False)
        .str.replace("🔄", "", regex=False)
        .str.strip()
        .replace({
            "Possibly Reinstated": "Reinstated",
        })
    )

In [12]:
describe_dataset(data)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,grant_id,Int64,0,0.0,1970,1231319,2531008,2243661.047,148987.307,"[2201796, 2315095, 2236163]"
1,status,string[python],0,0.0,2,,,,,"[Terminated, Reinstated]"
2,reinstated,boolean,0,0.0,2,False,True,0.213,0.41,"[False, True]"
3,project_title,object,0,0.0,1649,,,,,[collaborative research investigating gender d...
4,abstract,object,0,0.0,1665,,,,,[despite evidence that gender differences in m...
5,org_name,string[python],0,0.0,507,,,,,"[Carnegie-Mellon University, University of Tex..."
6,org_state,category,0,0.0,52,,,,,"[PA, TX, NM]"
7,nsf_total_budget,int64,0,0.0,1642,6774,25000000,871612.779,1305502.115,"[1051218, 3641116, 609429]"


Let's change the USA state code to complete name so it mathces with us_state_population.csv. Note that
this check is also checking wether the sates are real and there are no invalid state names of inexistent states because the us_state_abbrev dictionary is completely correct.

In [13]:

us_state_abbrev = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'District of Columbia',
    'PR': 'Puerto Rico', 'GU': 'Guam', 'VI': 'Virgin Islands', 'AS': 'American Samoa'
}

data["org_state_full"] = data["org_state"].map(us_state_abbrev)

print(f"Unmatched rows: {len(data[data['org_state_full'].isna()])}")

print(len(us_state_abbrev.values()))

Unmatched rows: 0
55


Let's now check if there are any inconsistencies in the ORG name column, for this we will use rapidfuzz library with compares the
similarity of two names, we will display the results with 90 threshold, but we analysed it with and 80 one. The ones between 80 and 90 were not good matches.

In [14]:

orgs = sorted(data["org_name"].dropna().unique())
similar_pairs = []

for i, name in enumerate(orgs):
    matches = process.extract(name, orgs[i+1:], scorer=fuzz.token_sort_ratio, limit=5)
    for match, score, _ in matches:
        if score >= 90:  
            similar_pairs.append((name, match, score))

similar_df = pd.DataFrame(similar_pairs, columns=["org_name_1", "org_name_2", "similarity"])
display(similar_df.sort_values("similarity", ascending=False).head(30))


Unnamed: 0,org_name_1,org_name_2,similarity
4,Northeastern University,Northwestern University,95.652174
0,Boise State University,Bowie State University,95.454545
5,University of Colorado Boulder,University of Colorado at Boulder,95.238095
6,University of Washington,Washington University,93.333333
2,Clark University,Clarkson University,91.428571
3,Miami University,University of Miami,91.428571
1,CUNY Kingsborough Community College,CUNY Queensborough Community College,90.140845


The only two that should bother us are the "University of Colorado Boulder ---- University of Colorado at Boulder" and the "Miami University ---- University of Miami".
After doing some research online, the Miami ones correspond to the one in Ohio and the one in Florida. However the Boulder one seems to be the same university.

In [15]:
replacements = {
    "University of Colorado at Boulder": "University of Colorado Boulder"
}

data["org_name"] = data["org_name"].replace(replacements)

Let's now verify that the universities are in the states.

In [16]:
#DESCOMENTAR PER L'ENTREGA FINAL TRIGA MOLT I NO FA RES, NOMÉS DONA INFORMACIÓ
"""


geolocator = Nominatim(user_agent="nsf_org_checker")

def get_state_from_geopy(org_name):
    try:
        location = geolocator.geocode(org_name, addressdetails=True, timeout=10)
        if location and "address" in location.raw:
            address = location.raw["address"]
            state = address.get("state") or address.get("state_code")
            return state
    except Exception:
        pass
    return None

sample_df = data.copy() 

sample_df["real_state"] = sample_df["org_name"].apply(get_state_from_geopy)

def normalize_state_name(state):
    if not state:
        return None
    state_lookup = us.states.lookup(state)
    return state_lookup.name if state_lookup else state

sample_df["org_state_norm"] = sample_df["org_state_full"].apply(normalize_state_name)
sample_df["real_state_norm"] = sample_df["real_state"].apply(normalize_state_name)

# Find mismatches
mismatch = sample_df[
    (sample_df["real_state_norm"].notna()) &
    (sample_df["org_state_norm"].notna()) &
    (sample_df["real_state_norm"] != sample_df["org_state_norm"])
]

print(f"⚠️ Found {len(mismatch)} organizations with inconsistent states.\n")
display(mismatch[["org_name", "org_state_norm", "real_state_norm"]])
"""

'\n\n\ngeolocator = Nominatim(user_agent="nsf_org_checker")\n\ndef get_state_from_geopy(org_name):\n    try:\n        location = geolocator.geocode(org_name, addressdetails=True, timeout=10)\n        if location and "address" in location.raw:\n            address = location.raw["address"]\n            state = address.get("state") or address.get("state_code")\n            return state\n    except Exception:\n        pass\n    return None\n\nsample_df = data.copy() \n\nsample_df["real_state"] = sample_df["org_name"].apply(get_state_from_geopy)\n\ndef normalize_state_name(state):\n    if not state:\n        return None\n    state_lookup = us.states.lookup(state)\n    return state_lookup.name if state_lookup else state\n\nsample_df["org_state_norm"] = sample_df["org_state_full"].apply(normalize_state_name)\nsample_df["real_state_norm"] = sample_df["real_state"].apply(normalize_state_name)\n\n# Find mismatches\nmismatch = sample_df[\n    (sample_df["real_state_norm"].notna()) &\n    (sample

Now, since the library is based on search it can make mistakes. I checked manually all these univeritse and they all checked out, the bot made a mistake. 
There is one weird organization SHAKOPEE MDEWAKANTON SIOUX COMMUNITY.

We will now add the population by state

In [17]:
POP_PATH = CLEAN_DATA_DIR / "us_state_population.csv"
us_state_population = pd.read_csv(POP_PATH)
print(us_state_population.columns.tolist)
data = data.merge(
    us_state_population[["State", "state_population", "political_status"]],
    left_on="org_state_full",
    right_on="State",
    how="left"
)
print(data.columns.tolist)


<bound method IndexOpsMixin.tolist of Index(['State', 'state_population', 'political_status'], dtype='object')>
<bound method IndexOpsMixin.tolist of Index(['grant_id', 'status', 'reinstated', 'project_title', 'abstract',
       'org_name', 'org_state', 'nsf_total_budget', 'org_state_full', 'State',
       'state_population', 'political_status'],
      dtype='object')>


In [18]:
display(data[["org_state_full", "state_population", "political_status"]].head())


Unnamed: 0,org_state_full,state_population,political_status
0,Pennsylvania,12961683,Republican
1,Texas,30503301,Republican
2,New Mexico,2114371,Democratic
3,California,38965193,Democratic
4,Oklahoma,4053824,Republican


In [19]:
describe_dataset(data)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,grant_id,Int64,0,0.0,1970,1231319,2531008,2243661.047,148987.307,"[2201796, 2315095, 2236163]"
1,status,string[python],0,0.0,2,,,,,"[Terminated, Reinstated]"
2,reinstated,boolean,0,0.0,2,False,True,0.213,0.41,"[False, True]"
3,project_title,object,0,0.0,1649,,,,,[collaborative research investigating gender d...
4,abstract,object,0,0.0,1665,,,,,[despite evidence that gender differences in m...
5,org_name,string[python],0,0.0,506,,,,,"[Carnegie-Mellon University, University of Tex..."
6,org_state,category,0,0.0,52,,,,,"[PA, TX, NM]"
7,nsf_total_budget,int64,0,0.0,1642,6774,25000000,871612.779,1305502.115,"[1051218, 3641116, 609429]"
8,org_state_full,object,0,0.0,52,,,,,"[Pennsylvania, Texas, New Mexico]"
9,State,object,2,0.1,51,,,,,"[Pennsylvania, Texas, New Mexico]"


In [20]:
missing_pop = data[data["political_status"].isna()]

print(f"Organizations with missing state population: {len(missing_pop)}\n")
display(missing_pop[["org_name", "org_state", "State", "org_state_full", "political_status"]].head(20))

Organizations with missing state population: 11



Unnamed: 0,org_name,org_state,State,org_state_full,political_status
237,University of Puerto Rico-Rio Piedras,PR,Puerto Rico,Puerto Rico,
242,University of Puerto Rico,PR,Puerto Rico,Puerto Rico,
587,University of Puerto Rico Medical Sciences Campus,PR,Puerto Rico,Puerto Rico,
1004,University of Puerto Rico Mayaguez,PR,Puerto Rico,Puerto Rico,
1227,University of The Virgin Islands,VI,,Virgin Islands,
1387,University of The Virgin Islands,VI,,Virgin Islands,
1529,University of Puerto Rico-Arecibo Campus,PR,Puerto Rico,Puerto Rico,
1541,University of Puerto Rico-Rio Piedras,PR,Puerto Rico,Puerto Rico,
1657,UNIVERSITY OF PUERTO RICO AT CAROLINA,PR,Puerto Rico,Puerto Rico,
1760,University of Puerto Rico-Rio Piedras,PR,Puerto Rico,Puerto Rico,


This are NOT a USA state, it is a USA territory, therefore we will drop these 11 rows. We have also noticed that there is the D.C which is neither a state nor a territory, however for the sake of visualization we belive that since it is located in the USA peninsula we can keep it.

In [21]:
data = data.dropna()

In [22]:
describe_dataset(data)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,grant_id,Int64,0,0.0,1959,1231319,2531008,2243762.447,148703.097,"[2201796, 2315095, 2236163]"
1,status,string[python],0,0.0,2,,,,,"[Terminated, Reinstated]"
2,reinstated,boolean,0,0.0,2,False,True,0.214,0.411,"[False, True]"
3,project_title,object,0,0.0,1642,,,,,[collaborative research investigating gender d...
4,abstract,object,0,0.0,1658,,,,,[despite evidence that gender differences in m...
5,org_name,string[python],0,0.0,499,,,,,"[Carnegie-Mellon University, University of Tex..."
6,org_state,category,0,0.0,50,,,,,"[PA, TX, NM]"
7,nsf_total_budget,int64,0,0.0,1635,6774,25000000,868922.155,1303328.684,"[1051218, 3641116, 609429]"
8,org_state_full,object,0,0.0,50,,,,,"[Pennsylvania, Texas, New Mexico]"
9,State,object,0,0.0,50,,,,,"[Pennsylvania, Texas, New Mexico]"


In [23]:
print(list(data["org_state_full"].unique()))

['Pennsylvania', 'Texas', 'New Mexico', 'California', 'Oklahoma', 'Alabama', 'Nevada', 'Washington', 'Maryland', 'Minnesota', 'North Dakota', 'South Carolina', 'Nebraska', 'Illinois', 'Michigan', 'Utah', 'Kentucky', 'Arizona', 'New York', 'Idaho', 'Ohio', 'North Carolina', 'Virginia', 'Georgia', 'Indiana', 'Arkansas', 'Colorado', 'Missouri', 'District of Columbia', 'Oregon', 'New Hampshire', 'Massachusetts', 'Tennessee', 'Louisiana', 'New Jersey', 'Wisconsin', 'Maine', 'Florida', 'South Dakota', 'Mississippi', 'Kansas', 'Connecticut', 'Hawaii', 'Montana', 'Delaware', 'Rhode Island', 'Iowa', 'Alaska', 'West Virginia', 'Vermont']


We are missing the state of Wyoming. I don't know what can we do with this.

Now we will juse add the Cruz list in the dataset.

In [24]:
LIST_PATH = CLEAN_DATA_DIR / "cruz_list.csv"
cruz_list = pd.read_csv(LIST_PATH, sep=";")
print(cruz_list.columns.tolist())
data = data.merge(
    cruz_list,
    left_on="grant_id",
    right_on="grant_number",
    how="left"
)

['grant_number', 'in_cruz_list']


In [25]:
describe_dataset(data)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,grant_id,Int64,0,0.0,1959,1231319,2531008,2243762.447,148703.097,"[2201796, 2315095, 2236163]"
1,status,string[python],0,0.0,2,,,,,"[Terminated, Reinstated]"
2,reinstated,boolean,0,0.0,2,False,True,0.214,0.411,"[False, True]"
3,project_title,object,0,0.0,1642,,,,,[collaborative research investigating gender d...
4,abstract,object,0,0.0,1658,,,,,[despite evidence that gender differences in m...
5,org_name,string[python],0,0.0,499,,,,,"[Carnegie-Mellon University, University of Tex..."
6,org_state,category,0,0.0,50,,,,,"[PA, TX, NM]"
7,nsf_total_budget,int64,0,0.0,1635,6774,25000000,868922.155,1303328.684,"[1051218, 3641116, 609429]"
8,org_state_full,object,0,0.0,50,,,,,"[Pennsylvania, Texas, New Mexico]"
9,State,object,0,0.0,50,,,,,"[Pennsylvania, Texas, New Mexico]"


In [26]:
describe_dataset(cruz_list)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,grant_number,int64,0,0.0,1041,2011780,2520318,2266765.392,114388.845,"[2011780, 2027519, 2034824]"
1,in_cruz_list,bool,0,0.0,2,False,True,0.451,0.498,"[False, True]"


In [35]:
true_count_data = data["in_cruz_list"].sum()
print(f"✅ Number of True values in 'data': {true_count_data}")
true_count_list = cruz_list["in_cruz_list"].sum()
print(f"✅ Number of True values in 'cruz_list': {true_count_list}")

print(f"📊 Total number of grants in 'data': {len(data)}")

if "status" in data.columns and "in_cruz_list" in data.columns:
    reinstated_true_count = data[
        (data["status"] == "Reinstated") & (data["in_cruz_list"] == True)
    ].shape[0]

    print(f"🔄 Number of grants that are Reinstated AND True in cruz: {reinstated_true_count}")
else:
    print("⚠️ Columns 'status' or 'in_cruz_list' not found in data.")


✅ Number of True values in 'data': 1389
✅ Number of True values in 'cruz_list': 469
📊 Total number of grants in 'data': 1959
🔄 Number of grants that are Reinstated AND True in cruz: 368


The grants that are not on the dataset they for sure are NOT in cruz list. Therefore we change all NULL -> False

In [28]:
data["in_cruz_list"] = data["in_cruz_list"].apply(lambda x: False if x==None or x==False else True)
data = data.drop(columns=["grant_number"])
describe_dataset(data)

Unnamed: 0,Column,Type,Nulls,Null %,Unique,Min,Max,Mean,Std,Example values
0,grant_id,Int64,0,0.0,1959,1231319,2531008,2243762.447,148703.097,"[2201796, 2315095, 2236163]"
1,status,string[python],0,0.0,2,,,,,"[Terminated, Reinstated]"
2,reinstated,boolean,0,0.0,2,False,True,0.214,0.411,"[False, True]"
3,project_title,object,0,0.0,1642,,,,,[collaborative research investigating gender d...
4,abstract,object,0,0.0,1658,,,,,[despite evidence that gender differences in m...
5,org_name,string[python],0,0.0,499,,,,,"[Carnegie-Mellon University, University of Tex..."
6,org_state,category,0,0.0,50,,,,,"[PA, TX, NM]"
7,nsf_total_budget,int64,0,0.0,1635,6774,25000000,868922.155,1303328.684,"[1051218, 3641116, 609429]"
8,org_state_full,object,0,0.0,50,,,,,"[Pennsylvania, Texas, New Mexico]"
9,State,object,0,0.0,50,,,,,"[Pennsylvania, Texas, New Mexico]"


In [29]:
data.to_csv(TARGET_PATH, index=False)
