In [19]:
import pandas as pd
import numpy as np

def gini(x):
    """Return the Gini coefficient of the values in ``x``.

    The coefficient is half the relative mean absolute difference::

        G = sum_ij |x_i - x_j| / (2 * n**2 * mean(x))

    Instead of materialising the n-by-n matrix of pairwise differences,
    the double sum is evaluated from the sorted sample via the identity
    ``sum_ij |x_i - x_j| = 2 * sum_i (2*i - n - 1) * x_(i)``, where
    ``x_(i)`` is the i-th smallest value (i = 1..n).  This takes
    O(n log n) time and O(n) memory, so large samples are fine.

    Parameters
    ----------
    x : array_like
        Sample values.  Any shape is accepted; the values are flattened.
        They are converted to float, so unsigned-integer input cannot
        wrap around when differences are formed.

    Returns
    -------
    float
        The Gini coefficient, or ``nan`` when ``x`` is empty or its
        values sum to zero (the coefficient is undefined in both cases).
    """
    # axis=None flattens before sorting, so nested/2-D input is accepted.
    values = np.sort(np.asarray(x, dtype=float), axis=None)
    n = values.size
    total = values.sum()
    if n == 0 or total == 0:
        return np.nan

    # Weight of the i-th order statistic in the pairwise-difference sum.
    weights = 2.0 * np.arange(1, n + 1) - n - 1
    # n * total == n**2 * mean(x); the factors of 2 and 0.5 cancel.
    return np.dot(weights, values) / (n * total)

In [20]:
# Load the raw award export.  Set file_encoding to the file's encoding
# (utf8, latin1, etc.); bytes that cannot be decoded are kept as backslash
# escapes instead of raising.
file_encoding = 'utf8'
# The context manager closes the file handle as soon as parsing is done.
with open("data/DUE_awards.csv", encoding=file_encoding, errors='backslashreplace') as input_fd:
    nsf_df = pd.read_csv(input_fd)

# The cleanup assumes AwardedAmountToDate holds currency strings such as
# "$1,234.00".  Strip the "$" and thousands separators, parse numerically and
# round to whole dollars, so an amount with non-zero cents no longer breaks
# the integer conversion.  int64 is explicit so sums cannot overflow on
# platforms where the default int is 32-bit.
nsf_df['AwardedAmountToDate'] = (
    nsf_df['AwardedAmountToDate']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
    .round()
    .astype('int64')
)

# NOTE(review): the divisor presumably reflects an export spanning 10 years -
# confirm against the data before relying on the per-year figure.
years_covered = 10
yearly_total = nsf_df['AwardedAmountToDate'].sum() / years_covered
# Round to the nearest million dollars and report in $M.
print("The total amount awarded by NSF per year was $" + str(int(round(yearly_total, -6) / 1000000)) + "M")

# Quick check for duplicate award numbers
print(nsf_df['AwardNumber'].duplicated().value_counts())

The total amount awarded by NSF per year was $324M
AwardNumber
False    5187
Name: count, dtype: int64


In [21]:
# Collapse the table to one row per organization holding its summed award
# dollars.  This rebinds nsf_df, so the per-award rows are only available
# again after re-running the load cell.
nsf_df = nsf_df.groupby('Organization', as_index=False)['AwardedAmountToDate'].sum()
nsf_df

Unnamed: 0,Organization,AwardedAmountToDate
0,Montana Technological University,597181
1,"ACE Mentor Program of America, Inc.",25000
2,ARMSTRONG STATE UNIVERSITY,329334
3,"AUGUSTA UNIVERSITY RESEARCH INSTITUTE, INC.",905444
4,Adams State University,510021
...,...,...
1297,Yale University,2840555
1298,Yavapai College,1055348
1299,Yosemite Community College District,1103175
1300,Youngstown State University,999971


In [23]:
# First 20 rows when ordered from largest to smallest total award.
nsf_df.sort_values('AwardedAmountToDate', ascending=False).iloc[:20]

Unnamed: 0,Organization,AwardedAmountToDate
1204,University of Wisconsin-Madison,39172430
1089,University of Colorado at Boulder,33619071
610,Michigan State University,32257438
766,Pennsylvania State Univ University Park,28515449
788,Purdue University,27941380
50,Arizona State University,27430493
807,Regents of the University of Michigan - Ann Arbor,27070328
827,Rochester Institute of Tech,25185368
1140,University of Nebraska-Lincoln,24001581
24,American Association for the Advancement of Sc...,22903335


In [24]:
# Last 20 rows when ordered from largest to smallest total award.
nsf_df.sort_values('AwardedAmountToDate', ascending=False).iloc[-20:]

Unnamed: 0,Organization,AwardedAmountToDate
391,Franklin and Marshall College,38982
569,Luther College,36690
28,American Chemical Society (ACS),35650
432,Hamline University,33737
1094,University of Connecticut Health Center,32808
669,National Collegiate Inventors and Innovators A...,29945
38,American Psychological Assoc,29100
543,Lehigh University,27001
526,Lake Superior State University,26158
194,Centralia College,25000


In [22]:
# Gini coefficient of the AwardedAmountToDate column of nsf_df.
print(gini(nsf_df['AwardedAmountToDate'].to_numpy()))


0.6313838807995312
