In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os

class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'placeholder','appliance','toward','available'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 3 and w not in self.ignore_set:
                words.append(w)
        return words

class StemTokenizerRemoveDoubles(object):
    def __init__(self):
        self.ignore_set = {'placeholder','appliance','toward','available'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 3 and w not in self.ignore_set and w not in words:
                words.append(w)
        return words


In [2]:
%matplotlib inline

import pandas as pd

grants_df = pd.read_csv("data/grants.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def get_year(x):
  m = x.split('-')
  return int(m[0])

grants_df['year'] = grants_df['Award Date'].apply(get_year)

In [6]:
pd.value_counts(grants_df['year'].values).sort_index()

1991        1
1996        3
1997      140
1998      337
1999      454
2000      392
2001      482
2002      448
2003      447
2004    18725
2005    25966
2006    21012
2007    15606
2008    13927
2009    16494
2010    15930
2011    16029
2012    20833
2013    19328
2014    22805
2015    22387
2016    25121
2017    26027
2018     1054
dtype: int64

In [7]:
grants_df2 = grants_df.loc[grants_df.Description.apply(type) != float]
grants_df3 = grants_df2[(grants_df2['year'] > 2004) & (grants_df2['year'] < 2018)]

def get_raw_data():
  grouped_df = grants_df3.groupby('year')
  indices = list(grouped_df.indices.keys())
  indices.sort()
  print(indices)
#  return list(zip(indices,grouped_df['Description'].apply('\n'.join).tolist()))
  return list(zip(indices,grouped_df['description_strings'].apply('\n'.join).tolist()))

In [8]:
grouped_df = grants_df3.groupby('year')
indices = list(grouped_df.indices.keys())
indices.sort()
print(indices)

[2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]


In [9]:
tokenizer = StemTokenizerRemoveDoubles()
grants_df3['token_descriptions'] = grants_df3['Description'].apply(tokenizer)
grants_df3['description_strings'] = grants_df3['token_descriptions'].apply(' '.join)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
yearmask = grants_df3['year'] == 2008
token_mask = grants_df3['token_descriptions'].apply((lambda x, key: True if key in x else False),args=('horse',))
size = len(grants_df3[(yearmask) & (token_mask)])
dict = grants_df3[(yearmask) & (token_mask)].sample(5 if size > 5 else size)[['Title','Identifier','Amount Awarded','Funding Org:Name','Recipient Org:Name']].to_dict()
dict['size'] = size

In [11]:
def award_data(keywords,year):
  print('data')
  dict = {}
  yearmask = grants_df3['year'] == year
  for idx,kw in enumerate(keywords):
    dict[kw] = get_grant_data(grants_df3,kw,yearmask)
    if(idx % 100 == 0): print(idx)        
  return dict

def award_values(keywords,year):
  print('award values')
  dict = {}
  yearmask = grants_df3['year'] == year
  for idx,kw in enumerate(keywords):
    dict[kw] = get_amount_awarded(grants_df3,kw,yearmask)
    if(idx % 100 == 0): print(idx)        
  return dict

def beneficiaries(keywords,year):
  print('benefs')
  dict = {}
  yearmask = grants_df3['year'] == year
  for idx,kw in enumerate(keywords):
    dict[kw] = get_beneficiary(grants_df3,kw,yearmask)
    if(idx % 100 == 0): print(idx)        
  return dict

def get_mask(df,keyword,yearmask):
  token_mask = grants_df3['token_descriptions'].apply((lambda x, key: True if key in x else False),args=(keyword,))
  return (token_mask) & (yearmask)

def get_amount_awarded(df,keyword,yearmask):
  token_mask = grants_df3['token_descriptions'].apply((lambda x, key: True if key in x else False),args=(keyword,))
#  mask = get_mask(df,keyword,yearmask)
  return df[(yearmask) & (token_mask)]['Amount Awarded'].sum()

def get_beneficiary(df,keyword,yearmask):
  token_mask = grants_df3['token_descriptions'].apply((lambda x, key: True if key in x else False),args=(keyword,))
#  mask = get_mask(df,keyword,yearmask)
  return df[(yearmask) & (token_mask)]['Funding Org:Name'].value_counts().to_dict()    

def get_grant_data(df,keyword,yearmask):
  token_mask = grants_df3['token_descriptions'].apply((lambda x, key: True if key in x else False),args=(keyword,))
  size = len(grants_df3[(yearmask) & (token_mask)])
  dict = grants_df3[(yearmask) & (token_mask)].sample(5 if size > 5 else size)[['Title','Identifier','Amount Awarded','Funding Org:Name','Recipient Org:Name']].to_dict()
  dict['size'] = size
  return dict

In [12]:
def process_text(counts, vectorizer, text, year, index):
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}

    result = {w: c for w, c in result.items() if c > 4}
    normalizing_factor = max(c for c in result.values())

    result = {w: c / normalizing_factor
              for w, c in result.items()}

    return result

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

data = list(get_raw_data())
print('Data loaded')
n = len(data)

vectorizer = TfidfVectorizer(stop_words='english',
                             max_df=(n-1) / n,
                             tokenizer=StemTokenizer())

tfids = vectorizer.fit_transform(text for p, text in data).toarray()

print('Vectorization done.')

[2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
Data loaded
Vectorization done.


In [15]:
index = 0
result = {w: tfids[index][vectorizer.vocabulary_.get(w)]
          for w in vectorizer.get_feature_names()}
sorted_by_value = sorted(result.items(), key=lambda kv: kv[1], reverse=True)
sorted_by_value[0:199]

[('spouse', 0.8686458800117584),
 ('60th', 0.37118179864504475),
 ('malta', 0.17968260495918828),
 ('commemoration', 0.11013677743733379),
 ('holland', 0.09648384923770248),
 ('wartime', 0.08152991457816042),
 ('egypt', 0.07564146432955801),
 ('singapore', 0.07523902610762334),
 ('memorabilia', 0.07051552971112636),
 ('legion', 0.06794159548180034),
 ('reunion', 0.06627410032589758),
 ('burma', 0.057639148879980576),
 ('gibraltar', 0.03638403543245197),
 ('tunisia', 0.03332369600355414),
 ('remembrance', 0.033254142212994546),
 ('evacuee', 0.03192554504880753),
 ('1940s', 0.029944629119756445),
 ('sicily', 0.027034777724644254),
 ('jersey', 0.025162790859826596),
 ('victory', 0.021762413716606786),
 ('norway', 0.021137385261004553),
 ('trafalgar', 0.01862631571282927),
 ('russia', 0.01826635980713785),
 ('postage', 0.01803217570704634),
 ('naval', 0.017589411630041022),
 ('greece', 0.016158702906314253),
 ('cyprus', 0.014621621715845185),
 ('raid', 0.01440099890366156),
 ('normandy', 0

In [18]:
def process_tfidfs(tfidfs, vectorizer, index):
    result = {w: tfidfs[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}

    result = {w: c for w, c in result.items() if c > 0}
    normalizing_factor = max(c for c in result.values())

    result = {w: c / normalizing_factor
              for w, c in result.items()}
    result = sorted(result.items(), key=lambda kv: kv[1], reverse=True)
    result = result[:200]
    
    return dict((x,y) for x, y in result)

In [21]:
json_output = {}
for i, (year, text) in enumerate(data):
    json_output[year] = {}
    result = process_tfidfs(tfids, vectorizer, i)
    award_data_vals = award_data(result.keys(),year)
    with codecs.open('output/{}-awarddata.json'.format(year), 'w', encoding='utf-8') as f:
      f.write(json.dumps(award_data_vals, ensure_ascii=False))
    json_output[year]['award_data'] = award_data_vals
    award_vals = award_values(result.keys(),year)
    json_output[year]['award'] = award_vals
    benec_vals = beneficiaries(result.keys(),year)
    json_output[year]['benefactors'] = benec_vals
    result = {w: tfids[i][vectorizer.vocabulary_.get(w)]
          for w in vectorizer.get_feature_names() if tfids[i][vectorizer.vocabulary_.get(w)] > 0 }
    json_output[year]['word_values'] = result

    with codecs.open('output/{}-awards.json'.format(year), 'w', encoding='utf-8') as f:
      f.write(json.dumps(award_vals, ensure_ascii=False))
    with codecs.open('output/{}-benecs.json'.format(year), 'w', encoding='utf-8') as f:
      f.write(json.dumps(benec_vals, ensure_ascii=False))
    with codecs.open('output/{}-words.json'.format(year), 'w', encoding='utf-8') as f:
      f.write(json.dumps(result, ensure_ascii=False))    

    print('Processing done for {}'.format(year))

with codecs.open('output/alljson.json'.format(year), 'w', encoding='utf-8') as f:
  f.write(json.dumps(json_output, ensure_ascii=False))    
    

data
0
100
award values
0
100
benefs
0
100
Processing done for 2005
data
0
100
award values
0
100
benefs
0
100
Processing done for 2006
data
0
100
award values
0
100
benefs
0
100
Processing done for 2007
data
0
100
award values
0
100
benefs
0
100
Processing done for 2008
data
0
100
award values
0
100
benefs
0
100
Processing done for 2009
data
0
100
award values
0
100
benefs
0
100
Processing done for 2010
data
0
100
award values
0
100
benefs
0
100
Processing done for 2011
data
0
100
award values
0
100
benefs
0
100
Processing done for 2012
data
0
100
award values
0
100
benefs
0
100
Processing done for 2013
data
0
100
award values
0
100
benefs
0
100
Processing done for 2014
data
0
100
award values
0
100
benefs
0
100
Processing done for 2015
data
0
100
award values
0
100
benefs
0
100
Processing done for 2016
data
0
100
award values
0
100
benefs
0
100
Processing done for 2017
