# Clean Data

In [16]:
# import packages
import os
import pandas as pd
import nltk
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords
from string import punctuation as original_punct
import re
from collections import Counter
import string
from nltk.stem import WordNetLemmatizer 

# read in data
df = pd.read_csv("Data_IRS_ExcludeMuseumOnly.csv")

In [17]:
# get name column and make lowercase
names = df["name"]
names = [str(name).lower() for name in names]

In [18]:
# word equivalents
names2 = []
for name in names:
    name = re.sub(r"\bctr\b", "center", name)
    name = re.sub(r"\bcntr\b", "center", name)
    name = re.sub(r"\bassn\b", "association", name)
    name = re.sub(r"\bassoc\b", " association", name)
    names2.append(name)

In [19]:
# remove stop words
nltk.download('stopwords')
stop_words = stopwords.words('english') + list(original_punct)
# lemmatize
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer() 
from nltk.stem import PorterStemmer
ps = PorterStemmer()

name_tokens = []
for name in names2:
    name = WhitespaceTokenizer().tokenize(name)           # tokenize
    name = filter(lambda x: x.isalpha(), name)            # remove numerics
    name = [lemmatizer.lemmatize(w) for w in name]
    name = [w for w in name if not w in stop_words]          
    name_tokens.append(name)
    
# string tweets together again
names = [" ".join(name) for name in name_tokens] 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ydeng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ydeng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
names

['pta oregon congress',
 'societi cosmet chemist',
 'g p librari',
 'open door baptist church council bluff',
 'phi upsilon omicron inc',
 'fellowship commun church evangel free church',
 'nation wild turkey feder inc',
 'oneeighti church san diego inc',
 'ladi day raffl fund inc',
 'fruit harvest ministri',
 'regent univers minnesota',
 'socal media pro',
 'good friend lowcountri',
 'asian dream foundat',
 'center atheism',
 'ayla olson inc',
 'sigma chi fratern',
 'public school employe washington',
 'hardi counti extens servic foundat inc',
 'summit academi commun school altern learner',
 'legion christ consecr r c member assist foundat',
 'bridg hope ministri intl',
 'commun health initi inc',
 'share life ministri',
 'kenmor commun',
 'nation societi daughter american revolut',
 'american legion auxiliari',
 'eric dolch children enceph foundat inc',
 'bethani presbyterian parish',
 'pentecost christian church boston',
 'first baptist church',
 'jasi jamaican advantag sport youth',

## Count Keyword Frequencies

In [15]:
# create lemmatized keyword list
keywords = pd.read_csv("uni_bi_grams_frequency_lemmatized.csv").head(200).iloc[:,2:3]
keywords

Unnamed: 0,ngram
0,museum
1,inc
2,society
3,foundation
4,historical
...,...
195,legacy
196,force
197,preservation foundation
198,street


In [41]:
frequencies = []
for keyword in keywords["ngram"]:
    freq = sum(keyword in name for name in names)
    frequencies.append(freq)
    print(frequencies)
    
freq_df = pd.DataFrame(list(zip(keywords, frequencies)), columns = ["keyword", "freq"])
freq_df

[3489]
[3489, 598427]
[3489, 598427, 24]
[3489, 598427, 24, 258]
[3489, 598427, 24, 258, 13]
[3489, 598427, 24, 258, 13, 740]
[3489, 598427, 24, 258, 13, 740, 0]
[3489, 598427, 24, 258, 13, 740, 0, 57860]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707]
[3489, 598427, 24

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 2

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975, 15

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975, 150, 4532, 0, 77295, 2, 2, 21315, 36, 10572, 13621, 35, 7900, 0, 0, 171, 199, 21466]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975, 150, 4532, 0, 77295, 2, 2, 21315, 36, 10572, 13621, 35, 7900, 0, 0, 171, 199, 21466, 3, 8794, 4566, 204494, 0, 5, 32644, 20394, 0, 504, 0, 0, 0, 3770]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 17

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975, 150, 4532, 0, 77295, 2, 2, 21315, 36, 10572, 13621, 35, 7900, 0, 0, 171, 199, 21466, 3, 8794, 4566, 204494, 0, 5, 32644, 20394, 0, 504, 0, 0, 0, 3770, 10671, 6290, 0, 10099, 17526, 18028, 3890, 388, 105, 206, 3936, 6, 6809]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975, 150, 4532, 0, 77295, 2, 2, 21315, 36, 10572, 13621, 35, 7900, 0, 0, 171, 199, 21466, 3, 8794, 4566, 204494, 0, 5, 32644, 20394, 0, 504, 0, 0, 0, 3770, 10671, 6290, 0, 10099, 17526, 18028, 3890, 388, 105, 206, 3936, 6, 6809, 9, 0, 23, 0, 15, 0, 25, 0, 3420, 22780, 8571, 15]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 1

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975, 150, 4532, 0, 77295, 2, 2, 21315, 36, 10572, 13621, 35, 7900, 0, 0, 171, 199, 21466, 3, 8794, 4566, 204494, 0, 5, 32644, 20394, 0, 504, 0, 0, 0, 3770, 10671, 6290, 0, 10099, 17526, 18028, 3890, 388, 105, 206, 3936, 6, 6809, 9, 0, 23, 0, 15, 0, 25, 0, 3420, 22780, 8571, 15, 5022, 4, 825, 26, 1, 60, 8201, 3750, 1048, 96, 8]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 

[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 17095, 207, 726, 72, 17291, 0, 24, 0, 0, 0, 0, 23798, 136, 15598, 80432, 40513, 1934, 2, 0, 33, 12302, 303, 2259, 40434, 11, 10120, 16, 2, 0, 542, 1, 2, 118, 9, 537, 209, 2, 8783, 0, 15498, 9, 12760, 16, 20941, 26477, 171, 486437, 13941, 22923, 7232, 9658, 135, 14, 100, 400, 400, 36526, 12153, 53, 0, 1, 181, 702, 0, 308, 0, 0, 0, 0, 6274, 0, 8221, 5052, 0, 0, 57975, 150, 4532, 0, 77295, 2, 2, 21315, 36, 10572, 13621, 35, 7900, 0, 0, 171, 199, 21466, 3, 8794, 4566, 204494, 0, 5, 32644, 20394, 0, 504, 0, 0, 0, 3770, 10671, 6290, 0, 10099, 17526, 18028, 3890, 388, 105, 206, 3936, 6, 6809, 9, 0, 23, 0, 15, 0, 25, 0, 3420, 22780, 8571, 15, 5022, 4, 825, 26, 1, 60, 8201, 3750, 1048, 96, 8, 13, 11206, 8518, 0, 6607, 0, 0, 684, 17, 0]
[3489, 598427, 24, 258, 13, 740, 0, 57860, 135, 13, 59761, 42, 9, 0, 0, 24900, 4174, 781, 707, 50744, 7, 6178, 30, 451, 0, 12029, 170

KeyboardInterrupt: 

In [36]:
freq

598427

In [37]:
frequencies = []
frequencies.append(freq)
frequencies
frequencies.append(2)
frequencies

[598427, 2]