# Clean Data

In [74]:
# import packages
import os
import pandas as pd
import nltk
from nltk import WhitespaceTokenizer
from nltk.corpus import stopwords
from string import punctuation as original_punct
import re
from collections import Counter
import string
from nltk.stem import WordNetLemmatizer 

# read in data
df = pd.read_csv("Data_IRS_ExcludeMuseumOnly.csv")

# read in list of keywords
keyword_df = pd.read_csv("Search Terms_TT.csv")


In [85]:
# get name column and make lowercase
names = df["name"]
names = [str(name).lower() for name in names]

keywords = keyword_df["Keyword"]
keywords = [str(keyword).lower() for keyword in keywords]


In [76]:
# remove word equivalents
names2 = []
for name in names:
    name = re.sub(r"\bctr\b", "center", name)
    name = re.sub(r"\bcntr\b", "center", name)
    name = re.sub(r"\bassn\b", "association", name)
    name = re.sub(r"\bassoc\b", " association", name)
    names2.append(name)


In [77]:
# create stop word list
nltk.download('stopwords')
stop_words = stopwords.words('english') + list(original_punct)

# define lemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer() 

name_tokens = []
for name in names2:
    name = WhitespaceTokenizer().tokenize(name)           # tokenize
    name = filter(lambda x: x.isalpha(), name)            # remove numerics
    name = [lemmatizer.lemmatize(w) for w in name]        # lemmatize
    name = [w for w in name if not w in stop_words]       # remove stop words 
    name_tokens.append(name)
    
# string tweets together again
names = [" ".join(name) for name in name_tokens] 

df["name"] = names

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ydeng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ydeng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
# lemmatize keywords
keyword_tokens = []
for keyword in keywords:
    keyword = WhitespaceTokenizer().tokenize(keyword)           # tokenize
    keyword = [lemmatizer.lemmatize(w) for w in keyword]        # lemmatize
    keyword = [w for w in keyword if not w in stop_words]       # remove stop words 
    keyword_tokens.append(keyword)
    
# string keywords together again and put back in keyword_df
keywords = [" ".join(keyword) for keyword in keyword_tokens]
keyword_df["Keyword"] = keywords

keyword_df.drop_duplicates("Keyword")
keywords = list(set(keywords))


## Count Keyword Frequencies

In [91]:
is_1 = keyword_df["Include"] == 1
keyword_df_1 = keyword_df[is_1]

# get frequencies for keywords in list 1
matching_keyword = []
names_w_keyword = []
codes = []   
for keyword in keyword_df_1["Keyword"]:
    for index, row in df.iterrows():
        if keyword in row["name"]:
            matching_keyword.append(keyword)
            names_w_keyword.append(row["name"])
            codes.append(row["ntee_cd"])
matching_keywords_1 = pd.DataFrame(list(zip(matching_keyword, names_w_keyword, codes)), columns = ["keyword", "name", "ntee_cd"])
    
matching_keywords_1

Unnamed: 0,keyword,name,ntee_cd
0,hall fame,steel guitar hall fame inc,
1,hall fame,philadelphia music hall fame,A23
2,hall fame,westford academy athletic hall fame corp,O50
3,hall fame,bbe hall fame,N03
4,hall fame,national softball association sport hall fame,
...,...,...,...
30684,historical societ,richland county historical society inc,
30685,historical societ,lebanon historical society inc,
30686,historical societ,kennesaw historical society inc,A80
30687,historical societ,western connecticut chapter national railway h...,


In [92]:
is_2 = keyword_df["Include"] == 2
keyword_df_2 = keyword_df[is_2]

# get frequencies for keywords in list 2
matching_keyword = []
names_w_keyword = []
codes = []   
for keyword in keyword_df_2["Keyword"]:
    for index, row in df.iterrows():
        if keyword in row["name"]:
            matching_keyword.append(keyword)
            names_w_keyword.append(row["name"])
            codes.append(row["ntee_cd"])
matching_keywords_2 = pd.DataFrame(list(zip(matching_keyword, names_w_keyword, codes)), columns = ["keyword", "name", "ntee_cd"])
    
matching_keywords_2

Unnamed: 0,keyword,name,ntee_cd
0,infantry regiment,virginia volunteer infantry regiment,A82Z
1,infantry regiment,ohio volunteer infantry regiment band,A6C
2,infantry regiment,infantry regiment manchu association,
3,infantry regiment,airborne infantry regiment association,W30
4,infantry regiment,infantry regiment association,P99
...,...,...,...
373929,historic,richland county historical society inc,
373930,historic,lebanon historical society inc,
373931,historic,kennesaw historical society inc,A80
373932,historic,western connecticut chapter national railway h...,


In [100]:
matching_keywords_1.to_csv("z_matching_keywords_1.csv")
matching_keywords_2.to_csv("z_matching_keywords_2.csv")

In [93]:
# started running previous two chunks at 4:26pm (3hrs 20 mins)

import datetime

datetime_object = datetime.datetime.now()
print(datetime_object)

2021-12-07 19:46:12.434363


In [95]:
# get frequencies for names containing "seum" but not "museum"
matching_keyword = []
names_w_keyword = []
codes = []

for index, row in df.iterrows():
    if "seum" in row["name"] and not "museum" in row["name"]:
        matching_keyword.append("seum")
        names_w_keyword.append(row["name"])
        codes.append(row["ntee_cd"])
matching_keywords_seum = pd.DataFrame(list(zip(matching_keyword, names_w_keyword, codes)), columns = ["keyword", "name", "ntee_cd"])

matching_keywords_1 = matching_keywords_1.append(matching_keywords_seum)

TypeError: 'tuple' object is not callable

In [108]:
freq_df_1 = matching_keywords_1.groupby(["keyword", "ntee_cd"]).size().reset_index(name='count')
freq_df_2 = matching_keywords_2.groupby(["keyword", "ntee_cd"]).size().reset_index(name='count')
freq_df_1 = freq_df_1.assign(Include=1)
freq_df_2 = freq_df_2.assign(Include=2)
freq_df_1_2 = freq_df_1.append(freq_df_2)

freq_df_2

Unnamed: 0,keyword,ntee_cd,count,Include
0,ancient,A05,2,2
1,ancient,A20,3,2
2,ancient,A23,16,2
3,ancient,A24Z,1,2
4,ancient,A25,1,2
...,...,...,...,...
13482,zoological,D500,68,2
13483,zoological,D50Z,3,2
13484,zoological,D99,1,2
13485,zoological,O50,1,2


In [None]:
# combine list 1 and 2
freq_df_1_2 = freq_df_1.append(freq_df_2)

In [109]:
# save frequencies to csv
freq_df_1.to_csv("2_keyword_frequencies_1s_only.csv")
freq_df_2.to_csv("2_keyword_frequencies_2s_only.csv")
freq_df_1_2.to_csv("2_keyword_frequencies_1s_2s.csv")