In this notebook I filter the dataset down to only the abstracts that come from the NSF, to see whether this gives us a more specific corpus to run our topic model on.

In [2]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
#from git/dspg21RnD/wheat_filtration/wheat_filtration import keywords
#from git/dspg21RnD/wheat_filtration/wheat_filtration import filter
#import keywords

In [3]:
# Load the pickled abstracts DataFrame.
# NOTE(review): this cell was labelled "larger dataset", but the file is named
# "smaller-final-dataset" -- confirm which dataset is actually intended here.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.

df = pd.read_pickle("../../data/dspg21RnD/smaller-final-dataset.pkl")

In [4]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed,PROJECT_TERMS,PROJECT_TITLE,...,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,1,1,1402,"[multiprotein, y_secretase, proteolytically_cl...",Active Sites; Affect; Alzheimer's Disease; Am...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,...,BOSTON,MA,21156110,UNITED STATES,12/1/2007,1/1/2008,93.866,2008,3483.0,
1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,1,1,2553,"[kissl, gene, encode, peptide, kisspeptin, bin...",Affect; Animal Model; Axon; Behavior; Behavio...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,...,SEATTLE,WA,981959472,UNITED STATES,9/1/2008,1/1/2009,93.865,2008,39175.0,
2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,1,1,1414,"[biophysical, basis, thermodynamics_kinetic, m...",Agreement; Antibodies; base; Binding; Biochem...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,...,CAMBRIDGE,MA,21385319,UNITED STATES,1/2/2008,1/1/2009,93.859,2008,49646.0,
3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,1,1,1545,"[obesity, adverse_pregnancyoutcome, great, hea...",African; Analysis of Variance; Asians; Birth;...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,...,HOUSTON,TX,770305400,UNITED STATES,4/1/2008,1/1/2009,93.361,2008,20406.0,
4,371628,594482,Local potato advisory groups have expressed in...,2010,1,1,271,"[local, potato, advisory, express, interest, m...",cost; Health; interest; Manure; Parasitic nem...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,...,CORVALLIS,OR,97331,UNITED STATES,,,10.203,2010,,


NSF only:

In [5]:
# Keep only the rows whose AGENCY value is exactly "NSF".
# Rows with a missing AGENCY compare unequal and are therefore dropped as well.

nsf = df.loc[df["AGENCY"].eq("NSF")]

In [6]:
df.shape

(690814, 31)

In [7]:
nsf.shape

(121715, 31)

In [8]:
df["AGENCY"].unique()

array(['NIH', 'NIFA', 'DVBIC', 'NASA', 'ARS', 'VA', 'CDMRP', 'EPA', 'IES',
       'ALLCDC', 'ACF', 'AHRQ', 'NIDILRR', 'FS', 'CCCRP', 'NSF', 'FDA',
       'CNRM', nan], dtype=object)

In [9]:
nsf["AGENCY"].unique()

array(['NSF'], dtype=object)

In [10]:
#start with the core terms from the OECD paper
# Space-separated (lemmatized) forms of each term. This is the single source of truth:
# the underscore-joined variants below are derived from it so the two can never drift apart.
_core_phrases = [
    "adaboost", "artificial intelligence", "artificial neural network", "back propagation",
    "back propagation neural network", "computational intelligence", "computer vision",
    "convolutional neural network", "deep belief network", "deep convolutional neural network",
    "deep learn", "deep neural network", "elman network", "elman neural network",
    "expert system",
    "feed forward neural network",  # was misspelled "fee forward ...", which could never match
    "inference engine", "machine intelligence",
    "machine learn", "machine translation", "machine vision", "multilayer neural network",
    "natural language process", "perceptron", "random forest", "rbf neural network",
    "recurrent neural network", "self organize map", "spike neural network", "supervise learn",
    "support vector machine", "svm classifier", "unsupervised learn",
]

# Multi-word tokens in the processed abstracts are joined with underscores
# (e.g. "y_secretase"), so every multi-word phrase is also listed in that form.
# Single-word terms need no variant. "machine_learning" is an extra, non-lemmatized form.
core_terms = (
    _core_phrases
    + [phrase.replace(" ", "_") for phrase in _core_phrases if " " in phrase]
    + ["machine_learning"]
)

In [11]:
tokens = nsf["final_frqwds_removed"]

# text holds one space-joined string per abstract, built from that abstract's
# processed token list; the result is a fresh Series with a default 0..n-1 index.
text = pd.Series([" ".join(token_list) for token_list in tokens])

In [12]:
# TRY TOPIC MODELING WITH LDA

lda_vectorizer = CountVectorizer(max_df=0.6, min_df=20)
# max_df=0.6 drops terms that appear in more than 60% of the abstracts (too common);
# min_df=20 drops terms that appear in fewer than 20 abstracts (too rare).
# Keeping only this middle band of terms also limits the size of the document-term matrix.

lda_dtm = lda_vectorizer.fit_transform(text)
# text = one string per abstract (built above from the processed tokens).
# fit_transform learns the vocabulary from text and returns the document-term count matrix
# for those same abstracts; this is the standard scikit-learn fit/transform pattern.
 


In [13]:
#functions for creating a topic dictionary, viewing the topics in the topic model,
#and selecting only the relevant topics based on a threshold and our keyword list.


def topic_dictionary(lda_model, lda_vectorizer, top_n=10):
    """Build a dictionary of the highest-weighted terms for every topic.

    Parameters
    ----------
    lda_model : fitted topic model exposing ``components_`` (one row of term
        weights per topic).
    lda_vectorizer : fitted vectorizer whose vocabulary order matches the
        columns of ``lda_model.components_``.
    top_n : int, default 10
        Number of terms to keep per topic.

    Returns
    -------
    dict
        ``{topic_index: [(term, weight), ...]}`` with each list sorted by
        descending weight and holding at most ``top_n`` entries.
    """
    # Look the vocabulary up once. The original called get_feature_names() for every
    # selected term, rebuilding the full vocabulary list each time. That method was
    # also removed in scikit-learn 1.2, so prefer get_feature_names_out() when present.
    if hasattr(lda_vectorizer, "get_feature_names_out"):
        feature_names = lda_vectorizer.get_feature_names_out()
    else:
        feature_names = lda_vectorizer.get_feature_names()

    topic_ls = {}
    for idx, topic in enumerate(lda_model.components_):  # idx = topic index, topic = its weight row
        # argsort is ascending, so walk it backwards to take the top_n largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        topic_ls[idx] = [(feature_names[i], topic[i]) for i in top_indices]

    return topic_ls

def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    For each row of ``model.components_`` this prints a blank line, a
    ``Topic <idx>:`` header, and then one ``(term, weight)`` tuple per line
    in descending weight order.

    Parameters
    ----------
    model : fitted topic model exposing ``components_``.
    vectorizer : fitted vectorizer whose vocabulary order matches the columns
        of ``model.components_``.
    top_n : int, default 10
        Number of terms to print per topic.
    """
    # Look the vocabulary up once. The original called get_feature_names() for every
    # printed term, and that method was removed in scikit-learn 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):  # idx = topic index, topic = its weight row
        print("\nTopic %d:" % (idx))

        # argsort is ascending, so walk it backwards to take the top_n largest weights.
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))
            
            
def relevant_topics(topic_dictionary, keyword_list, threshold = 0):
    """Return the keys of the topics that contain at least ``threshold`` keywords.

    Note that ``threshold`` is an absolute COUNT of matching terms, not a
    percentage or fraction of the topic's terms. With the default of 0 every
    topic qualifies, so pass ``threshold=1`` (or more) to actually filter.

    Parameters
    ----------
    topic_dictionary : dict
        ``{topic_key: [(term, weight), ...]}``; only the first element of each
        entry (the term) is inspected.
    keyword_list : iterable of str
        Keywords to look for among each topic's terms.
    threshold : int, default 0
        Minimum number of a topic's terms that must appear in ``keyword_list``.

    Returns
    -------
    list
        Topic keys meeting the threshold, in the dictionary's iteration order.
    """
    keywords = set(keyword_list)  # constant-time membership tests
    relevant_topic = []
    for key, entries in topic_dictionary.items():
        # A term listed more than once in a topic is counted each time it appears.
        relevant_words = sum(1 for entry in entries if entry[0] in keywords)
        if relevant_words >= threshold:
            relevant_topic.append(key)
    return relevant_topic

def super_keywords(keyword_list, relevant_topic_list, topic_dictionary):
    """Extend a keyword list with every term from the relevant topics.

    Parameters
    ----------
    keyword_list : list of str
        Starting keywords. This list is NOT modified.
    relevant_topic_list : collection
        Keys of ``topic_dictionary`` whose terms should be added.
    topic_dictionary : dict
        ``{topic_key: [(term, weight), ...]}``; only the term (first element
        of each entry) is used.

    Returns
    -------
    list of str
        A new list: the original keywords followed by the terms of each
        relevant topic, in dictionary order. Duplicates are not removed.
    """
    # Work on a copy. The original aliased keyword_list, so every call (or every
    # re-run of the calling cell) permanently grew the caller's keyword list.
    superkeyword = list(keyword_list)
    for key, entries in topic_dictionary.items():
        if key in relevant_topic_list:
            superkeyword.extend(entry[0] for entry in entries)
    return superkeyword

Now, trying a 75-topic model on the NSF corpus.

In [31]:
# create model
# Fit a 75-topic LDA model on the NSF document-term matrix (lda_dtm).
# NOTE(review): this cell's execution count (In [31]) is higher than that of the
# 100-topic cell further down (In [14]), so the notebook was run out of order;
# num_topics is shared by both cells and is overwritten by whichever ran last.

num_topics = 75
# doc_topic_prior is set to 1/num_topics and topic_word_prior to 0.1;
# random_state=0 fixes the seed so the fit is repeatable.
# NOTE(review): n_jobs=29 is specific to the machine this was run on -- confirm or use n_jobs=-1.
lda_model_75 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=29, random_state = 0)

# fit_transform fits the model and returns the per-document topic distribution;
# components_ holds the per-topic term weights (one row per topic).
doc_top_dist_75 = lda_model_75.fit_transform(lda_dtm)
top_term_dist_75 = lda_model_75.components_


In [32]:
# Keep the 50 highest-weighted (term, weight) pairs for each topic of the 75-topic model.
nsf_dic75 = topic_dictionary(lda_model_75, lda_vectorizer, 50)

In [34]:
#print_topics(lda_model_75, lda_vectorizer, 10)

In [36]:
nsf_dic75[59][0:10]

[('performance', 7800.409371582714),
 ('computing', 7343.015099344587),
 ('software', 5958.477080531771),
 ('architecture', 4528.284629600627),
 ('memory', 4122.096325295902),
 ('hardware', 4113.827991459793),
 ('computer', 4029.246460657543),
 ('parallel', 3489.1940486052117),
 ('programming', 2824.0297297148327),
 ('level', 2686.914908740683)]

In [38]:
nsf_dic75[10][0:10]

[('user', 9067.0361735743),
 ('software', 6199.237384664498),
 ('robot', 5763.631364107451),
 ('task', 4509.956711010427),
 ('human', 4080.6550480123565),
 ('search', 2557.349311948212),
 ('web', 2077.8788977733784),
 ('code', 2009.8207692561439),
 ('robotics', 1861.684610586474),
 ('enable', 1649.5467258057824)]

In [39]:
nsf_dic75[2][0:10]

[('computer', 10797.920295914659),
 ('computational', 9449.366877402887),
 ('science', 9304.498089473023),
 ('software', 6552.863237372862),
 ('computing', 6460.395668127229),
 ('open', 3744.741382892853),
 ('create', 3533.82968544871),
 ('scientific', 3454.934270348386),
 ('enable', 3286.0033656274363),
 ('technology', 2993.4955403200884)]

In [40]:
nsf_dic75[65][0:10]

[('algorithm', 11967.831495315058),
 ('statistical', 7918.781122632267),
 ('optimization', 6197.971995828134),
 ('framework', 5506.894508569184),
 ('computational', 4249.287311182363),
 ('modeling', 4226.22289767596),
 ('uncertainty', 4046.8904162257286),
 ('methodology', 3891.579111367414),
 ('parameter', 3307.839982719072),
 ('estimation', 3127.9878302988445)]

Now, trying a 100-topic model on the NSF corpus.

In [14]:
# create model
# Fit a 100-topic LDA model on the NSF document-term matrix (lda_dtm).
# NOTE(review): num_topics is also assigned by the 75-topic cell; that cell has a later
# execution count (In [31]) than this one (In [14]), so after the recorded run
# num_topics held 75, not 100.

num_topics = 100
# doc_topic_prior is set to 1/num_topics and topic_word_prior to 0.1;
# random_state=0 fixes the seed so the fit is repeatable.
# NOTE(review): n_jobs=39 is specific to the machine this was run on (the 75-topic cell
# uses 29) -- confirm or use n_jobs=-1.
lda_model_100 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)

# fit_transform fits the model and returns the per-document topic distribution;
# components_ holds the per-topic term weights (one row per topic).
doc_top_dist_100 = lda_model_100.fit_transform(lda_dtm)
top_term_dist_100 = lda_model_100.components_


In [15]:
# Keep the 50 highest-weighted (term, weight) pairs for each topic of the 100-topic model.
nsf_dic100 = topic_dictionary(lda_model_100, lda_vectorizer, 50)

In [16]:
# Print the 10 highest-weighted terms of each topic in the 100-topic model for manual inspection.
print_topics(lda_model_100, lda_vectorizer, 10)


Topic 0:
('interface', 13690.403359320218)
('wall', 1668.7415093380755)
('interfacial', 924.7037941083363)
('domain', 825.7595682850509)
('interaction', 358.6314725524922)
('alignment', 271.466402548093)
('oscillator', 263.60866803527387)
('cmp', 223.6346958009765)
('pad', 163.77532046519406)
('ccny', 150.6748892514722)

Topic 1:
('learning', 16255.850634281973)
('science', 15018.82336051211)
('learn', 8743.711286834357)
('education', 8076.008103155073)
('stem', 7284.484807979463)
('practice', 6257.100047873937)
('assessment', 4707.09766638463)
('classroom', 4504.595622460356)
('teacher', 4083.560951891076)
('engage', 3885.820663128089)

Topic 2:
('computer', 12695.964982701378)
('computational', 9428.945545862658)
('computing', 5403.734431307827)
('science', 5289.983153736736)
('create', 2488.2345690240263)
('graph', 2301.4807302692784)
('cybersecurity', 2176.353983490687)
('technology', 2000.9747644170577)
('online', 1783.092665998352)
('open', 1708.0313735504385)

Topic 3:
('networ