# Eads et al Method, using NSF subsetted corpus to cfda = 47.070

In [7]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import filter
#from git/dspg21RnD/wheat_filtration/wheat_filtration import keywords
#from git/dspg21RnD/wheat_filtration/wheat_filtration import filter
#import keywords

In [8]:
def total_topic_proportion(document_topics, relevant_topics):
    """Return sum of relevant topic proportions for a document.
    Arguments:
        document_topics (iterable of float): topic proportions for one document.
        relevant topics (iterable of int): a list of the numbers corresponding
            with the topics considered relevant by the user."""
    assert (len(relevant_topics) <= len(document_topics)
            )  # TODO make this the right kind of error
    return sum([document_topics[i] for i in relevant_topics])


def keyword_proportion(document, keyword_list):
    """Return percentage of words in the given doc that are present in keyword_list."""
    doc_tokens = document.split()
    num_keywords = sum(
        [1 if word in keyword_list else 0 for word in doc_tokens])
    return float(num_keywords)/len(doc_tokens)


def superkeyword_presence(document, superkeywords):
    """Return 1 if document contains any superkeywords, 0 if not."""
    for word in superkeywords:
        if word in document.split():
            return True
    return False


class FilterHelper():
    """Creates a filter object containing filter criteria such as keyword list,
    superkeyword list, total topic proportion threshold, and keyword proportion
    threshold.

    Arguments:
        topic_model (TopicModel): a TopicModel object instantiated with a corpus or
            files from a Mallet topic model.
        relevant_topics (iterable of int): a list of the numbers corresponding
            with the topics considered relevant by the user. Note that the number
            corresponding with the first topic is '0', the second topic is '1', etc.
        n_keywords: number of keywords to include in keyword list. Default is 20.
        superkeywords (iterable of str): a list of keywords which signify immediate relevance
            of the document that contains them (better wording). Default is an empty list.
        keyword_list: A list of keywords ordered by [the relevance they signify]. Default is
            a keyword list generated using the relative entropy method.
        total_topic_prop_threshold (float): the threshold of relevance for the total proportion
            of relevant topics in a document. If a document surpases the threshold, it is considered relevant.
        keyword_prop_threshold (float): the threshold of relevance for the proportion of words
            on the keyword list that appear in a document. If a document surpases the threshold,
            it is considered relevant.

    Attributes:
        topic_model (TopicModel): a TopicModel object instantiated with a corpus or
            files from a Mallet topic model.
        relevant_topics (iterable of int): a list of the numbers corresponding
            with the topics considered relevant by the user.
        superkeywords (iterable of str): a list of keywords which signify immediate relevance
            of the document that contains them (better wording). Default is an empty list.
        keyword_list: A list of keywords ordered by [the relevance they signify]. Default is
            a keyword list generated using the relative entropy method.
        total_topic_prop_threshold (float): the threshold of relevance for the total proportion
            of relevant topics in a document. If a document surpases the threshold, 
            it is considered relevant. Default is 0.25.
        keyword_prop_threshold (float): the threshold of relevance for the proportion of words
            on the keyword list that appear in a document. If a document surpases the threshold,
            it is considered relevant. Default is 0.15.

    Raises:
        RuntimeError: if user enters both keyword list and n_keywords when using the
        keyword_list setter method.
        """

    def __init__(self, topic_model, vectorizer, relevant_topics, keyword_list=None, n_keywords=100, superkeywords=[],
                 total_topic_prop_threshold=0.25, keyword_prop_threshold=0.15):
        self._relevant_topics = relevant_topics
        if keyword_list is None:
            keyword_list = keywords.rel_ent_key_list(
                topic_model, n_keywords, relevant_topics)
        self._keyword_list = keyword_list

        lower_superkeys = [word.lower() for word in superkeywords]
        # TODO: deal with this appropriately when making lowercasing optional
        extended_superkeys = [
            word for word in vectorizer.get_feature_names() if
            word in lower_superkeys or
            any([(chunk in lower_superkeys) for chunk in word.split('_')])
        ]
        self._superkeywords = extended_superkeys

        self._total_topic_prop_threshold = total_topic_prop_threshold
        self._keyword_prop_threshold = keyword_prop_threshold
        self._topic_model = topic_model
        self._vectorizer = vectorizer

    @property
    def topic_model(self):
        """Get topic_model used to create filter"""
        return self._topic_model

    @property
    def relevant_topics(self):
        """Get list of relevant topics"""
        return self._relevant_topics

    @property
    def keyword_list(self):
        """Get or set keyword list. Input either a list of keywords, or input an integer n
        to generate a keyword list containing n words."""
        return self._keyword_list

    @keyword_list.setter
    def keyword_list(self, keyword_list=None, n_keywords=None):
        if keyword_list is not None:
            self._keyword_list = keyword_list
        elif n_keywords is not None:
            self._keyword_list = keywords.rel_ent_key_list(
                self.topic_model, n_keywords, self.relevant_topics)
        else:
            raise RuntimeError(
                "Enter either a keyword list or an integer for number of keywords")

    @property
    def superkeywords(self):
        return self._superkeywords

    @superkeywords.setter
    def superkeywords(self, superkeywords):
        self._superkeywords = superkeywords

    @property
    def total_topic_prop_threshold(self):
        return self._total_topic_prop_threshold

    @total_topic_prop_threshold.setter
    def total_topic_prop_threshold(self, total_topic_prop_threshold):
        self._total_topic_prop_threshold = total_topic_prop_threshold

    @property
    def keyword_prop_threshold(self):
        return self._keyword_prop_threshold

    @keyword_prop_threshold.setter
    def keyword_prop_threshold(self, keyword_prop_threshold):
        self._keyword_prop_threshold = keyword_prop_threshold


def is_relevant(doc, doc_topics, filter_helper):
    """Returns a boolean for relevance of given document. A document is considered
    relevant if: it contains any superkeywords(filter_helper.superkeywords), passes
    the total topic proportion threshold(filter_helper.total_topic_prop_threshold),
    or passes the keyword proportion threshold(filter_helper.keyword_prop_threshold).
    Arguments:
        doc (string): preprocessed document from the corpus
        doc_topics (iterable of float): proportion of each topic present in the given document
        filter_helper (FilterHelper): an object containing the necessary information
            to label the relevance of the given document
    Returns:
        (bool): Representing whether or not the given document is relevant according
        to the information in filter_helper"""

    has_superkeyword = superkeyword_presence(
        doc, filter_helper.superkeywords)
    passes_total_topic_thresh = total_topic_proportion(
        doc_topics, filter_helper.relevant_topics) > filter_helper.total_topic_prop_threshold
    passes_keyword_thresh = keyword_proportion(
        doc, filter_helper.keyword_list) > filter_helper.keyword_prop_threshold

    return has_superkeyword or passes_total_topic_thresh or passes_keyword_thresh


def filter_corpus(topic_model, filter_helper):
    """Filters corpus used to make topic_model according to criteria entered in filter_helper.
    Arguments:
        topic_model (TopicModel): a TopicModel object instantiated with a corpus or
        files from a Mallet topic model.
        filter_helper (FilterHelper): a FilterHelper object instantiated with filter
        properties.
    Returns:
        subcorpus (dict): a dictionary containing the subset of the corpus that passed
        the relevance filter. keys are the unique document ids and values are the (unprocessed)
        document text"""
    subcorpus = {}
    for i, doc_id in enumerate(topic_model.docs):
        doc = topic_model.docs[doc_id]
        doc_topics = topic_model.doc_topic_proportions[i, :]
        if is_relevant(doc, doc_topics, filter_helper):
            # add full document to subcorpus as <doc_id>: <doc_body>
            subcorpus[doc_id] = topic_model.full_docs[doc_id]
    return subcorpus

#####################################################
######### under this line are things it would be nice to add later #############
# TODO (faunam|6/20/19): implement


def proportion_lists():
    """makes a matrix or list of ttp, superkeyword, and keyword proportion for the docs in corpus
    and sets the respective topic model attributes"""
    pass


def subset_quality(threshs, labeled_subset):  # also had args word_list_gen and scorefun
    """Calculate F1 score for the array of thresholds threshs
    (max topic prop, total topic prop, vocab prop, and number of words
    in vocabulary list) on labeled subset"""
    pass


def subset_info(threshs):  # seems like a cool feature to include
    """Return set of false positives, true positives, false negatives, and true negatives, as
    well as the sizes of the false neg and false pos sets, as well as the size of set
    predicted as relevant, about the subset created by the given set of thresholds
    (mtp, ttp, voc prop, and voc list length, in that order).
    This function can be edited to output any kind of info about the subset, eg the filenames."""
    pass

In [9]:
#functions for creating a topic dictionary, viewing the topics in the topic model,
#and selecting only the relevant topics based on a threshold and our keyword list.


def topic_dictionary(lda_model, lda_vectorizer, top_n = 10):
    topic_ls = {} #append keys, append the values
    
    
    for idx, topic in enumerate(lda_model.components_):  # loop through each row of H.  idx = row index.  topic = actual row

        print_list = [(lda_vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        topic_ls[idx] = print_list

    return topic_ls

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
            
            
def relevant_topics(topic_dictionary, keyword_list, threshold = 0.15):
    """returns a list of the topics which contain a threshold % of the
    relevant words in the keyword list"""
    relevant_topic = []
    for key in topic_dictionary:
        relevant_words = 0
        for i in range(len(topic_dictionary[key])):
            if topic_dictionary[key][i][0] in keyword_list:
                relevant_words += 1
        if (relevant_words) / len(topic_dictionary[key]) >= threshold :
            relevant_topic.append(key)
    return relevant_topic  

def rel_ent_key_list(topic_model, vectorizer, n_top_keywords, relevant_topics):
    """Returns a list of the top n keywords based on relative entropy score
     Arguments:
       topic_model (TopicModel): a topic by vocabulary word matrix where each entry
       is the total word count for that word in that topic
       n_top_words (int): the number of keywords the method will return
       relevant_topics (iterable of int)
     Returns:
       keyword_list (iterable of str): list of the top n keywords, sorted
     """
    topic_word_matrix = topic_model.components_
    lda_vectorizer = vectorizer
    
    # Log of probabilities of vocab words
    #this works
    vocab_logs = np.log(topic_word_matrix.sum(
        axis=0) / topic_word_matrix.sum())

    # Log of probabilities of vocab words given they were in each relevant topic
    #this is being built to calculate p(w)*log[p(w)/q(w)]
    #this works
    topic_logs = np.log(topic_word_matrix[relevant_topics, :].sum(
        axis=0) / topic_word_matrix[relevant_topics, :].sum())

    # relative entropy proportions, unsorted
    #log rules: log[p(w)/q(w)] = log(p(w)) - log(q(w))
    unsorted_props = np.asarray(topic_word_matrix.sum(axis=0) /
                                topic_word_matrix.sum()) * np.asarray(topic_logs - vocab_logs)

    unsorted_props = np.matrix.flatten(unsorted_props)

    sorted_props_and_voc = sorted([(unsorted_props[i], lda_vectorizer.get_feature_names()[i]) for i in list(
        np.argpartition(unsorted_props, len(lda_vectorizer.get_feature_names()) - n_top_keywords))[-n_top_keywords:]], reverse=True)
    ordered_vocab = []
    for (_, voc) in sorted_props_and_voc:
        ordered_vocab.append(voc)
    return ordered_vocab

In [10]:
#start with the core terms from the OECD paper
core_terms = ["adaboost","artificial intelligence","artificial neural network","back propagation"
,"back propagation neural network","computational intelligence","computer vision"
,"convolutional neural network","deep belief network","deep convolutional neural network"
,"deep learn","deep neural network","elman network","elman neural network"
,"expert system","fee forward neural network","inference engine","machine intelligence"
,"machine learn","machine translation","machine vision","multilayer neural network"
,"natural language process","perceptron","random forest","rbf neural network","recurrent neural network"
,"self organize map","spike neural network","supervise learn","support vector machine"
,"svm classifier","unsupervised learn","artificial_intelligence","artificial_neural_network","back_propagation"
,"back_propagation_neural_network","computational_intelligence","computer_vision"
,"convolutional_neural_network","deep_belief_network","deep_convolutional_neural_network"
,"deep_learn","deep_neural_network","elman_network","elman_neural_network"
,"expert_system","fee_forward_neural_network","inference_engine","machine_intelligence"
,"machine_learn","machine_translation","machine_vision","multilayer_neural_network"
,"natural_language_process","random_forest","rbf_neural_network","recurrent_neural_network"
,"self_organize_map","spike_neural_network","supervise_learn","support_vector_machine"
,"svm_classifier","unsupervised_learn", "machine_learning"]

In [11]:
df = pd.read_pickle("../../data/dspg21RnD/smaller-final-dataset.pkl")
nsf = df[df["AGENCY"] == "NSF"]
# filter where cfda = 47.070

nsf_csci = nsf[nsf["CFDA_CODE"] == "47.070"]

In [12]:
nsf_csci.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed,PROJECT_TERMS,PROJECT_TITLE,...,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
1996,11849,101844,This symposium is a premiere forum for researc...,2008,1,1,697,"[symposium, premiere, forum, interaction, comp...",Arts; Award; Collaborations; Communities; Com...,SUPPORT FOR THE THIRTEENTH INTERNATIONAL CONFE...,...,CHAMPAIGN,IL,61820-7406,UNITED STATES,,,47.07,2008,4651.0,
2067,11747,101739,This award is to the Computer Research Associa...,2008,1,1,2638,"[award, computer, association, cra, coordinate...",Address; Architecture; Award; base; Collabora...,THE 2007 FIND PIS MEETING,...,WASHINGTON,DC,20036-0000,UNITED STATES,,,47.07,2008,29940.0,
2187,12250,102249,IIS-0808994PI: Jonathan FurnerUniversity of Ca...,2008,1,1,1682,"[iis, pi, furneruniversity, california_los, an...",Arts; Award; California; Development; Dimensi...,WORKSHOP: I-CONFERENCE DOCTORAL RESEARCH COLLO...,...,LOS ANGELES,CA,90095-1406,UNITED STATES,,,47.07,2008,25859.0,
2234,12405,102404,ABSTRACT0812795Vijay K. VaishnaviGa State U Re...,2008,1,1,574,"[vaishnaviga, res, fdnthis, seek, funding, enc...",computer science; design; Discipline; Funding...,STUDENT PARTICIPATION IN 3RD INTERNATIONAL CON...,...,ATLANTA,GA,30303-3999,UNITED STATES,,,47.07,2008,9000.0,
2235,12451,102448,The 3rd International Conference on emerging N...,2008,1,1,3025,"[3rd, international, conference, emerge, netwo...",Award; base; career; Commit; Communication; C...,STUDENT TRAVEL SUPPORT FOR CONEXT 2007 CONFERENCE,...,MADISON,WI,53715-1218,UNITED STATES,,,47.07,2008,10875.0,


In [13]:
tokens = nsf_csci["final_frqwds_removed"]

text = [] # text will contain the processed tokens in string form (1 string per abstract)


for abstract in tokens:
    text.append(" ".join(abstract))
    
text = pd.Series(text)

In [14]:
# TRY TOPIC MODELING WITH LDA

lda_vectorizer = CountVectorizer(max_df=0.6, min_df=20)
#this is our way to filter out words that don't appear enough, and those that appear way too often (we want the middle set of terms)
#^this filters the size of our matrix

lda_dtm = lda_vectorizer.fit_transform(text)
#text = our abstract text, right in the dataframe
#fits our doc-term matrix to our specific text 
#this is the standard for scikit-learn
 


In [15]:
num_topics = 100
lda_model_100 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)

doc_top_dist_100 = lda_model_100.fit_transform(lda_dtm)
top_term_dist_100 = lda_model_100.components_

In [16]:
nsfcs_dic100 = topic_dictionary(lda_model_100, lda_vectorizer, 50)

In [17]:
relevant_topics(nsfcs_dic100, core_terms, 0.04)

[97]

In [18]:
2/50

0.04

So, we get 5 topics when we do 1 word out of 50 ahhaha.  We only get topic 97 when we use 2 words out of 50.  I will look through these topics and add to the topics i picked out myself and decide the relevant topics, then pick out the relative entropy keyword list before making a superkeyword list.

In [19]:
nsfcs_dic100[15][0:20]

[('health', 2116.0966517857764),
 ('patient', 1244.7450881968607),
 ('care', 819.0339929744473),
 ('medical', 676.7615454630422),
 ('healthcare', 623.2943716504553),
 ('clinical', 556.9735215587403),
 ('technology', 223.57570780885894),
 ('individual', 210.91677664812553),
 ('cost', 207.34347137206652),
 ('population', 190.37128252654986),
 ('hospital', 184.70507724109828),
 ('medicine', 181.99000710864803),
 ('record', 177.60667439014955),
 ('personalize', 171.30428620903686),
 ('family', 170.9999364264955),
 ('challenge', 165.7542634300328),
 ('need', 160.33314127369772),
 ('behavioral', 150.11844464162309),
 ('electronic', 131.96048414061747),
 ('monitoring', 131.77603124067357)]

In [20]:
nsfcs_dic100[23][0:20]

[('search', 1368.653014681624),
 ('feature', 588.7102878122361),
 ('extraction', 153.1498066330481),
 ('heuristic', 123.9245757396409),
 ('space', 75.89387486559578),
 ('chinese', 72.84054190082773),
 ('selection', 70.46146991527411),
 ('proxy', 51.84053060678479),
 ('classification', 50.7655025843738),
 ('patent', 47.71687693737752),
 ('china', 46.84252098056582),
 ('relevant', 45.849468757338485),
 ('narrative', 44.00343995713097),
 ('combinatorial', 42.86278148086518),
 ('case', 40.83760602745175),
 ('exploration', 39.40466537566612),
 ('select', 39.33212718844669),
 ('probe', 36.31510692372034),
 ('aaai', 34.8912867766287),
 ('ai', 34.75250071433617)]

"AI" is the 20th term.  and there are only 34 times it comes up in this topic?  Not gonna include

In [21]:
nsfcs_dic100[28][0:20]

[('decision', 1211.7633387852838),
 ('domain', 324.96557996466885),
 ('label', 260.52331656824236),
 ('example', 221.35194514993535),
 ('machine_learning', 217.52270449024778),
 ('learn', 201.98552761463682),
 ('uncertainty', 198.61946907103538),
 ('classifier', 189.4280778482446),
 ('ml', 174.54927586033412),
 ('reasoning', 161.5635472118136),
 ('algorithm', 141.7574753819408),
 ('real', 128.6539416936688),
 ('task', 123.45169384502242),
 ('explanation', 104.5974839087037),
 ('automated', 103.00937330998413),
 ('framework', 99.95478220465216),
 ('robust', 99.6318830974613),
 ('human', 99.45204627261225),
 ('source', 94.48550058915484),
 ('uncertain', 93.46741756910629)]

I will keep 27.

In [22]:
relevant_topics_HT = [27]

In [23]:
nsfcs_dic100[87][0:20]

[('algorithm', 1819.06996660863),
 ('machine_learning', 1059.3157087702514),
 ('learn', 866.006523246125),
 ('inference', 809.5514827848334),
 ('statistical', 733.4181344556185),
 ('learning', 732.3256051256889),
 ('scale', 566.1333020691231),
 ('dataset', 564.3152416367084),
 ('dimensional', 562.5206095408722),
 ('set', 557.742088611085),
 ('framework', 482.4322917225055),
 ('big', 417.2567183957516),
 ('clustering', 323.7412237170666),
 ('sparse', 316.4792781225677),
 ('efficient', 313.0336721213797),
 ('massive', 311.97348253810674),
 ('challenge', 311.1389008875552),
 ('estimation', 280.519515354007),
 ('representation', 278.6236354818512),
 ('machine', 263.5076627080198)]

In [24]:
relevant_topics_HT.append(87)

In [25]:
nsfcs_dic100[97][0:20]

[('learning', 1753.425587928131),
 ('deep', 1206.3295947990032),
 ('learn', 977.393948584993),
 ('machine_learning', 449.65095379314283),
 ('neural', 411.89155081104036),
 ('ai', 273.8011831510943),
 ('machine', 266.3093513909305),
 ('intelligence', 210.80992532602272),
 ('intelligent', 199.38004638266955),
 ('network', 196.9212463435522),
 ('algorithm', 192.03032816490338),
 ('artificial_intelligence', 167.46230118906087),
 ('reinforcement', 167.4450670234159),
 ('architecture', 166.82888205136672),
 ('advance', 118.04581497835483),
 ('vision', 107.12164855307428),
 ('world', 106.54101575482942),
 ('domain', 102.06689086441504),
 ('train', 99.33526221413456),
 ('real', 95.65366884119737)]

In [26]:
relevant_topics_HT.append(97)

When I ran it on my own, I picked out 19, 52, 54, 76, 79, 86, 97

In [27]:
relevant_topics_HT

[27, 87, 97]

In [28]:
nsfcs_dic100[19][0:20]

[('graph', 2462.2972841025025),
 ('mining', 660.3066246687682),
 ('algorithm', 559.8030180085358),
 ('pattern', 289.113236307129),
 ('edge', 227.46269296479247),
 ('real', 197.20648487656408),
 ('domain', 137.2843781269611),
 ('anomaly', 134.39008641464014),
 ('social', 134.07527965190465),
 ('analytics', 130.93391842624442),
 ('network', 129.21605419748968),
 ('irregular', 127.48866737124281),
 ('node', 119.53130373091832),
 ('world', 108.44226220914099),
 ('processing', 105.90993203052052),
 ('represent', 105.57140975731879),
 ('efficient', 105.02798903145188),
 ('scale', 103.15140867694929),
 ('dynamic', 102.46176242573956),
 ('scalable', 91.12291265802436)]

In [29]:
relevant_topics_HT.append(19)

In [30]:
nsfcs_dic100[52][0:20]

[('task', 1667.5998674114032),
 ('robot', 1205.2733479871963),
 ('human', 667.5030619334059),
 ('planning', 651.9088761238436),
 ('motion', 547.4441514047675),
 ('action', 545.034947422538),
 ('autonomous', 524.3019424622092),
 ('manipulation', 508.5319744841163),
 ('environment', 507.2456930949452),
 ('robotic', 475.71316628732137),
 ('algorithm', 413.94966228240946),
 ('enable', 288.71012963912256),
 ('robotics', 283.509516724759),
 ('agent', 279.35572225212815),
 ('learn', 228.39561697061572),
 ('level', 190.82268774181435),
 ('uncertainty', 189.2610487214873),
 ('capability', 171.53116752407882),
 ('framework', 159.42105370299586),
 ('domain', 156.37989153139853)]

In [31]:
relevant_topics_HT.append(52)

In [32]:
nsfcs_dic100[54][0:20]

[('robot', 400.84901342166),
 ('motor', 286.6005205219227),
 ('hand', 285.2745892318118),
 ('human', 223.16141139120617),
 ('robotic', 218.12729661842502),
 ('movement', 212.42065577045534),
 ('force', 210.76716086016995),
 ('arm', 173.53051174420514),
 ('environment', 145.77342117895245),
 ('robotics', 118.73722282530731),
 ('assistive', 116.94376450869532),
 ('pis', 115.4334127330146),
 ('object', 114.63801914889936),
 ('walk', 110.31475543395928),
 ('gait', 107.15655344429868),
 ('contact', 106.36771297027417),
 ('motion', 105.5398986508488),
 ('task', 102.95519664082595),
 ('exoskeleton', 102.08365972980206),
 ('prosthesis', 101.07030785125205)]

In [33]:
#I don't know about this, since it is just robot and not the other AI terms.

In [34]:
nsfcs_dic100[76][0:20]

[('algorithm', 1240.1942831168114),
 ('computational', 747.3377107663017),
 ('solution', 602.5737568888037),
 ('numerical', 435.97774869387007),
 ('optimization', 415.7614482832579),
 ('solving', 403.99019929766405),
 ('linear', 384.42221155603283),
 ('computer', 333.66799399344507),
 ('solve', 333.16176886312076),
 ('science', 333.07163175650385),
 ('mathematical', 280.08281954069486),
 ('engineering', 266.6018952432416),
 ('solver', 251.9626318867719),
 ('matrix', 243.3638556521591),
 ('equation', 234.6849094327548),
 ('mathematics', 231.58240590608483),
 ('uas', 211.099695064392),
 ('apply', 198.6032935523939),
 ('pis', 173.8920765561421),
 ('real', 173.7369329425362)]

In [35]:
nsfcs_dic100[79][0:20]

[('language', 2144.887478365342),
 ('speech', 1131.1117051254016),
 ('natural', 599.6745213872944),
 ('processing', 479.1224783524312),
 ('linguistic', 447.9412757359252),
 ('text', 444.0013130334293),
 ('recognition', 415.10750001393774),
 ('annotation', 323.56805380072103),
 ('translation', 299.85525376117687),
 ('human', 271.64202926108186),
 ('automatic', 261.98584160629355),
 ('word', 260.42863801789616),
 ('speaker', 236.27280804289842),
 ('audio', 227.33951009200655),
 ('technology', 208.12115178435536),
 ('nlp', 203.06825159780698),
 ('machine', 200.47687588575002),
 ('corpus', 186.28160800955476),
 ('computational', 171.36350838667278),
 ('english', 169.86888666804097)]

In [36]:
relevant_topics_HT.append(79)

In [37]:
nsfcs_dic100[86][0:20]

[('human', 3485.8322547179414),
 ('robot', 1777.589081373325),
 ('interaction', 699.6492100943879),
 ('robotics', 657.2793637927633),
 ('environment', 393.39388202687496),
 ('team', 382.00487124254914),
 ('people', 218.92423623762568),
 ('machine', 207.63928741334777),
 ('task', 203.30837221711272),
 ('cognitive', 189.08865713300628),
 ('operator', 176.57447472975485),
 ('interact', 165.88041193385916),
 ('intelligent', 151.45301088124222),
 ('robotic', 149.51433987674383),
 ('collaborative', 140.59228383853102),
 ('hri', 126.5535598340643),
 ('worker', 114.59998404856047),
 ('physical', 113.6439584131836),
 ('real', 108.76366521289732),
 ('integrate', 106.22562654370202)]

In [38]:
relevant_topics_HT.append(86)

In [39]:
relevant_topics_HT

[27, 87, 97, 19, 52, 79, 86]

Ok, so with my judgement plus some that the relevant_topics function picked out, we have 7 topics that should be roughly about AI.

Creating the relative entropy keyword list:

In [58]:
rel_ent = rel_ent_key_list(lda_model_100, lda_vectorizer, 1000, relevant_topics_HT)

In [59]:
#relative entropy keywork list
rel_ent[950:1000]

['stimulation',
 'imply',
 'difficult_impossible',
 'schema',
 'jointly',
 'constituent',
 'overlap',
 'conform',
 'planetary',
 'sorting',
 'agriculture',
 'daunting_task',
 'stop',
 'aged',
 'upcoming',
 'reflective',
 'university_texas_dallas',
 'retain',
 'restriction',
 'heavily',
 'sufficiently',
 'anchor',
 'available',
 'opinion',
 'leap',
 'heighten',
 'fraud',
 'rapidly',
 'essentially',
 'credible',
 'impression',
 'personality',
 'advantage',
 'publicly',
 'face',
 'subject',
 'queueing',
 'complicated',
 'precursor',
 'milestone',
 'difference',
 'instantiate',
 'escape',
 'retail',
 'inter_related',
 'season',
 'glean',
 'giant',
 'remote',
 'fortunately']

Creating the superkeyword list:

"To create the super keyword list, we examine an expanded list -- the top 1000 words -- of high-relative-entropy-constribution words from the last step and select those words that are unambiguously related to the concept of interest, i.e. likely to be used when referring to the concept of interest and no other concepts.

In [42]:
#So, expanding this to 1000 and I select those that should be unambigiously about AI
rel_ent_superkeyword = rel_ent_key_list(lda_model_100, lda_vectorizer, 1000, relevant_topics_HT)

In [43]:
#the way I will do this is go through it in sets of 25 and add to "superkeyword" all but those that I don't think are relevant

In [44]:
superkeyword = []

In [45]:
for index in range(len(rel_ent_superkeyword[0:25])):
    print(index, rel_ent_superkeyword[index])

0 human
1 algorithm
2 learning
3 task
4 learn
5 robot
6 language
7 graph
8 machine_learning
9 processing
10 environment
11 machine
12 domain
13 interaction
14 set
15 statistical
16 natural
17 framework
18 deep
19 real
20 speech
21 robotics
22 scale
23 dataset
24 mining


In [46]:
for index in range(25):
    print(index + 25, rel_ent_superkeyword[index + 25])

25 inference
26 pattern
27 world
28 planning
29 representation
30 big
31 neural
32 action
33 dimensional
34 recognition
35 motion
36 robotic
37 team
38 autonomous
39 text
40 advance
41 intelligent
42 manipulation
43 eg
44 feature
45 agent
46 efficient
47 apply
48 massive
49 train


In [47]:
for index in range(25):
    print(index + 50, rel_ent_superkeyword[index + 50])

50 scalable
51 art
52 robust
53 represent
54 extract
55 linguistic
56 class
57 combine
58 sparse
59 recent
60 estimation
61 prediction
62 accuracy
63 automatic
64 classification
65 analyze
66 efficiently
67 challenge
68 sound
69 incorporate
70 operator
71 annotation
72 input
73 vision
74 intelligence


In [48]:
for index in range(25):
    print(index + 75, rel_ent_superkeyword[index + 75])

75 cognitive
76 clustering
77 example
78 generate
79 edge
80 exploit
81 capability
82 estimate
83 word
84 translation
85 probabilistic
86 general
87 size
88 people
89 order
90 automatically
91 accurate
92 relationship
93 tree
94 statistic
95 parameter
96 ai
97 analytics
98 handle
99 neuron


In [49]:
for index in range(25):
    print(index + 100, rel_ent_superkeyword[index + 100])

100 uncertainty
101 dimension
102 variable
103 recognize
104 able
105 interact
106 discover
107 variety
108 situation
109 modern
110 manufacturing
111 perception
112 observe
113 animal
114 audio
115 capable
116 sampling
117 operate
118 assist
119 artificial_intelligence
120 speaker
121 highly
122 demonstration
123 typically
124 constraint


In [50]:
for index in range(25):
    print(index + 125, rel_ent_superkeyword[index + 125])

125 thrust
126 success
127 challenging
128 flexible
129 adapt
130 brain
131 insight
132 series
133 rank
134 graphical
135 significantly
136 single
137 contribution
138 space
139 noisy
140 unified
141 setting
142 exploratory
143 auditory
144 effectively
145 inspire
146 exhibit
147 anomaly
148 voice
149 distribution


creating the filter helper to see if we can start trying to filter the corpus to get some sort of sense the abstracts that are about AI

In [51]:
my_filter_helper = FilterHelper(topic_model = lda_model_100,
                                vectorizer = lda_vectorizer,
                               relevant_topics = relevant_topics_HT,
                               superkeywords = rel_ent_superkeyword[0:500],
                               keyword_list = core_terms,
                               total_topic_prop_threshold = 0.25,
                               keyword_prop_threshold = 0.15)

In [52]:
my_filter_helper

<__main__.FilterHelper at 0x7f6eb1c47b20>

In [53]:
filtered_corpus_text = filter_corpus(lda_model_100, my_filter_helper)

AttributeError: 'LatentDirichletAllocation' object has no attribute 'docs'