# NMF Fitting

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic of a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``; each row is treated as
        one topic and each column as the weight of one vocabulary term.
    vectorizer : object
        Fitted vectorizer whose vocabulary order matches the columns of
        ``model.components_``. ``get_feature_names_out()`` is used when it
        exists; otherwise the legacy ``get_feature_names()`` is called.
    top_n : int, default 10
        Number of terms printed per topic, in descending weight order.

    Returns
    -------
    None
        Everything is written to standard output as ``(term, weight)`` tuples.
    """
    # Resolve the vocabulary once: rebuilding it for every printed term is
    # needlessly slow on a large vocabulary, and get_feature_names() no longer
    # exists in recent scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):  # idx = row index, topic = row of weights
        print("\nTopic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the top_n largest weights first
        for i in topic.argsort()[:-top_n - 1:-1]:
            # str() keeps the printed term a plain string even if the vocabulary holds numpy strings
            print((str(feature_names[i]), topic[i]))

In [4]:
# TODO: input a dataframe of AI-related abstracts; it must contain the column `final_frqwds_removed`.
# The data directory is named once so that running the notebook elsewhere needs a single edit.
DATA_DIR = r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD'
abstracts = pd.read_csv(DATA_DIR + '/bert_ai_abstracts.csv')
# NOTE(review): abstracts2 is not referenced by any cell visible here - confirm it is still needed.
abstracts2 = pd.read_csv(DATA_DIR + '/bert_ai_abstracts_2.csv')

In [3]:
# Select the `final_frqwds_removed` column; this is the text handed to the vectorizer in later cells.
lim_docs = abstracts["final_frqwds_removed"]
# Number of documents selected (shown as the cell output).
len(lim_docs)

22558

In [7]:
# The Scikit-Learn models used below (LDA, NMF) take one string per document,
# not a list of strings, so every entry of `lim_docs` is copied unchanged
# into a plain Python list.
text = list(lim_docs)

In [9]:
# Peek at the first document to check the format that will be passed to the vectorizer.
text[0:1]

["['effectiveness', 'computerize', 'instructional', 'adjunct', 'standard', 'integrate', 'brain', 'injury', 'rehabilitation', 'member', 'satisfaction', 'utilize', 'computerize', 'inform', 'participant', 'instructional', 'selection', 'maximize', 'benefit']"]

In [10]:
# Build the TF-IDF document-term matrix for the AI corpus.
# The vectorizer arguments are the knobs to experiment with.
nmf_vectorizer = TfidfVectorizer(
    lowercase=True,
    min_df=3,    # integer: ignore terms found in fewer than 3 documents
    max_df=1.0,  # float: proportion of documents, so 1.0 applies no upper cut-off
)  # a vocabulary cap that was considered: max_features=int(len(lim_docs)/2)

# TfidfVectorizer applies l2 normalisation to rows by default. From the
# Scikit-Learn documentation: each output row has unit norm (sum of squares of
# the vector elements is 1), so the cosine similarity between two rows is
# simply their dot product.
nmf_tf_idf = nmf_vectorizer.fit_transform(text)

# (number of documents, vocabulary size)
nmf_tf_idf.shape

(22558, 23678)

In [11]:
# Vocabulary of the fitted TF-IDF vectorizer, kept as a plain list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(nmf_vectorizer, "get_feature_names_out"):
    AI_terms = nmf_vectorizer.get_feature_names_out().tolist()
else:
    AI_terms = nmf_vectorizer.get_feature_names()

In [12]:
# Show vocabulary entries 1 through 9 (this slice skips the entry at index 0).
AI_terms[1:10]

['1000_genomes',
 '100_gigabase',
 '100x',
 '1012',
 '10_601_solicitation',
 '10th',
 '10th_grade',
 '10x',
 '1167']

# 50 Topics

In [18]:
# NMF topic model with 50 topics, fitted on the TF-IDF matrix.
# Change n_components to try a different number of topics.
nmf_model = NMF(random_state=1, n_components=50)

# W: transformed data returned by the fit (one row per document).
W = nmf_model.fit_transform(nmf_tf_idf)
# H: the fitted components, i.e. the rows that print_topics iterates over.
H = nmf_model.components_

print_topics(model=nmf_model, vectorizer=nmf_vectorizer, top_n=10)


Topic 0:
('science', 11.648691683007828)
('scientific', 2.913279702812651)
('computer', 2.6015072707604006)
('scientist', 2.21679274967142)
('citizen', 0.8550061730863956)
('collaboration', 0.8287452633986282)
('interdisciplinary', 0.8137891711456349)
('discipline', 0.8072069458310462)
('computational', 0.7933356121487791)
('inquiry', 0.6780273278930433)

Topic 1:
('brain', 6.081451426776679)
('imaging', 1.061623246620713)
('functional', 1.0527302782697665)
('neuroscience', 1.029983773490052)
('fmri', 0.8371160250410016)
('region', 0.8275769293897733)
('connectivity', 0.782642504536121)
('neuroimaging', 0.6632640582243604)
('mri', 0.5146158721220198)
('disorder', 0.46525151751257565)

Topic 2:
('student', 6.4533153092934565)
('course', 1.910493097791551)
('undergraduate', 1.2443842116749417)
('skill', 1.1642851634808566)
('faculty', 0.8425617535235923)
('graduate', 0.7523288343134794)
('instructor', 0.6056685969156311)
('college', 0.599223205099363)
('school', 0.5433000331994449)
('co

# 30 Topics

In [19]:
# NMF topic model with 30 topics, fitted on the TF-IDF matrix.
# Change n_components to try a different number of topics.
nmf_model = NMF(random_state=1, n_components=30)

# W: transformed data returned by the fit (one row per document).
W = nmf_model.fit_transform(nmf_tf_idf)
# H: the fitted components, i.e. the rows that print_topics iterates over.
H = nmf_model.components_

print_topics(model=nmf_model, vectorizer=nmf_vectorizer, top_n=10)




Topic 0:
('science', 6.616599589937251)
('scientific', 3.1381582772920638)
('scientist', 1.9075292488139284)
('computer', 1.2368303055758512)
('collaboration', 1.0296751483972815)
('interdisciplinary', 0.6380691243414848)
('discipline', 0.6237744064907922)
('cyberinfrastructure', 0.6074970571388755)
('citizen', 0.5192391939913057)
('discovery', 0.4628513812241846)

Topic 1:
('brain', 4.088675361595686)
('neuroscience', 0.7736573455926599)
('imaging', 0.735741842708921)
('functional', 0.7080715164100737)
('fmri', 0.5613019289567031)
('region', 0.5538502161500162)
('connectivity', 0.5322248778401627)
('neuroimaging', 0.45215808460245516)
('mri', 0.35063392469888144)
('disorder', 0.31613605364137587)

Topic 2:
('student', 4.554168798828008)
('course', 1.3670624038242878)
('skill', 0.9198300944100193)
('undergraduate', 0.9000467212048953)
('faculty', 0.6658673416348081)
('graduate', 0.5254266049973584)
('college', 0.4581778648683564)
('instructor', 0.45029726075693244)
('career', 0.418857

# 20 Topics

In [20]:
# NMF topic model with 20 topics, fitted on the TF-IDF matrix.
# Change n_components to try a different number of topics.
nmf_model = NMF(random_state=1, n_components=20)

# W: transformed data returned by the fit (one row per document).
W = nmf_model.fit_transform(nmf_tf_idf)
# H: the fitted components, i.e. the rows that print_topics iterates over.
H = nmf_model.components_

print_topics(model=nmf_model, vectorizer=nmf_vectorizer, top_n=10)


Topic 0:
('algorithm', 3.9101351017621417)
('optimization', 1.9675654955363222)
('computational', 1.4251959625055275)
('theory', 1.1819012763211847)
('solution', 0.9013837341171325)
('mathematical', 0.8762221024256794)
('computer', 0.8698038139387129)
('efficient', 0.8459697779195765)
('complexity', 0.7913816550296224)
('computation', 0.780355972397962)

Topic 1:
('brain', 3.2076891248434145)
('neuroscience', 0.7687632888580744)
('cognitive', 0.6912193919355095)
('functional', 0.6444193889904412)
('imaging', 0.6135358049599486)
('fmri', 0.5296183592129374)
('neuroimaging', 0.46789429012600575)
('region', 0.44693587708459437)
('connectivity', 0.4214388802102649)
('human', 0.4205487708921512)

Topic 2:
('student', 3.22582028926964)
('teacher', 1.2771546672083962)
('stem', 1.0349866446187344)
('mathematics', 1.0105262843665423)
('school', 0.7951116514674325)
('course', 0.7486279816969755)
('skill', 0.6233596674529364)
('classroom', 0.6211961389416918)
('assessment', 0.5906942152073664)
(

In [None]:
# hot and cold figure 