In [11]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances


#from gensim.models.coherencemodel import CoherenceModel

import TextCleaning

In [12]:
# Load the raw abstracts table from disk.
raw_df = pd.read_csv('../../data/original/raw_abstracts.csv', engine='python')

# Drop rows with a null ABSTRACT, then drop duplicates (both helpers live in
# the project-local TextCleaning module).
abstracts_no_nulls = TextCleaning.remove_nulls(raw_df, "ABSTRACT")
abstracts_deduped = TextCleaning.remove_duplicates(abstracts_no_nulls)

# Renumber the rows 0..n-1 and keep the previous index as 'original index'.
# Built as a non-mutating chain (no inplace=True) so re-running the cell can
# never modify a frame that an earlier step still refers to.
df = (
    abstracts_deduped
    .reset_index()
    .rename(columns={'index': 'original index'})
)

3 nulls in  ABSTRACT . These rows removed.
11 duplicate abstracts removed
0 project ID duplicates - not removed


In [13]:
# Only the last expression of a cell is rendered, so the bare `df.head()` that
# used to precede this line never displayed anything; list the columns only.
df.columns


Index(['original index', 'PROJECT_ID', 'ABSTRACT', 'FY', 'FIRST_CHAR',
       'LAST_CHAR', 'DEPARTMENT', 'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER',
       'PROJECT_TITLE', 'PROJECT_TERMS', 'CONTACT_PI_PROJECT_LEADER',
       'OTHER_PIS', 'ORGANIZATION_NAME', 'CFDA_CODE', 'FY_TOTAL_COST'],
      dtype='object')

In [14]:
# The ABSTRACT column is the text that gets tokenised and modelled below.
docs = df.loc[:, "ABSTRACT"]

In [15]:
# Display the abstracts Series (the rendered output below is the truncated
# head/tail view together with its length and dtype).
docs

0         This is a project to explore Game-based, Metap...
1         Institution: Franklin Institute Science Museum...
2         Through programs (including small group conver...
3         In partnership with the American Chemical Soci...
4         Amphibian populations around the world are exp...
                                ...                        
550069    The Title IV-E Prevention Services Clearinghou...
550070    This mixed-methods study seeks to deepen our u...
550071    The purpose of this project is to examine the ...
550072    The 2014 Child Care and Development Block Gran...
550073    The goal of this study is to understand the us...
Name: ABSTRACT, Length: 550074, dtype: object

In [16]:
#!pip install spacy
import spacy
from spacy.lang.en import English
# Shared spaCy English language object; tokenize() calls it on each document
# and iterates over the resulting tokens.
parser = English()
def tokenize(text):
    """Split ``text`` into a list of normalised token strings.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is emitted in lower case.
    """
    def _normalise(tok):
        # URL check comes first, mirroring the precedence of the placeholders.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [17]:
import nltk
# Fetch the WordNet corpus needed by the lemma helpers below; the recorded
# output shows NLTK reporting the package as already up-to-date when present.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    ``wn.morphy`` is asked for the base form; when it returns ``None`` the
    input word is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatise ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to /home/mc9bn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Make sure the stop-word corpus is available, then collect the English list
# into a set so membership checks during token filtering are cheap.
nltk.download('stopwords')
en_stop = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to /home/mc9bn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def prepare_text_for_lda(text, min_length=5):
    """Turn one raw document into the list of tokens used for topic modelling.

    Parameters
    ----------
    text : str
        Raw document text; it is passed to ``tokenize``.
    min_length : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 5 reproduces the previous hard-coded ``len(token) > 4``.

    Returns
    -------
    list of str
        Tokens that are long enough and not in ``en_stop``, each mapped
        through ``get_lemma``. Order follows the order in the text.
    """
    # Length and stop-word filters run on the raw token, before lemmatisation,
    # exactly as in the original three-pass version.
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_length and token not in en_stop
    ]

In [None]:
import random

# Keep roughly 1% of the abstracts for the LDA experiment.
SAMPLE_SEED = 42         # fixed seed so the same abstracts are sampled each run
SAMPLE_THRESHOLD = .99   # a document is kept when its random draw exceeds this

# A dedicated generator leaves the global `random` state untouched.
sample_rng = random.Random(SAMPLE_SEED)

# The random draw is evaluated first, so the expensive tokenise/lemmatise step
# only runs for documents that are actually kept, not for every abstract.
text_data = [
    prepare_text_for_lda(abstract)
    for abstract in docs
    if sample_rng.random() > SAMPLE_THRESHOLD
]

In [None]:
from gensim import corpora
# Build a gensim Dictionary from the sampled token lists; it is used below to
# convert documents to bag-of-words and as the model's id2word mapping.
dictionary = corpora.Dictionary(text_data)

In [None]:
# Convert every sampled token list to its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, text_data))

In [None]:
import pickle

# Persist the bag-of-words corpus. The context manager guarantees the file is
# flushed and closed even if pickling fails (the handle was previously leaked).
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

# Persist the dictionary next to it, using gensim's own save format.
dictionary.save('dictionary.gensim')

In [None]:
import gensim

NUM_TOPICS = 5
LDA_PASSES = 15   # number of passes over the corpus during training
LDA_SEED = 42     # fixed seed so repeated runs produce the same topics

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_SEED,
)

# File name is kept literal because the reload cell loads 'model5.gensim'.
ldamodel.save('model5.gensim')

In [None]:
# Ask the trained model for its topics, limited to four words each, and print
# one topic per line.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
# Reload the artifacts written by the earlier cells so the visualisation can be
# run without retraining.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')

# NOTE: pickle.load executes arbitrary code from the file; only load a
# corpus.pkl that this notebook produced. The context manager closes the
# handle, which was previously leaked.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [None]:
import pyLDAvis

# pyLDAvis 3.x renamed the gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell works with either release line.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# sort_topics=False is passed through unchanged from the original call.
lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)