In [1]:
import re
import nltk

In [2]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import numpy as np

In [3]:
# Fetch the NLTK resources used below: the English stop-word list and the
# WordNet database that WordNetLemmatizer looks words up in.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the papers dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('papers.csv')

In [5]:
# Check the size of the loaded frame as (rows, columns).
data.shape

(7241, 7)

In [6]:
# Baseline: NLTK's built-in English stop words.
stop_words = set(stopwords.words('english'))

# Extra stop words specific to this corpus: figure/result boilerplate and
# spelled-out numbers. Built directly as a set.
custom_stop_words = {
    'fig', 'figure', 'sample', 'image', 'using',
    'show', 'result', 'large',
    'also',
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'ten',
}

In [7]:

# Merge the built-in and custom stop words into a single list.
# Both operands are coerced to sets first so this cell can be re-run:
# after the first run `stop_words` is already a list, and `list | set`
# raises TypeError.
stop_words = list(set(stop_words) | set(custom_stop_words))


In [8]:
def pre_process(text):
    """Normalise raw document text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip markup tags, collapse every run of
    digits / non-word characters into one space, drop stop words and tokens
    shorter than three characters, then lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.

    Notes
    -----
    Reads the notebook-level ``stop_words`` collection.
    NOTE(review): stop words are matched *before* lemmatization, so inflected
    forms of a stop word (e.g. a plural) are not filtered - confirm intended.
    """
    # lowercase
    text = text.lower()

    # remove tags: real markup such as "<b>" / "</div>" as well as the
    # entity-escaped form ("&lt;b&gt;"). The previous pattern only matched
    # the escaped form, so genuine tags (and their attribute names) leaked
    # into the token stream.
    text = re.sub("</?[a-z][^<>]*>|&lt;/?.*?&gt;", " ", text)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    # convert to list from string
    tokens = text.split()

    # remove stopwords and words of fewer than three letters in one pass;
    # a set gives O(1) membership tests whatever container stop_words is
    stop_set = set(stop_words)
    tokens = [word for word in tokens if len(word) >= 3 and word not in stop_set]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    return ' '.join(lmtzr.lemmatize(word) for word in tokens)

In [9]:
# Fetch the Open Multilingual WordNet data.
# NOTE(review): presumably required by WordNetLemmatizer on the installed
# NLTK version - confirm, and consider moving next to the other downloads.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
# Clean the text of the first 1000 papers; pre_process is passed directly
# instead of being wrapped in a lambda.
docs = data['paper_text'].iloc[:1000].apply(pre_process)

In [11]:
# Preview the cleaned documents.
docs

0      self organization associative database applica...
1      mean field theory layer visual cortex applicat...
2      storing covariance associative long term poten...
3      bayesian query construction neural network mod...
4      neural network ensemble cross validation activ...
                             ...                        
995    farotimi demho kailath neural network weight m...
996    foundation circuit complexity theory sensory p...
997    reinforcement learning function approximation ...
998    smart vision chip fabricated dimensional integ...
999    shape context new descriptor shape matching ob...
Name: paper_text, Length: 1000, dtype: object

In [12]:
# Using TF-IDF: first build the term-count matrix the weights are computed from
from sklearn.feature_extraction.text import CountVectorizer

# create a vocabulary of words
cv = CountVectorizer(max_df=0.85,         # ignore terms that appear in more than 85% of documents
                     max_features=1500,   # keep only the 1500 most frequent terms
                     ngram_range=(1, 3)   # vocabulary contains single words, bigrams, trigrams
                     )
# fit_transform learns the vocabulary and produces the count matrix in one
# pass; calling fit() and then transform() tokenises the whole corpus twice.
word_count_vector = cv.fit_transform(docs)

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn inverse-document-frequency weights from the term-count matrix.
# use_idf=True enables IDF re-weighting; smooth_idf=True smooths the document
# frequencies so the IDF computation never divides by zero.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [14]:
import pickle 

In [15]:
# Persist the fitted vectorizer and TF-IDF transformer.
# The `with` blocks guarantee each file is flushed and closed; the bare
# open(...) calls used before left the handles open until garbage collection.
with open('countvectorizer.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)
with open('transformer.pkl', 'wb') as transformer_file:
    pickle.dump(tfidf_transformer, transformer_file)

In [16]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, highest value first.

    Pairs are built from ``coo_matrix.col`` and ``coo_matrix.data``; ties on
    the value are broken by the larger column index coming first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # order by value, then by column index, both descending
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

In [30]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items

    Parameters
    ----------
    feature_names : sequence
        Indexable collection mapping a feature index to its name.
    sorted_items : sequence of (index, score) pairs
        Pairs already ordered best-first; only the first ``topn`` are used.
    topn : int, default 10
        Maximum number of entries to return.

    Returns
    -------
    dict
        ``{feature name: score rounded to 3 decimals}``, in the same order
        as ``sorted_items``.
    """
    # use only topn items from vector
    top_items = sorted_items[:topn]

    # build the name -> score mapping in a single pass (no parallel lists,
    # no unused temporaries)
    return {feature_names[idx]: round(score, 3) for idx, score in top_items}

In [31]:
# Vocabulary terms learned by the vectorizer; the keyword helpers below look
# terms up in this array by column index.
feature_names = cv.get_feature_names_out()

In [32]:
# Spot-check a few vocabulary entries.
feature_names[230:234]

array(['computer science', 'computing', 'concept', 'conclusion'],
      dtype=object)

In [38]:
# Persist the vocabulary terms; the `with` block closes the file handle that
# the bare open(...) call used before never released.
with open('feature_names.pkl', 'wb') as feature_names_file:
    pickle.dump(feature_names, feature_names_file)

In [23]:
def get_keywords(idx, docs):
    """Return the top TF-IDF keywords for the document stored at ``docs[idx]``.

    Parameters
    ----------
    idx
        Key used to look the document up via ``docs[idx]``.
    docs
        Indexable collection of pre-processed document strings.

    Returns
    -------
    dict
        Keyword -> rounded TF-IDF score, best first.
    """
    # Delegate to get_keywords_text so the vectorise / sort / extract
    # pipeline lives in one place instead of being duplicated here.
    return get_keywords_text(docs[idx])

In [28]:
def get_keywords_text(docs, topn=10):
    """Return the top TF-IDF keywords for a single pre-processed document.

    Parameters
    ----------
    docs : str
        One document's text (despite the plural name, a single string); it
        is wrapped in a list before being vectorised.
    topn : int, default 10
        Maximum number of keywords to return.

    Returns
    -------
    dict
        Keyword -> rounded TF-IDF score, best first.

    Notes
    -----
    Uses the notebook-level ``cv``, ``tfidf_transformer`` and
    ``feature_names`` objects.
    """
    # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs]))

    # sort the tf-idf vectors by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # extract only the top n
    return extract_topn_from_vector(feature_names, sorted_items, topn)

In [27]:
# Pick one cleaned document to try the extractor on.
testing = docs[227]

In [33]:
# Extract the keyword -> score mapping for the selected document.
keywords = get_keywords_text(testing)

In [34]:
# Show each extracted keyword next to its score.
for term, score in keywords.items():
    print(term, score)
    

state 0.474
log 0.353
log likelihood 0.292
circuit 0.29
word 0.246
likelihood 0.244
chip 0.184
hmm 0.132
analog 0.132
transistor 0.126


In [35]:
def print_results(idx,keywords, df):
    """Print the title, abstract and extracted keywords for one document.

    ``df`` is indexed as ``df[column][idx]`` for the 'title' and 'abstract'
    columns; ``keywords`` maps each keyword to its score.
    """
    # headed sections taken from the data frame
    for heading, column in (("Title", 'title'), ("\nAbstract", 'abstract')):
        print(heading)
        print(df[column][idx])

    # keyword / score pairs, one per line
    print("\nKeywords for this text")
    for term, score in keywords.items():
        print(term, score)

In [37]:
# End-to-end example: extract keywords for one cleaned document and print them
# with the title and abstract stored under the same key in the raw frame.
idx=127
keywords=get_keywords(idx, docs)
print_results(idx,keywords, data)

Title
Stock Selection via Nonlinear Multi-Factor Models

Abstract
Abstract Missing

Keywords for this text
return 0.605
factor 0.386
model 0.338
risk 0.17
nonlinear 0.156
short 0.149
long 0.123
performance 0.119
network 0.117
linear model 0.105
