In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("C://Users//Ansh Jhoshi//Downloads//keyword_extraction//papers.csv")
df

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."
...,...,...,...,...,...,...,...
7236,994,1994,Single Transistor Learning Synapses,,994-single-transistor-learning-synapses.pdf,Abstract Missing,Single Transistor Learning Synapses\n\nPaul Ha...
7237,996,1994,"Bias, Variance and the Combination of Least Sq...",,996-bias-variance-and-the-combination-of-least...,Abstract Missing,"Bias, Variance and the Combination of\nLeast S..."
7238,997,1994,A Real Time Clustering CMOS Neural Engine,,997-a-real-time-clustering-cmos-neural-engine.pdf,Abstract Missing,A Real Time Clustering CMOS\nNeural Engine\nT....
7239,998,1994,Learning direction in global motion: two class...,,998-learning-direction-in-global-motion-two-cl...,Abstract Missing,Learning direction in global motion: two\nclas...


In [3]:
print("Number of unique values")
for col in df.columns:
    print(col, "    ", df[col].nunique())

Number of unique values
id      7241
year      31
title      7241
event_type      3
pdf_name      7241
abstract      3923
paper_text      7237


In [4]:
print("Unique values of particular columns\n")
print(df["event_type"].value_counts())
print(df["event_type"].isna().sum())

Unique values of particular columns

Poster       2146
Spotlight     181
Oral           95
Name: event_type, dtype: int64
4819


In [5]:
print("Unique values of particular columns\n")
print(df["year"].value_counts())
print(df["year"].isna().sum())

Unique values of particular columns

2017    679
2016    569
2014    411
2015    403
2012    368
2013    360
2011    306
2010    292
2009    262
2008    250
2007    217
2002    207
2004    207
2005    207
2006    204
2003    198
2001    197
1993    158
2000    152
1996    152
1995    152
1998    151
1999    150
1997    150
1991    144
1990    143
1994    140
1992    127
1989    101
1988     94
1987     90
Name: year, dtype: int64
0


In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
new_words = ["fig", "figure", "image", "sample", "using",
            "show", "result", "large", "also",
            "one", "two", "three", "four", "five",
            "six", "seven", "eight", "nine", "zero"]

stop_words = list(stop_words.union(new_words))


In [7]:
def cleaning(text):
    
    #convert everything to lowercase
    text = text.lower()
    
    #remove tags
    # lt gt - less than greater than
    text = re.sub("&lt;/?.*?&gt;" , "&lt;&gt;", text)
    
    #remove special characters and digits
    #\d - digit |(or) \W - not a word character
    # capital for inverse
    #() is for checking groups
    text = re.sub("(\\d|\\W)"," ", text)
    
    #covert to list from string, by splitting it
    text = text.split()
    
    #remove stopwords
    text = [words for words in text if words not in stop_words]
    
    #remove words less than 3 letters
    text = [words for words in text if len(words) >= 3]
    
    #lemmatize - get the root word
    #trim words to their root words
    lemma = WordNetLemmatizer()
    
    text = [lemma.lemmatize(word) for word in text]
    
    return ' '.join(text)


docs = df['paper_text'].apply(lambda x:cleaning(x))

docs

0       self organization associative database applica...
1       mean field theory layer visual cortex applicat...
2       storing covariance associative long term poten...
3       bayesian query construction neural network mod...
4       neural network ensemble cross validation activ...
                              ...                        
7236    single transistor learning synapsis paul hasle...
7237    bias variance combination least square estimat...
7238    real time clustering cmos neural engine serran...
7239    learning direction global motion class psychop...
7240    correlation interpolation network real time ex...
Name: paper_text, Length: 7241, dtype: object

In [8]:
docs.dtype

dtype('O')

In [11]:
#Tfidf - convert text to feature vectors

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_df = 0.95,        #ignore elements that appear in 95% documents
                    max_features = 1000,  #size of vocabulary
                    ngram_range = (1,2))   #vocabulary contains single words, bigrams, trigrams


word_counter = cv.fit_transform(docs)

word_counter

<7241x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2701864 stored elements in Compressed Sparse Row format>

In [12]:
"""

df = pd.DataFrame(record)
 
# Creating a dataframe with 50%
# values of original dataframe
part_50 = df.sample(frac = 0.5)
 
# Creating dataframe with
# rest of the 50% values
rest_part_50 = df.drop(part_50.index)

This is the way to divide the dataframe into 2 parts

"""

from sklearn.feature_extraction.text import TfidfTransformer

model = TfidfTransformer(smooth_idf = True, use_idf = True)
model.fit(word_counter)

TfidfTransformer()

In [18]:
def sort_it(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True) 


def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results


# get feature names
feature_names=cv.get_feature_names()



def get_keywords(idx, docs):

    #generate tf-idf for the given document
    tf_idf_vector=model.transform(cv.transform([docs[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_it(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords



idx = 941
keywords = get_keywords(idx, docs)

In [20]:
def print_results(idx,keywords, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])


print_results(idx,keywords, df)


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
update 0.45
rule 0.302
matrix 0.258
factorization 0.236
theorem 0.176
gradient 0.174
divergence 0.168
negative 0.162
factor 0.158
funct