## Import

In [1]:
import io

import PyPDF2 #read the pdf

import pandas as pdd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Getting the data

In [2]:
# Buffer the whole PDF in memory so the OS file handle is closed immediately.
# A later cell still pulls pages through `fileReader`, so the stream handed to
# the reader must stay readable; an in-memory stream satisfies that without
# leaving a file descriptor open for the life of the kernel.
with open('JavaBasics-notes.pdf', 'rb') as pdf_handle:
    file = io.BytesIO(pdf_handle.read())
fileReader = PyPDF2.PdfFileReader(file)

# Number of pages in the document.
total = fileReader.numPages

#### Helper function: get the top feature names (keywords) for each topic

In [3]:
def get_topics(model, feature_names, no_top_words):
    """Return the highest-weighted feature names of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each entry of ``components_`` is one topic's weight vector and must
        support ``argsort()``.
    feature_names : sequence of str
        Vocabulary indexed by the positions used in ``components_``.
    no_top_words : int
        How many of the highest-weighted features to keep per topic.

    Returns
    -------
    list of str
        One string per topic: its top features joined by single spaces,
        ordered from highest to lowest weight.
    """
    return [
        " ".join(
            feature_names[position]
            # argsort is ascending, so reverse it and keep the leading entries.
            for position in component.argsort()[::-1][:no_top_words]
        )
        for component in model.components_
    ]


In [4]:
# Collect the extracted text of every page, in page order (one string per page).
tra = [fileReader.getPage(page_index).extractText() for page_index in range(total)]

### Algorithms:
 NMF: Non-negative Matrix Factorization  
 LDA: Latent Dirichlet Allocation

In [5]:
# Each extracted page is treated as one document.
documents = tra

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Map every vocabulary term to its inverse document frequency; later cells
# look keywords up in `x`. The feature-name list computed above is reused
# instead of asking the vectorizer for it a second time.
idf = tfidf_vectorizer.idf_
x = dict(zip(tfidf_feature_names, idf))

# LDA is a probabilistic model over word counts, so it is fitted on raw term counts
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Ask for as many topics as there are pages.
no_topics = len(tra)

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# `n_components` replaces the `n_topics` keyword, which scikit-learn deprecated
# in 0.19 and removed in 0.21 (passing it there raises a TypeError).
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)





In [6]:
no_top_words = 10  # number of highest-weighted words kept for each topic

In [7]:
# One space-separated keyword string per NMF component (tf-idf vocabulary).
all_ = get_topics(nmf, tfidf_feature_names, no_top_words)  # NMF

In [8]:
# One space-separated keyword string per LDA component (term-count vocabulary).
all_2 = get_topics(lda, tf_feature_names, no_top_words)  # LDA

### Getting weights (the IDF value of each topic keyword)

In [9]:
# keyword -> IDF value, for the NMF keywords (`weights`) and the LDA keywords
# (`weights_2`). Keywords that are missing from the IDF lookup `x` are skipped.
weights, weights_2 = {}, {}
for topic_no in range(len(all_)):
    nmf_words = all_[topic_no].split(' ')
    lda_words = all_2[topic_no].split(' ')
    # update() keeps a key's first-seen position, exactly like item assignment.
    weights.update({word: x[word] for word in nmf_words if word in x})
    weights_2.update({word: x[word] for word in lda_words if word in x})
        

### Making dataframe

In [10]:
# Tabulate the NMF keyword/value pairs from `weights`; the 'topic' column holds
# the keyword itself and 'weights' the value stored for it.
df1 = pdd.DataFrame(list(weights.items()), columns=['topic', 'weights'])

In [11]:
# Tabulate the LDA keyword/value pairs from `weights_2`; the 'topic' column holds
# the keyword itself and 'weights' the value stored for it.
df2 = pdd.DataFrame(list(weights_2.items()), columns=['topic', 'weights'])

In [12]:
# Show the NMF keyword table as plain text.
print(df1)

            topic   weights
0      parameters  3.079442
1         defines  3.079442
2    getparameter  3.079442
3        instance  2.791759
4           added  3.079442
5         boolean  2.568616
6          object  1.613104
7            stop  3.079442
8            data  2.098612
9         pointer  2.791759
10           does  2.098612
11         return  2.791759
12       existing  3.079442
13          stack  2.568616
14         prints  3.079442
15        methods  2.568616
16        machine  3.079442
17      following  2.098612
18           true  2.791759
19           make  2.568616
20          class  1.538997
21    compilation  3.079442
22           loop  3.079442
23          array  1.980829
24         button  2.568616
25         string  1.980829
26           byte  2.791759
27        applets  2.568616
28          refer  2.386294
29     arithmetic  3.079442
..            ...       ...
96        library  3.079442
97        garbage  2.232144
98           null  2.232144
99         arrays  3

In [13]:
# Show the LDA keyword table as plain text.
print(df2)

            topic   weights
0         similar  2.386294
1         defines  3.079442
2    getparameter  3.079442
3        instance  2.791759
4         boolean  2.568616
5          method  1.875469
6            stop  3.079442
7            file  2.568616
8            data  2.098612
9        checking  3.079442
10           does  2.098612
11         return  2.791759
12        garbage  2.232144
13       comments  3.079442
14        methods  2.568616
15        machine  3.079442
16      following  2.098612
17        version  2.791759
18          class  1.538997
19        sockets  3.079442
20          shows  2.791759
21           loop  3.079442
22          types  2.232144
23         button  2.568616
24         easily  2.791759
25         string  1.980829
26           byte  2.791759
27          refer  2.386294
28             10  2.791759
29        program  2.098612
..            ...       ...
101        arrays  3.079442
102         paint  3.079442
103    references  2.791759
104     reference  2

In [14]:
# List the keyword string of every NMF component; the printed number is the
# component's position in `all_`.
print('NMF')
for component_no, keywords in enumerate(all_):
    print('page = ', component_no, 'keywords : ', keywords)

NMF
page =  0 keywords :  button new ok init allocate sizeof calloc objects null create
page =  1 keywords :  method applet main args browser class called file inthis static
page =  2 keywords :  machine platform library code byte applets program arithmetic pointer portable
page =  3 keywords :  data new stack public int code true consider return types
page =  4 keywords :  literal constant int 14 default package unicode false expressions note
page =  5 keywords :  args method new operator note set expr added constructor class
page =  6 keywords :  basicsjava www garbage following final file features false extends expressions
page =  7 keywords :  left example final file features false extends expressions expr existing
page =  8 keywords :  expr boolean passed int loop const statements objects reference types
page =  9 keywords :  www example final file features false extends expressions expr existing
page =  10 keywords :  comments p2 code does use output getparameter prints width pla

In [15]:
# List the keyword string of every LDA component; the printed number is the
# component's position in `all_2`.
print('LDA')
for component_no, keywords in enumerate(all_2):
    print('page = ', component_no, 'keywords : ', keywords)

LDA
page =  0 keywords :  array int element new arrays allocate code static final basicsjava
page =  1 keywords :  garbage new memory collection instance codevoid collector following int automatic
page =  2 keywords :  data new return int public language object code button programs
page =  3 keywords :  string left use objects type compiler look section elements element
page =  4 keywords :  new data method code language class program args operator machine
page =  5 keywords :  prints wide executed init secure test hi create contain sockets
page =  6 keywords :  instantiated parameters reference suspend statements basicsjava animations false references sockets
page =  7 keywords :  basicsjava consider constructor automatic executed literal value variable left platform
page =  8 keywords :  expr boolean int passed objects loop statements reference method primitive
page =  9 keywords :  button new string applet array objects object int code ok
page =  10 keywords :  appletviewer static t

In [18]:
# Save the keyword tables as CSV files: df1 (NMF keywords) to NMF.csv and
# df2 (LDA keywords) to LDA.csv, using paths relative to the working directory.
df1.to_csv('NMF.csv')
df2.to_csv('LDA.csv')