In [29]:
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 27 18:55:35 2019

@author: JM
"""

import os
import re
import nltk
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from tkinter.filedialog import askopenfilename 
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter,resolve1
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter


class preprocessing:
    def __init__(self, input_path, output_path=None):
        self.input_path = input_path
        self.output_path = output_path

    def isExistFile(self):
        file_name = self.output_path.split('/')[-1]

        for i in os.listdir("."):
            if file_name == i:
                return True

        return False

    def pdf2txt(self):
        '''
        input_path : str, PDF File

        =============================

        return : str, text File path
        '''

        # input
        password=''
        pagenos=set()
        maxpages=0

        # output
        imagewriter = None
        rotation = 0
        codec = 'UTF-8'
        pageno = 1
        scale = 1
        caching = True
        showpageno = True
        laparams = LAParams()

        infp = open(self.input_path,"rb")
        
        if self.output_path == None:
            self.output_path = self.input_path[:-4]+'_trans.txt'
            outfp = open(self.output_path,"w",encoding='UTF8')
        else:
            outfp = open(self.output_path,"w",encoding='UTF8')
            
            
        #page total num
        parser = PDFParser(infp)
        document = PDFDocument(parser)
        page_total_num = resolve1(document.catalog['Pages'])['Count']

        #
        rsrcmgr = PDFResourceManager(caching=caching)

        # pdf -> text converter
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams, 
                               imagewriter=imagewriter)

        # pdf -> text interpreter
        interpreter = PDFPageInterpreter(rsrcmgr,device)

        # pdf -> text start
        with tqdm(total=page_total_num) as pbar:
            for page in PDFPage.get_pages(infp,
                                          pagenos,
                                          maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):

                page.rotate = (page.rotate+rotation) % 360     
                interpreter.process_page(page)

                pbar.update(1)

        print('[INFO] pdf -> text')

        outfp.close()
        infp.close()
    
    def clean_text(self):
        '''
        path : str, text File Path


        ===========================

        return : list, sentences
        '''

        f = open(self.output_path,"rb")
        line_list = []

        while True:
            line = f.readline()
            line_list.append(line)
            if not line: break

        f.close()

        # remove nextline
        word = b" ".join(line_list).split()
        sentences = b" ".join(word)


        # remove ASCII
        # define pattern 
        pattern = re.compile(b"[\x80-\xff]")
        sentences = re.sub(pattern,b"",sentences)

        sentences = sentences.split(b". ")

        f = open(self.output_path,"wb")

        for sentence in sentences:
            sentence = sentence.replace(b"- ",b'')
            sentence = sentence.replace(b"-",b'')
            #cleaned_txt.append(sentence)
            f.write(sentence + b'. ')

        f.close()
        
        print('[INFO] clean text file')
        
    def example
        
    def word_Frequency(self):
        f = open(self.output_path,"r")

        text = f.readline()

        # 단어의 빈도수
        shortword = re.compile(r'\W*\b\w{1,2}\b')
        text = shortword.sub('', text)

        stop_words = set(stopwords.words('english')) 
        word_tokens = word_tokenize(text)

        result = [] 

        # 불용어 제거
        for w in word_tokens: 
            if w not in stop_words: 
                parsing = ''.join([i for i in w if not i.isdigit()]) 
                parsing = re.sub('[-=+,#/\?:^$.@*\"※~&%}{ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', parsing)


                if parsing and parsing.isdigit() == False and len(parsing) > 2:
                    result.append(w)  

        cnt = Counter(result)
        
        print('[INFO] generation word frequency')
        
        return cnt
    


input_path1 ='C:/Users/JM/Desktop/MCNN.pdf'
input_path2 ='C:/Users/JM/Desktop/YOLO.pdf'

pdf1 = preprocessing(input_path1)
pdf2 = preprocessing(input_path2)

pdf1.pdf2txt()
pdf2.pdf2txt()

pdf1.clean_text()
pdf2.clean_text()

cnt1 = pdf1.word_Frequency()
cnt2 = pdf2.word_Frequency()

cnt = cnt1 + cnt2

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.27it/s]


[INFO] pdf -> text


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00,  5.02it/s]


[INFO] pdf -> text
[INFO] clean text file
[INFO] clean text file
[INFO] generation word frequency
[INFO] generation word frequency


In [30]:
cnt

Counter({'adopted': 1,
         'connected': 14,
         'support': 1,
         'generic': 1,
         'vision': 1,
         'methods': 25,
         'composed': 1,
         'scene': 9,
         'shanghaitech.edu': 1,
         'layer': 50,
         'agglomerative': 1,
         'Acknowledgement': 1,
         'distributed': 2,
         'derive': 1,
         'correctly': 1,
         'remaining': 2,
         'Many': 1,
         'Transfer': 2,
         'sanctuaries': 1,
         'provides': 1,
         'embedding': 1,
         'Shapebased': 1,
         'Class|': 2,
         'score': 23,
         'single': 15,
         'trafc': 2,
         'evaluation': 5,
         'available': 1,
         'Networks': 2,
         'Unfortunately': 3,
         'Training': 15,
         'features': 22,
         'propose': 4,
         'Average': 2,
         'uses': 4,
         'kernels': 6,
         'group': 4,
         'Both': 1,
         'coord': 1,
         'piecewise': 1,
         'layers': 15,
         'MCNN

In [33]:
sorted_cnt = sorted(cnt.items(), key=lambda t : t[1],reverse=True)
sorted_values = sorted(cnt.values(), reverse=True)
sorted_keys = sorted(cnt, key=cnt.get, reverse=True)

print(sorted_cnt[:30])

[('image', 93), ('crowd', 92), ('box', 86), ('object', 80), ('density', 71), ('dataset', 64), ('cell', 58), ('YOLO', 54), ('detection', 53), ('layer', 50), ('images', 49), ('grid', 49), ('MCNN', 48), ('bounding', 48), ('network', 46), ('map', 42), ('model', 37), ('The', 36), ('different', 35), ('Part', 34), ('training', 34), ('counting', 33), ('confidence', 31), ('class', 31), ('number', 29), ('error', 29), ('data', 28), ('methods', 25), ('method', 25), ('score', 23)]


In [32]:
import pytagcloud
import pandas as pd
import webbrowser

def generate_cloud(dic, top, save_path):
    taglist = pytagcloud.make_tags(dict(dic[:top]).items())
    pytagcloud.create_tag_image(taglist, save_path, size=(360, 280), rectangular=False)

In [5]:
generate_cloud(sorted_cnt, 50, './cloud.jpg')

In [6]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

print("Matplotlib version", matplotlib.__version__)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Matplotlib version 3.0.2


In [10]:
import plotnine
from plotnine import *
print("plontnine version :",plotnine.__version__)

plontnine version : 0.5.1


In [26]:
df = pd.DataFrame(sorted_cnt[:30],columns=['word','freq'])

df = pd.DataFrame({
    'word' : sorted_keys[:30] * 2,
    'freq' : sorted_values[:30] * 2
})


(ggplot(df)
 + geom_col(aes(x='freq',y='word',fill='freq'))
 + scale_color_hue(l=0.45)                                  # some contrast to make the lines stick out
 + ggtitle('Greek Letter Analysis')
 + theme(axis.text.x=element_text(angle=45, hjust=1))
)

SyntaxError: keyword can't be an expression (<ipython-input-26-290d0a66b113>, line 13)