In [3]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install pdfminer

Collecting pdfminer
[?25l  Downloading https://files.pythonhosted.org/packages/71/a3/155c5cde5f9c0b1069043b2946a93f54a41fd72cc19c6c100f6f2f5bdc15/pdfminer-20191125.tar.gz (4.2MB)
[K     |████████████████████████████████| 4.2MB 8.0MB/s 
[?25hCollecting pycryptodome
[?25l  Downloading https://files.pythonhosted.org/packages/17/55/17fa0b55849dc135f7bc400993a9206bf06d1b5d9520b0bc8d47c57aaef5/pycryptodome-3.9.8-cp36-cp36m-manylinux1_x86_64.whl (13.7MB)
[K     |████████████████████████████████| 13.7MB 296kB/s 
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-cp36-none-any.whl size=6140087 sha256=3c0aff47c305a4e9c986fabdf25b3511c746e05b61ec5e2681daa9c9c10d1e1b
  Stored in directory: /root/.cache/pip/wheels/e1/00/af/720a55d74ba3615bb4709a3ded6dd71dc5370a586a0ff6f326
Successfully built pdfminer
Installing collected packages: pycryptodome, pdfminer
Successfully instal

In [4]:
def get_pdf_file_content(path_to_pdf):
    
    '''
    path_to_pdf: is the parameter that will give access to the PDF File 
    we want to extract the content.
    '''
    
    '''
    PDFResourceManager is used to store shared resources such as fonts or images that 
    we might encounter in the files. 
    '''
    
    resource_manager = PDFResourceManager(caching=True)
    
    '''
    create a string object that will contain the final text the representation of the pdf. 
    '''
    out_text = StringIO()
    
    '''
    UTF-8 is one of the most commonly used encodings, and Python often defaults to using it.
    In our case, we are going to specify in order to avoid some encoding errors.
    '''
    codec = 'utf-8'
    
    """
    LAParams is the object containing the Layout parameters with a certain default value. 
    """
    laParams = LAParams()
    
    '''
    Create a TextConverter Object, taking :
    - ressource_manager,
    - out_text 
    - layout parameters.
    '''
    text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
    fp = open(path_to_pdf, 'rb')

    '''
    Create a PDF interpreter object taking: 
    - ressource_manager 
    - text_converter
    '''
    interpreter = PDFPageInterpreter(resource_manager, text_converter)

    '''
    We are going to process the content of each page of the original PDF File
    '''
    for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
        interpreter.process_page(page)

    '''
    Retrieve the entire contents of the “file” at any time 
    before the StringIO object’s close() method is called.
    '''
    text = out_text.getvalue()

    '''
    Closing all the ressources we previously opened
    '''
    fp.close()
    text_converter.close()
    out_text.close()
    
    '''
    Return the final variable containing all the text of the PDF
    '''
    return text

In [6]:
# Convreting the PDF file to text file
path_to_pdf = '/content/drive/My Drive/Colab Notebooks/EIB_Data/AMEX_BDR_2018.pdf'

In [None]:
#print(get_pdf_file_content(path_to_pdf)

In [8]:
text1 = (get_pdf_file_content(path_to_pdf))
text1

'2018\nANNUAL\nREPORT\n\n(800) 523-6049 // www.blondertongue.com\n\n\x0cTo our Stockholders*\n\nOverview\n\nOur net sales for 2018 were $21,700,000, 6.8 % lower than \nthe $23,283,000 that we reported for 2017.  This was clearly \nnot in line with the goals we set for 2018, and not where we \nneed to be.  The reduced sales resulted in an operating loss \nof $(854,000), contrasted with operating income of $463,000, \nwhich  we  achieved  in  2017.    It  is  clear  from  our  reported \nperformance this year that we faced significant headwinds \nin  our  traditional  distribution-based  and  service-operator-\nbased markets. While we are concerned about prospective \nrevenues for the first half of 2019, we are diligently working \nthe  areas  that  need  to  be  addressed,  in  an  effort  to \nreverse this trend during the second half of the year.  The \nimprovement in our liquidity as a result of the sale/leaseback \nof  our  Old  Bridge  facility,  provides  us  with  the  flexibilit

In [9]:
# Tokenization:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text1.lower())
tokens[:10]

['2018',
 'annual',
 'report',
 '(',
 '800',
 ')',
 '523-6049',
 '//',
 'www.blondertongue.com',
 'to']

In [11]:
# To get the most frequent words on the document 
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text1.lower())
token_counts = Counter(tokens)
token_counts.most_common(10)
# A lof of stop words

[('...', 3776),
 ('the', 2355),
 (',', 2071),
 ('of', 1392),
 ('and', 1340),
 ('to', 1036),
 ('in', 722),
 ('company', 601),
 ('(', 477),
 (')', 477)]

In [12]:
# To remove the stop words:
import nltk
nltk.download('stopwords', quiet=True)
stopwords = nltk.corpus.stopwords.words('english')
tokens = [x for x in tokens if x not in stopwords]
BDR_counts = Counter(tokens)
BDR_counts.most_common(20)

[('...', 3776),
 (',', 2071),
 ('company', 601),
 ('(', 477),
 (')', 477),
 ('$', 306),
 ('’', 293),
 ('“', 206),
 ('”', 206),
 ('2018', 187),
 ('financial', 163),
 ('products', 155),
 ('december', 142),
 ('agreement', 142),
 ('31', 129),
 ('stock', 126),
 ('2017', 108),
 ('.', 108),
 ('may', 107),
 ('sales', 106)]

In [13]:
# To remove punctuations
puncs = set((',', '.', '--', '-', '!', '?', ':', ';', '``', "''", '(', ')', '[', ']','$','...','$','’','“','”', '%', '..', '•'))
tokens = [x for x in tokens if x not in puncs]
token_counts = Counter(tokens)
token_counts.most_common(100)

[('company', 601),
 ('2018', 187),
 ('financial', 163),
 ('products', 155),
 ('december', 142),
 ('agreement', 142),
 ('31', 129),
 ('stock', 126),
 ('2017', 108),
 ('may', 107),
 ('sales', 106),
 ('blonder', 92),
 ('tongue', 88),
 ('cable', 82),
 ('facility', 79),
 ('could', 79),
 ('new', 78),
 ('operations', 77),
 ('sterling', 77),
 ('net', 74),
 ('inc.', 71),
 ('common', 71),
 ('plan', 69),
 ('cash', 68),
 ('statements', 68),
 ('loan', 67),
 ('video', 65),
 ('laboratories', 65),
 ('assets', 65),
 ('including', 63),
 ('years', 62),
 ('market', 62),
 ('results', 62),
 ('digital', 62),
 ('certain', 61),
 ('sale', 61),
 ('shares', 60),
 ('income', 59),
 ('old', 59),
 ('bridge', 59),
 ('product', 59),
 ('approximately', 59),
 ('based', 57),
 ('customers', 57),
 ('consolidated', 57),
 ('form', 56),
 ('registrant', 56),
 ('subordinated', 56),
 ('incorporated', 55),
 ('loss', 54),
 ('2019', 54),
 ('ended', 54),
 ('reference', 54),
 ('manufacturing', 54),
 ('increase', 53),
 ('value', 53),
 

In [14]:
# To remove numbers
tokens = [x for x in tokens if x.isalpha()]
token_counts = Counter(tokens)
token_counts.most_common(20)

[('company', 601),
 ('financial', 163),
 ('products', 155),
 ('december', 142),
 ('agreement', 142),
 ('stock', 126),
 ('may', 107),
 ('sales', 106),
 ('blonder', 92),
 ('tongue', 88),
 ('cable', 82),
 ('facility', 79),
 ('could', 79),
 ('new', 78),
 ('operations', 77),
 ('sterling', 77),
 ('net', 74),
 ('common', 71),
 ('plan', 69),
 ('cash', 68)]

In [21]:
# Number of words in the document (tokens)
len(tokens)

18748

In [19]:
# Vectorizing 
document_vector = []
doc_length = len(tokens)
for key, value in token_counts.most_common():
  document_vector.append(value / doc_length)
document_vector

[0.03205675272029016,
 0.008694260721143588,
 0.008267548538510774,
 0.007574141241732451,
 0.007574141241732451,
 0.0067207168764668234,
 0.005707275442713889,
 0.0056539364198847876,
 0.004907190100277363,
 0.004693834008960956,
 0.004373799871986345,
 0.00421378280349904,
 0.00421378280349904,
 0.004160443780669938,
 0.004107104757840836,
 0.004107104757840836,
 0.003947087689353531,
 0.0037870706208662256,
 0.003680392575208022,
 0.0036270535523789203,
 0.0036270535523789203,
 0.0035737145295498185,
 0.0034670364838916153,
 0.0034670364838916153,
 0.0034670364838916153,
 0.0033603584382334117,
 0.00330701941540431,
 0.00330701941540431,
 0.00330701941540431,
 0.00330701941540431,
 0.003253680392575208,
 0.003253680392575208,
 0.0032003413697461063,
 0.0031470023469170045,
 0.0031470023469170045,
 0.0031470023469170045,
 0.0031470023469170045,
 0.0031470023469170045,
 0.003040324301258801,
 0.003040324301258801,
 0.003040324301258801,
 0.002986985278429699,
 0.002986985278429699,
 0

In [20]:
# Vectorizing with TFDIF function
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = tokens
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(corpus)
print(model.todense().round(5))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
