# TextRank EN v01

## Imports

In [1]:
# Load the lab_black extension (presumably auto-formats code cells with Black).
%load_ext lab_black

In [2]:
import sys

# Add the parent directory to the import search path. The entry is relative,
# so it resolves against the kernel's working directory. The project modules
# `utils` and `types_` imported below presumably live there — TODO confirm.
sys.path.append("..")

In [24]:
import os
import re
from glob import glob

import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords

# Alias for the corpus reader: a later cell rebinds `stopwords` to a plain list.
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import pdf_to_text
from types_ import *

# ignore warning
import warnings

warnings.filterwarnings(action="ignore")

In [4]:
# Load the spaCy "en_core_web_sm" English pipeline once; later cells call it to
# tokenize text and read part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

## 01. Data Load

In [5]:
# Folder scanned for input PDFs (the `en` subfolder — presumably the English
# documents; verify against the data layout).
dir_path = "../data/en"
# glob() returns matches in arbitrary, filesystem-dependent order. Sort the
# paths so positional picks such as file_list[2] select the same file on
# every machine and every run.
file_list = sorted(glob(f"{dir_path}/*.pdf"))

## 02. Extract Text from PDF

In [6]:
%%time

# Pick one PDF as the working sample and extract its text with the project
# helper `pdf_to_text`.
# NOTE(review): index 2 is hard-coded, so the chosen file depends on the order
# of `file_list`, and this raises IndexError when fewer than three PDFs exist.
# `sents` is indexed and measured with len() below — presumably a list of
# sentence strings; confirm against `utils.pdf_to_text`.
sample = file_list[2]
sents = pdf_to_text(sample)

CPU times: user 1.25 s, sys: 51.4 ms, total: 1.31 s
Wall time: 1.35 s


In [7]:
# Number of text segments extracted from the sample PDF.
len(sents)

182

## 03. Summarization using spaCy

In [18]:
# Run the spaCy pipeline on the first extracted segment to inspect its output.
doc = nlp(sents[0])

In [21]:
# Print each token next to its part-of-speech tag (`token.pos_`).
for token in doc:
    print(token, token.pos_)

Is AUX
your DET
intelligent ADJ
digital ADJ
workforce NOUN
secure ADJ
against ADP
cyberattacks NOUN


In [28]:
class LemmaTokenizer:
    """Callable tokenizer that converts a text into a list of spaCy lemmas.

    Instances are passed as the ``tokenizer=`` argument of
    ``TfidfVectorizer`` in this notebook.

    Parameters
    ----------
    use_pos : bool, default False
        When truthy, keep only tokens whose ``pos_`` tag is ``"NOUN"`` or
        ``"ADJ"``.
    nlp : callable, optional
        An already-loaded spaCy pipeline to reuse. When omitted, the
        ``en_core_web_sm`` model is loaded for this instance (the original
        behavior). Passing the notebook's existing pipeline avoids loading a
        second copy of the model.

    Attributes
    ----------
    spacynlp : callable
        The pipeline applied to each input text.
    pos_filter : list of str or False
        The allowed ``pos_`` tags, or ``False`` when no POS filtering is done.
    """

    def __init__(self, use_pos=False, nlp=None):
        # Only load a model when the caller did not supply one.
        self.spacynlp = spacy.load("en_core_web_sm") if nlp is None else nlp
        self.pos_filter = ["NOUN", "ADJ"] if use_pos else False

    def __call__(self, doc):
        """Return the lemmas of ``doc`` that pass the length and POS filters.

        Parameters
        ----------
        doc : str
            Raw text to analyze with the spaCy pipeline.

        Returns
        -------
        list of str
            Lemmas in document order.
        """
        lemmas = []
        for token in self.spacynlp(doc):
            lemma = token.lemma_
            # Drop empty lemmas and single non-alphanumeric characters
            # (e.g. lone punctuation). Multi-character lemmas are always kept,
            # even if they are not alphanumeric.
            if len(lemma) <= 1 and not lemma.isalnum():
                continue
            # The POS restriction applies only when a filter is configured.
            if self.pos_filter and token.pos_ not in self.pos_filter:
                continue
            lemmas.append(lemma)
        return lemmas

In [25]:
# English stop words from NLTK plus single-character punctuation tokens.
# The corpus reader is read through the `nltk_stopwords` alias because this
# cell rebinds `stopwords` to a plain list: calling `stopwords.words(...)` here
# would raise AttributeError the second time the cell is executed.
stopwords = nltk_stopwords.words("english") + [",", "-", ":", ";", "!", "?", "'", '"']

In [29]:
# Fit TF-IDF on the extracted text. Tokens are spaCy lemmas restricted to nouns
# and adjectives (use_pos=True); the custom stop-word list is passed as-is.
vect = TfidfVectorizer(tokenizer=LemmaTokenizer(use_pos=True), stop_words=stopwords)
vect.fit(sents)
# vocabulary_ maps each learned term to its integer feature index.
print(vect.vocabulary_)

{'intelligent': 259, 'digital': 154, 'workforce': 575, 'secure': 459, 'cyberattack': 129, 'focus': 205, 'area': 36, 'risk': 442, 'control': 111, 'direct': 155, 'impact': 235, 'attack': 45, 'hard': 225, 'colossal': 92, 'bank': 61, 'insurance': 255, 'company': 94, 'punitive': 410, 'fine': 203, 'regulatorypliance': 424, 'operation': 346, 'consumer': 108, 'trust': 546, 'high': 227, 'cost': 114, 'security': 460, 'incident': 240, 'adoption': 11, 'new': 332, 'technology': 519, 'general': 214, 'loss': 300, 'confidence': 99, 'key': 272, 'senior': 462, 'stakeholder': 488, 'potential': 382, 'related': 425, 'cyber': 128, 'chief': 85, 'officer': 341, 'head': 226, 'team': 517, 'appropriate': 35, 'governance': 218, 'framework': 206, 'assessment': 41, 'automation': 52, 'value': 560, 'case': 77, 'definition': 140, 'lifecycle': 290, 'nature': 328, 'vulnerability': 568, 'standard': 489, 'software': 477, 'implementation': 237, 'different': 152, 'approach': 34, 'paper': 359, 'financial': 201, 'service': 46