<a href="https://colab.research.google.com/github/ELehmann91/NLP-Contract-Analysis/blob/master/prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install

In [0]:
%%capture
!sudo apt install poppler-utils
!pip install pdf2image
!sudo apt install tesseract-ocr
!pip install pytesseract
!pip install spacy
!pip install spacy-langdetect
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [0]:
import os
import json
import re
import pytesseract
from PIL import Image
import pickle
import tempfile
from pdf2image import convert_from_path
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import pandas as pd
import spacy
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")
from spacy.matcher import PhraseMatcher
from spacy_langdetect import LanguageDetector


# Define functions (insert yours)


In [0]:
def multi_corpus(text):
  '''
  Detect the language of ``text`` and parse it with the matching spaCy model.

  A separate English pipeline with a ``LanguageDetector`` component appended
  is used only to detect the document-level language. The text is then
  re-parsed with the module-level ``nlp_en`` (English) or ``nlp_de`` (German)
  model.

  Args:
    text: raw document text.

  Returns:
    The spaCy doc produced by the language-specific model. If the detected
    language is neither 'en' nor 'de', a notice is printed and the doc
    produced by the detection pipeline is returned instead.
  '''
  # The detection pipeline is rebuilt on every call; this is slow but keeps
  # the shared nlp_en model free of the extra language_detector component.
  nlp = spacy.load("en_core_web_sm")
  nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
  doc = nlp(text)

  # Document-level detection, i.e. the "average" language of the whole text.
  language = doc._.language['language']
  if language == 'en':
    return nlp_en(text)
  if language == 'de':
    # German text must go through the German model (previously nlp_en was
    # used here, so German documents got English tags).
    return nlp_de(text)

  print('not found', language)
  return doc


In [0]:
def find_ident(doc,ident_list,window_w=150):
  '''
  Collect the tokens that surround every identifier occurrence in ``doc``.

  The document is scanned token by token. A hit is either a single token whose
  text is in ``ident_list`` or two consecutive tokens whose texts, joined by a
  single space, are in ``ident_list``. For each hit a window is appended that
  starts one token before the identifier and ends at index ``i + window_w``
  (exclusive), where ``i`` is the index of the identifier's last token.

  Args:
    doc: sequence of tokens exposing ``.text`` that supports slicing
      (e.g. a spaCy doc).
    ident_list: collection of identifier strings (one or two words).
    window_w: number of tokens to include from the matching token onwards.

  Returns:
    Flat list of tokens from all windows, in document order. Overlapping
    windows contribute their tokens more than once. Empty if nothing matches.
  '''
  windows = []
  for i, token in enumerate(doc):
    # Single-word identifier. The start is clamped to 0 so that a hit on the
    # very first token does not produce a negative (wrapping) slice index.
    if token.text in ident_list:
      windows.extend(doc[max(i-1, 0):i+window_w])
    # Two-word identifier. Skipped at i == 0 because doc[-1] would pair the
    # first token with the last token of the document.
    if i > 0 and doc[i-1].text + ' ' + token.text in ident_list:
      windows.extend(doc[max(i-2, 0):i+window_w])
  return windows

In [0]:
def find_number(doc,length=8,min=60,max=200):
  '''
  Return the text of the first numeric token whose value lies in (min, max).

  A token qualifies when all of the following hold:
    * its ``tag_`` is 'CD' or 'CARD',
    * its text is shorter than ``length`` characters,
    * its text, with every character other than digits, '.' and ',' replaced
      by a space and ',' then replaced by '.', parses as a float strictly
      between ``min`` and ``max``.

  Args:
    doc: iterable of tokens exposing ``.text`` and ``.tag_``.
    length: exclusive upper bound on the token text length.
    min: exclusive lower bound on the value (name shadows the builtin; kept
      for backward compatibility with keyword callers).
    max: exclusive upper bound on the value (same remark as ``min``).

  Returns:
    The original text of the first qualifying token, or None if there is none.
  '''
  for tok in doc:
    if tok.tag_ not in ('CD', 'CARD'):
      continue
    if len(tok.text) >= length:
      continue
    try:
      value = float(re.sub('[^0-9.,]', ' ',tok.text).replace(',','.'))
    except ValueError:
      # Not a parsable number (e.g. "abc" or "1.000,5"): skip this token
      # instead of reusing the result from a previous iteration.
      continue
    if min < value < max:
      return tok.text
  return None

In [0]:
def find_issue_price(text):
  '''
  Extract the issue price from raw document text.

  The text is parsed with multi_corpus, the tokens around the issue-price
  keywords are gathered with find_ident, and the first number accepted by
  find_number within those tokens is returned.

  Args:
    text: raw document text.

  Returns:
    Whatever find_number returns for the gathered tokens.
  '''
  keywords = ["Ausgabepreis", "Issue Price", "Emissionskurs"]
  context_tokens = find_ident(multi_corpus(text), keywords, window_w=150)
  return find_number(context_tokens, length=8, min=60, max=200)

In [0]:
def OCR_pdf(path_,file_,no_pages=40):
    '''
    OCR a PDF and cache the recognised text next to the other results.

    The PDF at ``file_`` is rendered to page images, every page is saved as a
    JPEG under ``path_ + file_ + '/pdf2img'`` and run through Tesseract. The
    page texts (each followed by a single space) are concatenated and written
    to ``path_ + file_ + '/extracted_text.txt'``. If that text file already
    exists, nothing is done.

    Args:
      path_: output root; used as a plain string prefix, so it is expected to
        end with a path separator.
      file_: path of the PDF to read; also used as the name of the
        per-document output folder below ``path_``.
      no_pages: last page of the PDF to convert.

    Returns:
      None. The result is the text file written to disk.
    '''
    filename = file_
    out_dir = path_ + file_
    text_path = out_dir + '/extracted_text.txt'

    # The extracted text acts as a cache: skip the expensive OCR if present.
    if os.path.exists(text_path):
        return

    img_dir = out_dir + '/pdf2img'
    os.makedirs(img_dir, exist_ok=True)

    # NOTE(review): the page index is appended after the '.jpg' suffix, so the
    # files are named like 'name.jpg0'. Kept as is so existing output folders
    # stay consistent.
    base_filename = os.path.splitext(os.path.basename(filename))[0] + '.jpg'

    text = ''
    # All work on the converted pages happens inside the context manager: the
    # page images are backed by files in the temporary directory, which is
    # deleted as soon as the block is left.
    with tempfile.TemporaryDirectory() as path:
        # NOTE(review): pdf2image page numbers are 1-based; first_page=0
        # presumably falls back to the first page - confirm.
        images_from_path = convert_from_path(filename, output_folder=path, last_page=no_pages, first_page=0)
        for i, page in enumerate(images_from_path):
            img_path = os.path.join(img_dir, base_filename+str(i))
            page.save(img_path, 'JPEG')
            # OCR runs on the saved JPEG (as before), not on the in-memory page.
            with Image.open(img_path) as saved_page:
                text += pytesseract.image_to_string(saved_page) + ' '

    with open(text_path, 'w') as text_file:
        text_file.write(text)


In [0]:
def propro(sen):
  '''
  Normalise raw text for the classification models.

  Steps, in order: lower-case the text, turn newlines into spaces, spell out
  '%' as 'prozent', then collapse every run of characters that are not
  ASCII letters or German umlauts/sharp s into a single space.

  Args:
    sen: raw text.

  Returns:
    The normalised string.
  '''
  normalised = sen.lower()
  for old, new in (('\n', ' '), ('%', 'prozent')):
    normalised = normalised.replace(old, new)
  return re.sub('[^A-Za-zÄÖÜßäöüß]+', ' ', normalised)

In [0]:
# Maps every output field to the way its value is obtained:
#   'classification' - load_and_process() loads '<field>_model.sav' and predicts
#                      the value from the preprocessed text;
#   'extraction'     - the value has to come from a dedicated extraction
#                      function; fields without one are written as 'none'.
# The numeric prefixes are not contiguous (016 and 019 are absent).
fields_dict = { '001_type':'classification',
                '002_issuer_name':'extraction',
                '003_product_subclass':'classification',
                '004_identifiers':'extraction',
                '004_isin':'extraction',
                '004_wkn':'extraction',
                '005_currency':'classification',
                '006_issue_date':'extraction',
                '007_issue_price':'extraction',
                '008_maturity_date':'extraction',
                '009_nominal_amount':'extraction',
                '010_redemption_formula':'extraction',
                '011_seniority':'classification',
                '012_business_day_convention':'classification',
                '013_day_count_convention':'classification',
                '014_business_day_calendar':'classification',
                '015_payment_frequency':'classification',
                '017_first_payment_date':'extraction',
                '018_payment_dates':'extraction',
                '020_paragraph_489':'classification',
                '021_formula':'extraction',
                '022_legislation':'classification',
                '023_amendment':'classification'}

In [0]:
def load_and_process():
  '''
  Upload one PDF, OCR it, fill all fields and store them as JSON.

  Steps:
    1. Ask for a file through the Colab upload dialog; the first uploaded
       file name is used.
    2. Create a per-document folder below the prediction output path.
    3. OCR the PDF with OCR_pdf (skipped there if the text already exists).
    4. Read the extracted text and preprocess it with propro.
    5. For every 'classification' field in fields_dict, load the pickled
       model '<field>_model.sav' and predict the value.
    6. Fill '007_issue_price' with find_issue_price on the raw text.
    7. Set every remaining field to 'none', print the result and write it to
       'values.json' in the per-document folder.

  Returns:
    None. If no file was uploaded, a message is printed and nothing else
    happens.
  '''
  # Imported locally so the function does not rely on a later notebook cell
  # having been executed first.
  from google.colab import files

  # path where results are stored
  path_ = '/content/gdrive/Shared drives/FS-AI Base Data/predict/'
  # path where the pickled classification models live
  save_path = '/content/gdrive/Shared drives/FS-AI Base Data/models/'

  # upload from local drive
  uploaded = files.upload()
  if not uploaded:
    # Upload dialog was cancelled or returned nothing.
    print('No file uploaded')
    return
  file_ = str(list(uploaded.keys())[0])
  doc_dir = path_ + file_

  # create folder in output path
  try:
    os.mkdir(doc_dir)
  except FileExistsError:
    print('Oo')

  # create text from pdf
  OCR_pdf(path_,file_)

  # read and preprocess text
  with open(doc_dir+'/extracted_text.txt',"r") as file1:
    text = ' '.join(file1.readlines())
  text_pre = propro(text)
  print('text length',len(text_pre))

  pdf_extract = {}
  # load models and predict classification tasks
  for field, task in fields_dict.items():
    if task == 'classification':
      # NOTE: pickle executes arbitrary code on load; only load model files
      # from a trusted location.
      with open(save_path + field + '_model.sav', 'rb') as model_file:
        loaded_model = pickle.load(model_file)
      pdf_extract[field] = loaded_model.predict([text_pre])[0]

  ############################################ include your function here
  pdf_extract['007_issue_price'] = find_issue_price(text)

  # every field without a value is stored as 'none'
  for name in fields_dict.keys():
    if name not in pdf_extract:
      pdf_extract[name] = 'none'

  # store dictionary with values
  print(pdf_extract)
  with open(doc_dir+'/values.json', 'w') as fp:
    json.dump(pdf_extract, fp)

  print('Done')



# Execution

In [10]:
# Colab-only helpers: `drive` mounts Google Drive, `files` provides the upload
# dialog used by load_and_process().
from google.colab import drive
from google.colab import files

# Mount Google Drive so that the '/content/gdrive/...' paths used for results
# and models are available.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [63]:
# Run the whole pipeline: upload a PDF, OCR it, predict/extract the fields and
# write values.json.
# NOTE(review): the original hint here was "try it twice" - the reason is not
# visible in this notebook; confirm whether the first run can fail.
load_and_process()

Saving Final_Terms_BASF-2018-2025.pdf to Final_Terms_BASF-2018-2025 (5).pdf
Oo
text length 115379
{'001_type': 'Floating', '003_product_subclass': 'Bond', '005_currency': 'EUR', '011_seniority': '', '012_business_day_convention': 'no_class', '013_day_count_convention': 'act/act', '014_business_day_calendar': 'TARGET2', '015_payment_frequency': '1', '020_paragraph_489': 'no_class', '022_legislation': 'no_class', '023_amendment': 'no_class', '007_issue_price': '99.568', '002_issuer_name': 'none', '004_identifiers': 'none', '004_isin': 'none', '004_wkn': 'none', '006_issue_date': 'none', '008_maturity_date': 'none', '009_nominal_amount': 'none', '010_redemption_formula': 'none', '017_first_payment_date': 'none', '018_payment_dates': 'none', '021_formula': 'none'}
Done
