# Import Libraries

In [12]:
import os
import re
import torch
import pdfplumber
import PyPDF2

import warnings
warnings.filterwarnings('ignore')

from itertools import chain

from transformers import BertTokenizer
from transformers import BertModel
from pinecone import Pinecone

# Load Data

In [2]:
# Switch the working directory to the parent folder, then build a
# forward-slash path to the project-plan PDF inside its Dataset folder.
os.chdir('..')
curr_dir = '/'.join(os.getcwd().split('\\'))
file_path = f'{curr_dir}/Dataset/Project plan outline.pdf'

In [3]:
def read_pdf(document):
    """Extract the text of every page of a PDF.

    Args:
        document: Path (or file-like object) handed to ``pdfplumber.open``.

    Returns:
        A list with one string per page, in page order, where every newline
        in the extracted text has been replaced by a single space. A page
        from which no text could be extracted contributes an empty string,
        so the list length always equals the page count.
    """
    with pdfplumber.open(document) as pdf:
        # extract_text() may yield None for a page without a text layer
        # (e.g. a scanned image) depending on the pdfplumber version; fall
        # back to '' instead of crashing on None.replace(...).
        text = [(page.extract_text() or '').replace('\n', ' ')
                for page in pdf.pages]

    return text
        
# A sentence boundary is a whitespace character that directly follows '.' or
# '?', unless the preceding text looks like a dotted abbreviation ("e.g.",
# "i.e.") or a capitalised two-letter abbreviation ("Mr.", "Dr.").
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')


def pdf_to_chunks(text, word_limit, overlap):
    """Split page texts into overlapping chunks of at most ``word_limit`` words.

    Pages are split into sentences, and sentences are packed greedily into
    chunks. A chunk is closed at a sentence boundary when the next sentence
    would not fit; the following chunk then starts with the last ``overlap``
    words of the closed one so that context is shared between neighbours.
    A run of words that still exceeds ``word_limit`` (for example a single
    very long sentence) is cut with a sliding window of ``word_limit`` words
    that advances by ``word_limit - overlap`` words.

    Args:
        text: Iterable of page strings. A single string is treated as one
            page; empty or ``None`` pages are ignored.
        word_limit: Maximum number of whitespace-separated words per chunk.
        overlap: Number of words repeated at the start of the next chunk.
            Must satisfy ``0 <= overlap < word_limit``.

    Returns:
        A list of non-empty chunk strings whose words are joined by single
        spaces. No chunk contains more than ``word_limit`` words.

    Raises:
        ValueError: If ``word_limit`` is not positive or ``overlap`` is
            outside ``[0, word_limit)``.
    """
    if word_limit <= 0:
        raise ValueError('word_limit must be a positive integer')
    if not 0 <= overlap < word_limit:
        raise ValueError('overlap must satisfy 0 <= overlap < word_limit')

    # Iterating a bare string would treat every character as a page.
    if isinstance(text, str):
        text = [text]

    sentences = [
        sentence.strip()
        for page in text if page
        for sentence in _SENTENCE_BOUNDARY.split(page)
    ]

    step = word_limit - overlap
    chunks = []
    current_chunk = []

    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue

        # Close the chunk at the sentence boundary when the sentence does not
        # fit. A chunk no longer than `overlap` is kept open instead: it
        # would be repeated in full by its successor, so emitting it would
        # only create a redundant chunk.
        if (len(current_chunk) + len(words) > word_limit
                and len(current_chunk) > overlap):
            chunks.append(' '.join(current_chunk))
            current_chunk = current_chunk[len(current_chunk) - overlap:]

        current_chunk.extend(words)

        # Drain any backlog so no chunk ever exceeds word_limit. Each pass
        # leaves more than `overlap` words behind, so the remainder always
        # holds words that have not been emitted yet.
        while len(current_chunk) > word_limit:
            chunks.append(' '.join(current_chunk[:word_limit]))
            current_chunk = current_chunk[step:]

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [4]:
# Pull the per-page text out of the PDF, then split it into chunks using a
# word limit of 200 and an overlap setting of 20.
text = read_pdf(file_path)
pdf_chunks = pdf_to_chunks(text, word_limit=200, overlap=20)

In [5]:
# Show the generated chunks as the cell output.
pdf_chunks

['ELEC6259 Project Preparation – Project Plan Template 1 Title Personal Loan defaulter prediction Student name: Ashish Sasanapuri Supervisor name: Jize Yan Aims/research question and Objectives Banking industry are one of the most important aspects of a nation’s economy. It has in many ways helped in contributing to the economic household of the citizens. Among those is the loan business which has been established by the banks for the consumers to run their households. The loan lending system is a risky business that has to be carried out by the banks in order to balance their revenue. However, many banks follow a rigorous procedure of analyzing and running legal background checks on the customer to lend a loan. Customers borrow loans for various reasons, be it for a business, education, or house. The loan lending process can be a part of the traditional bank industry as well as the internet finance industry. The major problem faced by banks that affect their economy is loan defaults. 

# Text Embedding

In [9]:
def embed_chunks(chunks):
    """Embed each text chunk with the pretrained 'bert-base-uncased' model.

    Every chunk is tokenized on its own (special tokens added, truncated to
    at most 512 token ids) and passed through the model as a batch of one.
    The chunk's embedding is the mean of ``last_hidden_state`` over dim 1
    (presumably the token axis -- confirm against the transformers docs).

    Note that the tokenizer and the model are loaded on every call.

    Args:
        chunks: Iterable of text strings to embed.

    Returns:
        A list of NumPy arrays, one per chunk, in input order.
    """
    model_name = 'bert-base-uncased'

    tokenizer = BertTokenizer.from_pretrained(model_name)
    encoded = []
    for chunk in chunks:
        token_ids = tokenizer.encode(chunk,
                                     add_special_tokens=True,
                                     max_length=512,
                                     truncation=True)
        encoded.append(token_ids)

    model = BertModel.from_pretrained(model_name)
    model.eval()  # inference mode

    vectors = []
    # Gradients are not needed for inference, and .numpy() requires a tensor
    # that does not track them.
    with torch.no_grad():
        for token_ids in encoded:
            batch = torch.tensor([token_ids])
            hidden = model(batch).last_hidden_state
            pooled = torch.mean(hidden, dim=1)
            vectors.append(pooled.squeeze().numpy())

    return vectors

In [10]:
# Embed every chunk, then show the resulting vectors as the cell output.
embeddings = embed_chunks(pdf_chunks)
embeddings

[array([-7.09551573e-02,  1.88406929e-01,  1.77855477e-01,  6.19408600e-02,
         4.18429047e-01, -7.52814189e-02,  2.50253137e-02,  4.19338673e-01,
        -1.77125767e-01, -1.50448620e-01,  6.55021667e-02, -1.30619183e-01,
         4.09881696e-02,  3.08581322e-01, -3.15144062e-01,  1.20168388e-01,
         5.87714553e-01,  9.46423486e-02, -6.64934143e-02,  2.87650287e-01,
         6.84934184e-02,  1.11030333e-01,  3.95917267e-01,  6.25361145e-01,
         3.76325160e-01, -1.03382371e-01, -5.76870516e-03,  2.28222370e-01,
        -1.40221253e-01, -1.87881529e-01,  6.04598939e-01,  8.96116048e-02,
        -1.09501727e-01, -3.87716778e-02,  2.47322604e-01, -5.60134165e-02,
        -1.51710764e-01, -3.35682601e-01, -1.13408603e-01,  7.09447786e-02,
        -2.25329801e-01, -3.05569887e-01, -1.62142739e-01,  9.65224653e-02,
        -3.37338805e-01, -2.22147003e-01,  1.70249134e-01,  6.17718771e-02,
         6.03728853e-02, -1.11968048e-01, -3.67611140e-01,  3.57512146e-01,
        -1.7

# Store embeddings in Pineco