In [1]:
from docx import Document
import os
from pprint import pprint

ModuleNotFoundError: No module named 'exceptions'

In [5]:
# Folder holding the source .docx product documents (relative to the notebook's working directory).
doc_dir_path = 'docs/'
# Last expression of the cell: display the directory listing.
os.listdir(doc_dir_path)

['credit card.docx',
 'Current Account.docx',
 'loans.docx',
 'Payroll.docx',
 'Saving account.docx']

In [11]:
import re


def clean_text(text):
    """
    Collapses runs of blank lines down to a single blank line.

    Any stretch of three or more newlines (optionally separated by other
    whitespace) is replaced by exactly two newlines.

    :param text: The raw text to normalise.
    :return: The text with excess blank lines removed.
    """
    blank_line_run = re.compile(r'\n\s*\n\s*\n+')
    return blank_line_run.sub('\n\n', text)


def word_wrap(text, width=80):
    """
    Wraps the input text to fit within a specified width.

    The text is split on any whitespace (so existing newlines are not
    preserved) and words are greedily packed onto lines. Words are never
    broken: a single word longer than ``width`` is placed on a line of its own.

    :param text: The text to be wrapped.
    :param width: The maximum width of each line. Default is 80 characters.
    :return: The wrapped text, lines joined with '\\n'. Empty string for
        empty or whitespace-only input.
    """
    wrapped_text = []
    current_line = []
    # Length of ' '.join(current_line), tracked incrementally instead of
    # being re-summed for every word.
    current_len = 0

    for word in text.split():
        # Only break when there is something to flush; otherwise an over-long
        # first word would emit a spurious empty line before it.
        if current_line and current_len + 1 + len(word) > width:
            wrapped_text.append(' '.join(current_line))
            current_line = [word]
            current_len = len(word)
        else:
            # +1 accounts for the joining space when the line is non-empty.
            current_len += len(word) + (1 if current_line else 0)
            current_line.append(word)

    # Add the last line
    if current_line:
        wrapped_text.append(' '.join(current_line))

    return '\n'.join(wrapped_text)

In [7]:
def read_docs(doc_dir_path):
    """
    Reads every Word document in a directory into a list of records.

    Only regular files with a ``.docx`` extension are parsed. Sub-directories,
    other file types and Word's ``~$`` lock files are skipped instead of being
    appended as empty records or handed to ``Document``.

    :param doc_dir_path: Directory containing the .docx files.
    :return: A list of dicts, one per document, with keys ``'id'`` (the file
        name without extension, title-cased) and ``'content'`` (paragraph
        texts joined by newlines and passed through ``clean_text``).
    """
    docs = []
    # os.listdir returns names in arbitrary order; sort case-insensitively so
    # the document order (and everything derived from it) is reproducible.
    for filename in sorted(os.listdir(doc_dir_path), key=str.lower):
        filepath = os.path.join(doc_dir_path, filename)
        # splitext only strips the trailing extension, unlike
        # str.replace('.docx', '') which removes the substring anywhere.
        stem, ext = os.path.splitext(filename)

        is_word_doc = os.path.isfile(filepath) and ext.lower() == '.docx'
        # '~$name.docx' files are lock files Word leaves next to open
        # documents; they are not readable documents.
        if not is_word_doc or filename.startswith('~$'):
            continue

        doc = Document(filepath)
        content = '\n'.join(para.text for para in doc.paragraphs)

        docs.append({'id': stem.title(),
                     'content': clean_text(content)})
    return docs

In [8]:
# Parse the documents under doc_dir_path into a list of {'id', 'content'} records.
res_docs = read_docs(doc_dir_path)

In [9]:
# Spot-check: show the cleaned text of the first parsed document.
print(res_docs[0]['content'])


Product Type: Credit Card
Product Name: Classic Credit Card
Product Description: The Classic Credit Card offers a range of benefits including the ability to make both local and international transactions, with a 100% credit limit available for cash withdrawals. Cardholders can enjoy a grace period of up to 56 days and access to supplementary cards. The card includes features such as contactless payment, installment options for purchases, and international usage after two months of issuance. Fees include issuance and renewal charges of EGP 250 each, with supplementary cards costing EGP 100. Interest rates are 4% per month, and penalties for delays or exceeding credit limits are EGP 75. Additional charges apply for cash withdrawals and transactions outside Egypt. The card also provides access to discounts and promotions and allows online and contactless purchases.

Product Type: Credit Card
Product Name: Platinum Visa - Master Credit Card
Product Description: The Platinum Visa - Master 

In [10]:
# Merge all documents into one corpus string, separating documents with a blank line.
all_content = '\n\n'.join(record['content'] for record in res_docs)

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

In [12]:
# First-pass splitter configured with an explicit separator list that runs from
# paragraph breaks down to single characters, chunk_size=1000 and chunk_overlap=100.
# NOTE(review): sizes are presumably measured in characters (the splitter's default
# length function) — confirm against the LangChain docs.
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", ";", " ", ""],
    chunk_size=1000,
    chunk_overlap=100
)

In [13]:
# Run the first-pass split over the merged corpus.
character_split_texts = character_splitter.split_text(all_content)

In [14]:
# Spot-check one chunk from the first-pass split and report how many chunks it produced.
print(character_split_texts[5])
print(f"\nTotal chunks: {len(character_split_texts)}")

Product Type: Credit Card
Product Name: Asatha MasterCard

Total chunks: 37


In [15]:
# Second-pass splitter: at most 256 tokens per chunk, no overlap between chunks.
# NOTE(review): presumably sized to stay within the embedding model's input limit — confirm.
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=0,
    tokens_per_chunk=256
)



In [16]:
# Re-split every first-pass chunk with the token splitter and flatten the
# per-chunk results into a single list, preserving order.
token_split_texts = [
    piece
    for chunk in character_split_texts
    for piece in token_splitter.split_text(chunk)
]

In [17]:
# Spot-check one chunk after the token-level split and report the final chunk count.
print(token_split_texts[1])
print(f"\nTotal chunks: {len(token_split_texts)}")

product type : credit card product name : platinum visa - master credit card product description : the platinum visa - master credit card offers an array of benefits including internet banking, purchase protection, and access to vip lounges in over 25 airports worldwide. cardholders can earn 2 reward points for every egp spent domestically, which can be redeemed for electronic vouchers or cashback. the card also provides an extended warranty period for purchases, and various discounts, such as an 11 % discount on gettransfer. com and 20 % off careem rides using a mastercard. the card has international usage available after 2 - 6 months of issuance, with limits for online and contactless purchases both inside and outside egypt. fees include egp 500 for issuance and renewal, and a 4 % monthly interest rate, with penalties for late payments and exceeding credit limits. installment services are available with varying interest rates, and early repayment fees apply.

Total chunks: 37


In [18]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Default-constructed embedding function: no model name is passed, so the
# library's default model is used.
embedding_function = SentenceTransformerEmbeddingFunction()

In [19]:
# Sanity check: embed a single chunk (passed as a one-element list) and print the raw vector.
print(embedding_function([token_split_texts[10]]))

[[0.014443992637097836, 0.02853766828775406, -0.00611827801913023, 0.0033783791586756706, -0.047200240194797516, -0.015899082645773888, 0.06143093481659889, 0.03314751386642456, -0.030062370002269745, -0.012941266410052776, -0.019773082807660103, 0.02250715158879757, -0.06029749661684036, 0.011913609690964222, 0.026892902329564095, 0.0031092066783457994, 0.06084461882710457, -0.05087249353528023, -0.006364523433148861, 0.037964966148138046, 0.09198709577322006, -0.10366902500391006, -0.016510073095560074, 0.027267219498753548, -0.021844498813152313, -0.013097005896270275, -0.011068581603467464, 0.04404715448617935, 0.0055885217152535915, -0.03723466768860817, 0.05463098734617233, 0.037858761847019196, 0.10777626931667328, 0.014495913870632648, -0.1112748309969902, -0.034084636718034744, -0.06673093885183334, -0.002038982231169939, -0.02376371994614601, -0.05633348226547241, 0.018502887338399887, 0.04083627089858055, 0.03255241736769676, -0.028331853449344635, 0.04529084265232086, -0.01

In [20]:
# NOTE(review): chromadb.Client() is presumably the default, non-persistent client
# (the notebook pickles the chunks separately and re-adds them in later sessions) — confirm.
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection(
    name="banking-products-chroma",
    embedding_function=embedding_function
)

# Positional string ids ("0", "1", ...), one per chunk.
new_ids = [str(id_) for id_ in range(len(token_split_texts))]

# upsert instead of add: get_or_create_collection returns the existing
# collection when this cell is re-run, and re-adding the same ids is rejected
# or ignored depending on the chromadb version. upsert makes the cell idempotent.
chroma_collection.upsert(
    ids=new_ids,
    documents=token_split_texts
)
# Display the number of stored chunks as the cell output.
chroma_collection.count()

37

In [21]:
import pickle

# Persist the chunk ids and texts (not the embeddings) so a later session can
# rebuild the collection. Assumes 'token_split_texts' contains the documents
# and 'new_ids' contains the IDs, as defined by the indexing cell.

# open(..., 'wb') does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs('chroma_data', exist_ok=True)

# Save the data to disk
with open('chroma_data/collection_data.pkl', 'wb') as file:
    pickle.dump({
        'ids': new_ids,
        'documents': token_split_texts
    }, file)

In [7]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle


def load_chroma_collection(data_path='chroma_data/collection_data.pkl',
                           collection_name="banking-products-chroma"):
    """
    Rebuilds the Chroma collection from the pickled ids and documents.

    :param data_path: Pickle file containing a dict with keys ``'ids'`` and
        ``'documents'``. Defaults to the path written by the save cell.
    :param collection_name: Name of the collection to get or create.
    :return: The Chroma collection populated with the pickled documents.
    """
    chroma_client = chromadb.Client()
    embedding_function = SentenceTransformerEmbeddingFunction()

    chroma_collection = chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function
    )

    # NOTE(security): pickle.load can execute arbitrary code from the file.
    # Only load pickles produced by this notebook or another trusted source.
    with open(data_path, 'rb') as file:
        data = pickle.load(file)

    # upsert instead of add so calling this function more than once in the
    # same kernel does not try to insert ids that are already present.
    chroma_collection.upsert(
        ids=data['ids'],
        documents=data['documents']
    )
    return chroma_collection

In [8]:
# Rebuild the collection from the pickled chunks written to chroma_data/.
chroma_collection = load_chroma_collection()

In [9]:
# Alternative query kept for reference:
# query = '''
# What are the loan options available, and what are their interest rates?
# '''
# Active query used by the retrieval and re-ranking cells that follow.
query = '''
What are the loan for machinery and equipment?
'''

In [None]:

# Fetch the 10 closest chunks for the query; [0] picks the documents for the
# single query text that was passed in.
# NOTE: `res_docs` is also the name used earlier in this file for the
# read_docs() output; here it is a list of chunk strings.
res_docs = chroma_collection.query(
    query_texts=[query],
    n_results=10
)['documents'][0]  # type: ignore

In [12]:
# Print each retrieved chunk wrapped to the default width, with blank lines between chunks.
for res in res_docs:
    print(word_wrap(res))
    print('\n')

product type : loans product name : equipment financing product product
description : the equipment financing product is a loan designed for financing
the purchase, replacement, or refurbishment of machinery and equipment for
factories. it supports both new and used machinery and offers medium to long -
term financing with a maximum term of 5 years, including up to 9 months of grace
period. the loan covers up to 70 % of the value of new machinery and 60 % for
used machinery. required documentation includes certified financial statements (
3 years, with a minimum of 2 years ), recent tax and insurance status, a
financial and technical study from an accredited consultant, quotes for the
machinery, and various business licenses and registrations.


product type : loans product name : medical equipment financing product product
description : the medical equipment financing product is a loan designed to
finance medical equipment and devices for laboratories, radiological centers,
private ho

In [13]:
from sentence_transformers import CrossEncoder
import numpy as np
import warnings
# Silence all Python warnings from here on (this affects every later cell in the session).
warnings.filterwarnings('ignore')
# Cross-encoder used below to score (query, document) pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [14]:
# Pair the query with every retrieved chunk and score each pair with the cross-encoder.
pairs = [[query, doc] for doc in res_docs]
scores = cross_encoder.predict(pairs)
# Display the raw scores (one per retrieved chunk, in retrieval order).
scores

array([ 8.621017 ,  5.628974 , -4.83948  , -4.7934055,  4.427843 ,
       -9.209896 , -5.6856685, -9.222131 , -0.5042026, -6.449566 ],
      dtype=float32)

In [15]:
# Indices into res_docs ordered from highest to lowest cross-encoder score
# (argsort is ascending, so the result is reversed).
new_sorted_indexes = np.argsort(scores)[::-1]
new_sorted_indexes

array([0, 1, 4, 8, 3, 2, 6, 9, 5, 7], dtype=int64)

In [16]:
# Reorder the retrieved chunks by descending score; converting to a NumPy array
# allows indexing with the index array.
new_sorted_docs = np.array(res_docs)[new_sorted_indexes]
new_sorted_docs

array(['product type : loans product name : equipment financing product product description : the equipment financing product is a loan designed for financing the purchase, replacement, or refurbishment of machinery and equipment for factories. it supports both new and used machinery and offers medium to long - term financing with a maximum term of 5 years, including up to 9 months of grace period. the loan covers up to 70 % of the value of new machinery and 60 % for used machinery. required documentation includes certified financial statements ( 3 years, with a minimum of 2 years ), recent tax and insurance status, a financial and technical study from an accredited consultant, quotes for the machinery, and various business licenses and registrations.',
       'product type : loans product name : medical equipment financing product product description : the medical equipment financing product is a loan designed to finance medical equipment and devices for laboratories, radiological cen

In [17]:
# Print the re-ranked chunks, best score first, wrapped to the default width.
for doc in new_sorted_docs:
    print(word_wrap(doc))
    print('\n')

product type : loans product name : equipment financing product product
description : the equipment financing product is a loan designed for financing
the purchase, replacement, or refurbishment of machinery and equipment for
factories. it supports both new and used machinery and offers medium to long -
term financing with a maximum term of 5 years, including up to 9 months of grace
period. the loan covers up to 70 % of the value of new machinery and 60 % for
used machinery. required documentation includes certified financial statements (
3 years, with a minimum of 2 years ), recent tax and insurance status, a
financial and technical study from an accredited consultant, quotes for the
machinery, and various business licenses and registrations.


product type : loans product name : medical equipment financing product product
description : the medical equipment financing product is a loan designed to
finance medical equipment and devices for laboratories, radiological centers,
private ho

In [1]:
from sentence_transformers import CrossEncoder
import numpy as np
import warnings
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle

warnings.filterwarnings('ignore')


class InfoRetrial:
    """
    Two-stage retriever over the pickled document chunks.

    Stage 1 runs a vector search against a Chroma collection rebuilt from the
    pickle file; stage 2 re-orders the hits with a cross-encoder. The
    collection and the cross-encoder are loaded lazily on the first
    ``query()`` call unless ``load_chroma_collection()`` is called explicitly.
    """

    def __init__(self, chroma_data_path: str = 'chroma_data/collection_data.pkl',
                 collection_name: str = "banking-products-chroma",
                 cross_encoder_name: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2') -> None:
        """
        :param chroma_data_path: Pickle file holding a dict with keys
            ``'ids'`` and ``'documents'``.
        :param collection_name: Name of the Chroma collection to get or create.
        :param cross_encoder_name: Model name passed to ``CrossEncoder``.
        """
        self.is_collection_loaded = False
        self.chroma_collection: chromadb.Collection | None = None
        self.cross_encoder = None
        self.chroma_data_path = chroma_data_path
        self.collection_name = collection_name
        self.cross_encoder_name = cross_encoder_name

    def load_chroma_collection(self):
        """
        Builds the Chroma collection from the pickle file and loads the
        cross-encoder, then marks the retriever as loaded.
        """
        chroma_client = chromadb.Client()
        embedding_function = SentenceTransformerEmbeddingFunction()

        chroma_collection = chroma_client.get_or_create_collection(
            name=self.collection_name,
            embedding_function=embedding_function
        )

        # NOTE(security): pickle.load can execute arbitrary code from the file.
        # Only point chroma_data_path at pickles from a trusted source.
        with open(self.chroma_data_path, 'rb') as file:
            data = pickle.load(file)

        # upsert instead of add so that calling this method again (or after
        # another cell already filled the same collection) does not try to
        # insert ids that are already present.
        chroma_collection.upsert(
            ids=data['ids'],
            documents=data['documents']
        )

        self.chroma_collection = chroma_collection
        self.cross_encoder = CrossEncoder(self.cross_encoder_name)

        self.is_collection_loaded = True

    def query(self, query_texts: str, n_results: int = 10):
        """
        Retrieves and re-ranks chunks for a single query string.

        :param query_texts: The question to search for (one string).
        :param n_results: How many chunks to fetch from the vector search.
        :return: NumPy array of chunk strings ordered from highest to lowest
            cross-encoder score; an empty array when nothing was retrieved.
        """
        if not self.is_collection_loaded:
            self.load_chroma_collection()

        res_docs = self.chroma_collection.query(  # type: ignore
            query_texts=[query_texts],
            n_results=n_results
        )['documents'][0]

        # Nothing to re-rank; avoid calling the cross-encoder on an empty batch.
        if not res_docs:
            return np.array([], dtype=str)

        # Score against the query that was passed in, not against whatever a
        # global named `query` happens to hold in the notebook.
        pairs = [[query_texts, doc] for doc in res_docs]
        scores = self.cross_encoder.predict(pairs)   # type: ignore

        # argsort is ascending; reverse it to get best-first order.
        new_sorted_indexes = np.argsort(scores)[::-1]
        new_sorted_docs = np.array(res_docs)[new_sorted_indexes]

        return new_sorted_docs

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Create the retriever with its default pickle path and load it eagerly.
# The explicit load is optional: query() calls load_chroma_collection() itself
# when the collection has not been loaded yet.
info = InfoRetrial()
info.load_chroma_collection()

In [5]:
# Free-text question passed to info.query() in the next cell.
query = '''
What are the loan most interesting?
'''

In [6]:
# Retrieve and re-rank chunks for the query; the returned array is the cell output.
info.query(query)

array(['product type : loans product name : mashroui mashrouak product description : mashroui mashrouak is a loan product designed to support existing projects with sales between egp 1 million and egp 50 million, focusing on development, expansion, replacement, and renewal. it provides financing for various enterprises, including industrial, commercial, services, and environmentally - friendly projects, such as clean energy initiatives. the loan features a 5 % simple diminishing interest rate for industrial, services, and professions, with a competitive rate for commercial activities. loan amounts range from egp 250, 000 to egp 8 million, with terms between 1 to 5 years, including a grace period tailored to the project. required documents include a recent transcript from the commercial registry, tax card, activity license, taxation and insurance status, property documents, company budgets, and a feasibility study for new projects.',
       'product type : loans product name : school fi