In [1]:
import os
import faiss
import mammoth
import numpy as np
from docx import Document as DocxDocument
from langchain.docstore.document import Document
from openpyxl import load_workbook
import PyPDF2
import fitz
import xlrd
import subprocess
from langchain.embeddings.vertexai import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
import faiss
import pickle

In [2]:
def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        return file.read()


def read_pdf(file_path):
    text = ""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num in range(len(pdf_reader.pages)):
                text += pdf_reader.pages[page_num].extract_text()
    except:
        pdf_document = fitz.open(file_path)
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
    return text


def read_docx(file_path):
    doc = DocxDocument(file_path)
    return "\n".join([paragraph.text for paragraph in doc.paragraphs])


def read_docm(file_path):
    with open(file_path, 'rb') as docm_file:
        result = mammoth.extract_raw_text(docm_file)
        return result.value


def read_xls(file_path):
    workbook = xlrd.open_workbook(file_path)
    sheet = workbook.sheet_by_index(0)
    data = []
    for row_num in range(sheet.nrows):
        data.append("\t".join(map(str, sheet.row_values(row_num))))
    return "\n".join(data)


def read_xlsx(file_path):
    workbook = load_workbook(file_path)
    sheet = workbook.active
    data = []
    for row in sheet.iter_rows(values_only=True):
        data.append("\t".join(map(str, row)))
    return "\n".join(data)


def read_doc(file_path):
    result = subprocess.run(['antiword', file_path], capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception(f"Error reading {file_path}: {result.stderr}")
    return result.stdout


def load_files(directory):
    documents = []
    unread_documents = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3500, chunk_overlap=350)
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            if filename.endswith('.txt'):
                content = read_txt(file_path)
            elif filename.endswith('.pdf'):
                content = read_pdf(file_path)
            elif filename.endswith('.docx'):
                content = read_docx(file_path)
            elif filename.endswith('.doc'):
                content = read_doc(file_path)
            elif filename.endswith('.docm'):
                content = read_docm(file_path)
            elif filename.endswith('.xls'):
                content = read_xls(file_path)
            elif filename.endswith('.xlsx'):
                content = read_xlsx(file_path)
            else:
                print(f"Unsupported file format: {filename}")
                unread_documents.append(filename)
                continue
            doc = Document(page_content=content, metadata={"filename": filename})
            split_docs = text_splitter.split_documents([doc])
            documents.extend(split_docs)
        except Exception as e:
            print(f"Failed to process {filename}: {e}")
            unread_documents.append(filename)
    return documents, unread_documents



In [3]:
directory = '/home/jupyter/DarkDawn/sp_docs'

# 5208 seconds
documents, unread = load_files(directory)

Failed to process Aerial Service Wire (ASW) Process All Markets-Copy1.docx: "There is no item named 'word/NULL' in the archive"
Unsupported file format: Tekelec - fru-square-fan.mpg
Failed to process Central Office Power - Power Routine Forms - all.xlsx: Unable to read workbook: could not read strings from /home/jupyter/DarkDawn/sp_docs/Central Office Power - Power Routine Forms - all.xlsx.
This is most probably because the workbook source files contain some invalid XML.
Please see the exception for more details.
Unsupported file format: Adtran TA5000 Cobmbo VDSL 2 Tech Bulletin 61187120L1-4D.PDF


  warn(msg)


Unsupported file format: Ciena  - Champion SFP+.pptx
Unsupported file format: Calix C7 Ethernet Services Presentation.pptx
Unsupported file format: Tellabs Update - GbE Guidelines 5.12.11 Installation.pptx
Unsupported file format: dont_touch.pkl
Failed to process A2EProposedProjectActivity.xlsx: File is not a zip file
Unsupported file format: Adtran TA 5004 Hi Level Overview.pptx
Failed to process ADTRAN Internal ONTs Installation and Maintenance Guidelines.docx: "There is no item named 'word/NULL' in the archive"
Unsupported file format: E911-Translations.ppt
Unsupported file format: Tellabs 530 531 532 Database backups.pptx
Failed to process TA5000-TA5006 BBDLC RT Node Provisioning Job Aid 6-10-11 SR 5-0.1.1.doc: 'utf-8' codec can't decode byte 0xed in position 83816: invalid continuation byte
Unsupported file format: Calix - G FAST -  Activate Training 2-12-2018 - Copy.pptx
Unsupported file format: Adtran TA1124 ntwk power block diagrams 1.ppt
Unsupported file format: Adtran TA5000 

  warn("""Cannot parse header or footer so it will be ignored""")


Unsupported file format: Tellabs 1000 FP15-Transport Guidelines.pptx
Failed to process Aerial Service Wire (ASW) Process All Markets.docx: "There is no item named 'word/NULL' in the archive"
Unsupported file format: .ipynb_checkpoints
Failed to process Calix_Transitional_Device_Config_Template.xlsx: File is not a zip file
Failed to process Tellabs AFC_UMC1000_Transitional_Device.xlsx: File is not a zip file
Unsupported file format: Tekelec - replace_fans.mpg


In [13]:
pathh='documents_obj.pkl'
with open(pathh, 'wb') as f:
    pickle.dump(documents, f)

In [16]:
documents[0]

Document(page_content='Fiber + Birth Certificates Tool (DS/CO)\n\n\n\n\nRelated Documents\n\n\n\nThe fiber plus birth certificates tool will: \n\nProvide customer with written confirmation of successful delivery of service.\nProvide detailed record of successful install, including date/time stamp for internal records; date service was delivered and tested.\nValidate data throughput meets parameters of bandwidth purchased by customer.\nPrompt the customer to formally accept the service on the day of service delivery. \nIf customer does not provide acceptance, the Birth Certificate will trigger closure of the National Order MLAP job, and move order to billing\nProvide customer with their Circuit ID, and CenturyLink contact information needed for repair/service calls\nNotify Project Managers working Hosted VOIP/Managed Office orders, of completion of the transport component, allowing them to schedule the HV/MO installation\nReduce dispatches to customer premises to validate service\n\n\nP