# Prerequisites

## Libraries

In [13]:
import os
import pandas as pd

import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.chains import RetrievalQA
from getpass import getpass

## API Key

In [14]:
# Prefer a key already exported as the OPENAI_API_KEY environment variable so
# the notebook can run non-interactively (Restart & Run All, nbconvert).
# Otherwise fall back to a hidden prompt so the key is never echoed into the
# saved notebook output.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass('Enter OpenAI API Key:')

Enter OpenAI API Key: ········


## Data Preparation

In [10]:
# Fetch the NLTK resources this notebook relies on.
# 'stopwords' backs the stopwords.words('english') call made when the stopword
# set is built. 'punkt' and 'wordnet' are presumably for the imported
# word_tokenize / WordNetLemmatizer helpers -- TODO confirm they are still needed.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# The spaCy model "en_core_web_sm" loaded by spacy.load must be installed once
# per environment; uncomment the line below to do so.
# !python -m spacy download en_core_web_sm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# spaCy pipeline; preprocess_text calls it for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")
# English stopwords held as a set; preprocess_text tests lower-cased tokens
# for membership in it.
stopword_list = set(stopwords.words('english'))
# NOTE(review): no function visible in this notebook references `lemmatizer`
# (lemmas come from spaCy's token.lemma_) -- confirm whether it is still needed.
lemmatizer = WordNetLemmatizer()

# Main

## Helper Functions

In [11]:
def read_text_file(filepath, encoding='utf-8'):
    """Read a text file and return its entire contents as a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (for example cp1252 on Windows). Pass another encoding explicitly
            for files that were not saved as UTF-8.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(filepath, 'r', encoding=encoding) as file:
        return file.read()

In [8]:
def preprocess_text(text):
    """Normalize raw text: drop stopwords, lemmatize, and strip markup noise.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    whose lower-cased form is in ``stopword_list`` are discarded, and the
    surviving tokens are re-joined with spaces and parsed a second time so
    that lemmas are taken from the stopword-free text. Each lemma then has
    ``#`` and ``*`` characters removed, followed by any run of two or more
    hyphens. Lemmas left empty or whitespace-only are dropped.

    Args:
        text: Raw document text.

    Returns:
        The cleaned lemmas joined by single spaces.
    """
    # First pass: tokenize and filter out stopwords (case-insensitive).
    kept_words = [tok.text for tok in nlp(text) if tok.text.lower() not in stopword_list]

    # Second pass: lemmatize the stopword-free text.
    lemmas = (tok.lemma_ for tok in nlp(' '.join(kept_words)))

    # Remove '#'/'*' first, then hyphen runs, matching the original order.
    scrubbed = (re.sub(r'[-]{2,}', '', re.sub(r'[#*]', '', lemma)) for lemma in lemmas)

    return ' '.join(piece for piece in scrubbed if piece.strip())

In [12]:
def recursive_character_splitter(text, chunk_size=512, chunk_overlap=64):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The string to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Length shared between consecutive chunks, measured
            the same way.

    Returns:
        The result of ``create_documents([text])`` -- the chunk documents
        produced for this single input text.
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'length_function': len,
        # Separators are treated as literal strings, not regular expressions.
        'is_separator_regex': False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])

In [30]:
def embed_store_and_model(chunks):
    """Index ``chunks`` in FAISS and wrap the index in a RetrievalQA chain.

    The chunks are embedded with ``OpenAIEmbeddings`` and stored in a FAISS
    vector store. That store's retriever is combined with an ``OpenAI`` LLM
    in a 'stuff'-type RetrievalQA chain. Both OpenAI clients are
    authenticated with the module-level ``OPENAI_API_KEY``.

    Args:
        chunks: Documents to embed and index, as produced by
            ``recursive_character_splitter``.

    Returns:
        The RetrievalQA chain built over the indexed chunks.
    """
    vector_index = FAISS.from_documents(chunks, OpenAIEmbeddings(api_key=OPENAI_API_KEY))
    return RetrievalQA.from_chain_type(
        llm=OpenAI(api_key=OPENAI_API_KEY),
        chain_type='stuff',
        retriever=vector_index.as_retriever(),
    )

In [36]:
def ask_the_bot(query, bot):
    """Send ``query`` to ``bot`` and print the answer.

    Args:
        query: The question to ask.
        bot: Any object whose ``invoke(query)`` returns a mapping with a
            ``'result'`` entry, such as the chain returned by
            ``embed_store_and_model``.

    Returns:
        None. The answer is written to standard output.
    """
    answer = bot.invoke(query)['result']
    print(answer)

## Executable

In [9]:
# Root folder that holds the project/, research/ and training/ text files.
# Set the CENTRACOMM_DATA_DIR environment variable to run the notebook on
# another machine; the default is the original author's local path.
DATA_DIR = os.environ.get(
    'CENTRACOMM_DATA_DIR',
    'C:\\Users\\Akash\\Desktop\\GenAI\\CentraComm\\CentraComm\\data',
)

# Source documents, grouped by sub-folder of DATA_DIR. Insertion order is
# preserved, so the resulting list keeps the original file order.
DATA_FILES = {
    'project': ['alex_johnson.txt', 'emily_chen.txt', 'michael_reynolds.txt'],
    'research': ['alex_johnson.txt', 'jane_smith.txt', 'john_doe.txt'],
    'training': ['tasktracker.txt', 'virtualconnection.txt'],
}

# Full path of every document to ingest.
filepath_list = [
    os.path.join(DATA_DIR, folder, filename)
    for folder, filenames in DATA_FILES.items()
    for filename in filenames
]

In [32]:
# Build one flat list of chunks across every source file, then index it.
chunk_collection = []

for filepath in filepath_list:
    # Per file: read the raw text, clean it, then split it into chunks.
    raw_text = read_text_file(filepath)
    processed_text = preprocess_text(raw_text)
    chunks = recursive_character_splitter(processed_text)
    # extend (not append) keeps chunk_collection a flat list of chunks.
    chunk_collection.extend(chunks)
    
# Embed all chunks and construct the question-answering chain over them.
qna_bot = embed_store_and_model(chunk_collection)

In [49]:
# Example query: ask_the_bot passes the question to qna_bot.invoke and prints
# the 'result' field of the response.
ask_the_bot('What are the technologies used in the company?', qna_bot)

 The company uses Apache Tika to extract text metadata from PDFs, Elasticsearch for indexing unstructured data, and a relational database for storing structured data. They also plan to incorporate NLP and RAG techniques for their chatbot development and are considering incorporating machine learning techniques for their equipment management project.
