In [1]:
import os
import shutil
import pdfplumber
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

#### Define the OneDrive paths from which the PDF files are collected before being saved locally

In [2]:
# OneDrive folders holding the exam-brief PDFs; the second is a sub-folder of the first
onedrive_path_1 = os.path.join("/Users/rennersantana", "OneDrive - TestReach", "Briefs")
onedrive_path_2 = os.path.join(onedrive_path_1, "On-demand exam briefs")

#### Path to save collected PDF files

In [3]:
# Local folder that receives the collected PDF files
save_path = os.path.join("/Users/rennersantana/Desktop", "AI - ChatBot", "PDF_Files")


#### Collect PDF files recursively from a directory

In [4]:
# Collect PDF files recursively from a directory
def collect_pdf_files(directory):
    """Return the paths of every PDF file found under ``directory``.

    The directory tree is walked recursively with ``os.walk``. The extension
    check is case-insensitive, so files named ``.PDF`` or ``.Pdf`` are
    included as well as ``.pdf``.

    Args:
        directory: Root folder to search.

    Returns:
        A list of paths (each ``os.path.join(root, file_name)``) in the order
        ``os.walk`` yields them. The list is empty when nothing matches or the
        directory does not exist (``os.walk`` then yields nothing).
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith(".pdf")
    ]


#### Copy PDF files to a specified directory

In [5]:
# Copy PDF files to a specified directory
def copy_pdf_files(pdf_files, save_path):
    """Copy each file in ``pdf_files`` into the folder ``save_path``.

    Args:
        pdf_files: Iterable of source file paths. A path that appears more
            than once is copied only once.
        save_path: Destination folder; created (with parents) if missing.

    Returns:
        None.

    Note:
        ``shutil.copy`` names the copy after the source file's base name, so
        two sources in different folders that share a file name end up as a
        single file in ``save_path`` (the later copy overwrites the earlier).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(save_path, exist_ok=True)
    # dict.fromkeys drops repeated paths while keeping first-seen order
    for pdf_file in dict.fromkeys(pdf_files):
        shutil.copy(pdf_file, save_path)


#### Here we collect the PDF files from both directories

In [6]:
# Walk each OneDrive folder in turn and gather the PDF paths found beneath it
pdf_files_1, pdf_files_2 = (
    collect_pdf_files(folder) for folder in (onedrive_path_1, onedrive_path_2)
)


#### Combine the lists of PDF

In [7]:
# Combine the lists of PDF files, dropping repeated paths. onedrive_path_2 is a
# sub-folder of onedrive_path_1 and collect_pdf_files walks recursively, so the
# second list repeats paths already present in the first.
# dict.fromkeys keeps first-seen order.
pdf_files = list(dict.fromkeys(pdf_files_1 + pdf_files_2))


#### Copy PDF files to a specified directory

In [8]:
# Write a copy of every collected PDF into the local save folder
copy_pdf_files(pdf_files=pdf_files, save_path=save_path)


#### Print the list of collected PDF files

In [9]:
# Report where the copies were written
print(f"PDF files copied to: {save_path}")
# To also list every collected file, uncomment:
# print(*pdf_files, sep="\n")


PDF files copied to: /Users/rennersantana/Desktop/AI - ChatBot/PDF_Files


#### Now we can start creating the questions for the bot and to train the model

#### Set the directory containing PDF files

In [10]:
# Folder whose PDF files are parsed for exam information
pdf_directory = os.path.join("/Users/rennersantana", "OneDrive - TestReach", "Briefs")

#### Check if the directory exists

In [11]:
# Stop here if the PDF directory is missing (or is not a directory).
# Raising halts the cell and any "Run All" while keeping the kernel and its
# state alive; calling exit() inside a notebook asks the kernel to shut down.
if not os.path.isdir(pdf_directory):
    raise FileNotFoundError(f"Directory '{pdf_directory}' does not exist.")


#### Define a function to collect specific information from PDF files

In [12]:
# Defining the function to collect specific information from PDF files

# Answer tokens are matched as whole words, so the "Y" in "You" or the
# "Allowed" inside "Not Allowed" is not mistaken for a positive answer.
_AFFIRMATIVE_RE = re.compile(r"\b(?:y|yes|allowed)\b", re.IGNORECASE)
_NEGATIVE_RE = re.compile(r"\bnot\s+allowed\b", re.IGNORECASE)


def _rest_of_line(text, marker):
    """Return ``text`` from the first ``marker`` to the end of that line, or None if absent."""
    _, found, tail = text.partition(marker)
    if not found:
        return None
    # split() also copes with a final line that has no trailing newline
    return marker + tail.split("\n", 1)[0]


def _verdict(line):
    """Return False for "not allowed", True for y/yes/allowed, None when neither occurs."""
    # The negative form is tested first because it contains the word "allowed"
    if _NEGATIVE_RE.search(line):
        return False
    if _AFFIRMATIVE_RE.search(line):
        return True
    return None


def extract_information_from_pdf(pdf_file):
    """Scan every page of a PDF for exam provisions and accepted ID types.

    Args:
        pdf_file: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A tuple ``(on_screen_calculator, on_screen_notepad, comfort_breaks,
        identification)``:

        * ``on_screen_calculator`` / ``on_screen_notepad``: "Available" when a
          page contains "On Screen Calculator: Available" /
          "On Screen Notepad: Available", otherwise "Not Available".
        * ``comfort_breaks``: "Allowed" or "Not Allowed", decided from the rest
          of the line on which "Comfort Breaks" first appears on a page. A
          line with no recognisable answer leaves the current value unchanged
          (default "Not Allowed").
        * ``identification``: dict keyed by ID type with "Yes", "No", or
          "Not Available" (the type was not found, or its line held no
          recognisable answer).

        When several pages carry an answer for the same item, the last page
        wins.
    """
    # Defaults used when the PDF says nothing about an item
    on_screen_calculator = "Not Available"
    on_screen_notepad = "Not Available"
    comfort_breaks = "Not Allowed"
    identification = {
        "Passport": "Not Available",
        "Driver’s License": "Not Available",
        "National ID card": "Not Available",
        "EU ID card": "Not Available",
        "Work ID": "Not Available"
    }
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard in case extract_text() yields None for a page without text
            text = page.extract_text() or ""
            if "On Screen Calculator: Available" in text:
                on_screen_calculator = "Available"
            if "On Screen Notepad: Available" in text:
                on_screen_notepad = "Available"
            # Judge comfort breaks from their own line only, not the whole page
            breaks_line = _rest_of_line(text, "Comfort Breaks")
            if breaks_line is not None:
                allowed = _verdict(breaks_line)
                if allowed is not None:
                    comfort_breaks = "Allowed" if allowed else "Not Allowed"
            # Each ID type is judged from the line on which it first appears
            for item in identification:
                line = _rest_of_line(text, item)
                if line is None:
                    continue
                accepted = _verdict(line)
                if accepted is not None:
                    identification[item] = "Yes" if accepted else "No"
    return on_screen_calculator, on_screen_notepad, comfort_breaks, identification


#### Initialise lists to store extracted information

In [13]:
# One independent, initially empty list per output column; they are filled
# row by row while the PDF files are read
(
    filename_list,
    calculator_list,
    notepad_list,
    comfort_breaks_list,
    passport_list,
    drivers_license_list,
    national_id_card_list,
    eu_id_card_list,
    work_id_list,
) = ([] for _ in range(9))

#### Loop through each PDF file in the directory

In [14]:
# Empty the result lists first so that re-running this cell rebuilds the rows
# instead of appending a second copy of every file to the module-level lists.
for column_values in (
    filename_list, calculator_list, notepad_list, comfort_breaks_list,
    passport_list, drivers_license_list, national_id_card_list,
    eu_id_card_list, work_id_list,
):
    column_values.clear()

# Loop through each PDF file directly inside the directory (sub-folders are not visited)
for filename in os.listdir(pdf_directory):
    # Case-insensitive so that ".PDF" files are processed too
    if not filename.lower().endswith(".pdf"):
        continue
    pdf_file = os.path.join(pdf_directory, filename)
    # Extract information from the current PDF file
    on_screen_calculator, on_screen_notepad, comfort_breaks, identification = extract_information_from_pdf(pdf_file)
    # Append one value per column so the lists stay aligned row by row
    filename_list.append(filename)
    calculator_list.append(on_screen_calculator)
    notepad_list.append(on_screen_notepad)
    comfort_breaks_list.append(comfort_breaks)
    passport_list.append(identification["Passport"])
    drivers_license_list.append(identification["Driver’s License"])
    national_id_card_list.append(identification["National ID card"])
    eu_id_card_list.append(identification["EU ID card"])
    work_id_list.append(identification["Work ID"])


#### Create a DataFrame to store extracted information in CSV file

In [15]:
# Pair each column title with its list of values, then build the table
# (one row per processed PDF)
data = dict(
    zip(
        [
            "Filename",
            "On Screen Calculator",
            "On Screen Notepad",
            "Comfort Breaks",
            "Passport",
            "Driver’s License",
            "National ID card",
            "EU ID card",
            "Work ID",
        ],
        [
            filename_list,
            calculator_list,
            notepad_list,
            comfort_breaks_list,
            passport_list,
            drivers_license_list,
            national_id_card_list,
            eu_id_card_list,
            work_id_list,
        ],
    )
)
df = pd.DataFrame(data)

#### Path to save the CSV file once extracted

In [16]:
# Location of the CSV file that stores the extracted information
csv_file_path = os.path.join("/Users/rennersantana/Desktop", "AI - ChatBot", "extracted_information.csv")

#### Save DataFrame to a csv file

In [17]:
# Write the table to disk; index=False leaves the row index out of the file
df.to_csv(path_or_buf=csv_file_path, index=False)

#### Print confirmation that the file was saved to the specified path

In [18]:
# Tell the reader where the CSV file was written
print(f"Extracted information saved to: {csv_file_path}")

Extracted information saved to: /Users/rennersantana/Desktop/AI - ChatBot/extracted_information.csv


#### Preprocessing the data – after extracting information from the PDFs and creating the CSV dataset

In [19]:
# Load the data
df = pd.read_csv("/Users/rennersantana/Desktop/AI - ChatBot/extracted_information.csv")
print("Data loaded successfully. Here's a preview:")
print(df.head())

# Standardizing text data
df['On Screen Calculator'] = df['On Screen Calculator'].str.title()  # Ensures consistent casing
df['On Screen Notepad'] = df['On Screen Notepad'].str.title()
df['Comfort Breaks'] = df['Comfort Breaks'].str.title()
print("\nText data standardized. Here's a preview:")
print(df.head())

# Encoding categorical data using one-hot encoding.
# Each column is given its full set of possible values up front. Without this,
# get_dummies only creates a column for values that actually occur in the data,
# so a lookup such as "Comfort Breaks_Allowed" raises KeyError whenever no PDF
# happened to have that value. Levels are listed in sorted order, which keeps
# the column order get_dummies would otherwise produce.
category_levels = {
    'On Screen Calculator': ['Available', 'Not Available'],
    'On Screen Notepad': ['Available', 'Not Available'],
    'Comfort Breaks': ['Allowed', 'Not Allowed'],
    'Passport': ['No', 'Not Available', 'Yes'],
    'Driver’s License': ['No', 'Not Available', 'Yes'],
    'National ID card': ['No', 'Not Available', 'Yes'],
    'EU ID card': ['No', 'Not Available', 'Yes'],
    'Work ID': ['No', 'Not Available', 'Yes'],
}
# astype returns a new frame, so df itself keeps its plain string columns
df_encoded = pd.get_dummies(
    df.astype({
        column: pd.CategoricalDtype(categories=levels)
        for column, levels in category_levels.items()
    }),
    columns=list(category_levels),
)
print("\nCategorical data encoded. Here's a preview of the new columns:")
print(df_encoded.columns)

# Save the preprocessed data
df_encoded.to_csv("/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv", index=False)
# print("\nPreprocessed data saved to '/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv'")


Data loaded successfully. Here's a preview:
                                            Filename On Screen Calculator  \
0                             EMA 23 April, 2024.pdf            Available   
1                              CIIA 2nd May 2024.pdf        Not Available   
2                      PMI 8 May,2024 CPTU2 Exam.pdf        Not Available   
3  SRB - Exam Brief_Senior Bank Resolution Expert...        Not Available   
4            BACP Exam Brief - May 2024 sessions.pdf        Not Available   

  On Screen Notepad Comfort Breaks Passport Driver’s License National ID card  \
0         Available    Not Allowed      Yes              Yes              Yes   
1     Not Available    Not Allowed      Yes              Yes              Yes   
2     Not Available    Not Allowed      Yes              Yes    Not Available   
3         Available    Not Allowed      Yes              Yes              Yes   
4     Not Available    Not Allowed      Yes              Yes              Yes   

      

#### Creating the questions for the chatbot to answer

In [20]:
# Dictionary mapping questions to dataset columns
questions_to_columns = {
    "Is an on-screen calculator available during the exam?": "On Screen Calculator_Available",
    "Is there an on-screen notepad available?": "On Screen Notepad_Available",
    "Are comfort breaks allowed during the exam?": "Comfort Breaks_Allowed",
    "Do I need a passport to take the exam?": "Passport_Yes",
    "Can I use my driver’s license as identification for the exam?": "Driver’s License_Yes",
    "Do I need a national ID card to take the exam?": "National ID card_Yes",
    "Can I use my EU ID card as identification for the exam?": "EU ID card_Yes",
    "Is a work ID an acceptable form of identification for the exam?": "Work ID_Yes"
}

# Function to answer questions based on dataset
def answer_question(question, data_frame, row=0):
    """Answer a known question from one row of the one-hot encoded data.

    Args:
        question: Question text; must match a key of ``questions_to_columns``
            exactly.
        data_frame: One-hot encoded DataFrame holding the indicator columns.
        row: Positional index of the row (exam brief) to answer for.
            Defaults to the first row.

    Returns:
        "Yes" or "No" for a known question, otherwise
        "I don't have information about that.".

    Raises:
        IndexError: If ``row`` is outside ``data_frame``.
    """
    column = questions_to_columns.get(question, "")
    if not column:
        return "I don't have information about that."
    record = data_frame.iloc[row]
    # An indicator column absent from the frame is treated as False (answer
    # "No") instead of raising KeyError.
    return "Yes" if record.get(column, False) else "No"

# Example: reload the encoded table from disk and answer one sample question
import pandas as pd
df_encoded = pd.read_csv(
    filepath_or_buffer="/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv"
)
sample_question = "Is an on-screen calculator available during the exam?"
print(answer_question(question=sample_question, data_frame=df_encoded))

Yes


#### Creating different types of questions to expand the dictionary

In [None]:
# Alternative phrasings that point at the same dataset columns as existing questions
additional_questions = dict([
    ("Can I use a calculator during the test?", "On Screen Calculator_Available"),
    ("Is notepad functionality enabled during the test?", "On Screen Notepad_Available"),
    # Add more variations and related questions here
])
# Merge the new phrasings into the main question-to-column mapping
questions_to_columns.update(additional_questions)


#### Using some machine learning and NLP

In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

# Load SpaCy's English tokenizer with lemmatization
nlp = spacy.load("en_core_web_sm")

# Read the OpenAI API key from the OPENAI_API_KEY environment variable so that a
# real key never has to be typed into (and saved with) the notebook. The
# placeholder is kept as the fallback so the cell still runs when it is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'api-key-here'))

# Load the preprocessed data
df_encoded = pd.read_csv("/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv")

# Define a function to preprocess questions with lemmatization
def preprocess_text(text):
    """Return ``text`` with every token replaced by its lemma, joined by single spaces.

    The text is run through the module-level ``nlp`` pipeline and each token's
    ``lemma_`` attribute is collected in order.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

# TF-IDF over unigrams and bigrams, with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
# NOTE: this rebinds questions_to_columns, so only the two questions below are
# matched from here on; entries added to the earlier dictionary are dropped.
questions_to_columns = dict([
    ("Is an on-screen calculator available during the exam?", "On Screen Calculator_Available"),
    ("Is there an on-screen notepad available?", "On Screen Notepad_Available"),
])
# Lemmatize the known questions, then fit the vectorizer on them
questions = [*questions_to_columns]
processed_questions = list(map(preprocess_text, questions))
question_vectors = vectorizer.fit_transform(processed_questions)

def find_closest_question(user_query, threshold=0.75):
    """Return the known question most similar to ``user_query``, or None.

    The query is lemmatized with ``preprocess_text``, vectorized with the
    fitted ``vectorizer`` and compared against ``question_vectors`` by cosine
    similarity. None is returned when the best score is below ``threshold``.
    """
    query_vector = vectorizer.transform([preprocess_text(user_query)])
    # One query was passed in, so the result has a single row of scores
    scores = cosine_similarity(query_vector, question_vectors)[0]
    best_index = scores.argmax()
    if scores[best_index] < threshold:
        return None
    return questions[best_index]

def answer_question(question, data_frame):
    """Answer ``question`` from the dataset, falling back to the OpenAI chat API.

    Args:
        question: Free-text user question.
        data_frame: One-hot encoded DataFrame; only its first row is consulted.

    Returns:
        "Yes, it is available." or "No, it is not available." when
        ``find_closest_question`` matches a known question; otherwise the
        stripped text of the first choice returned by the "gpt-3.5-turbo"
        chat completion for the raw question.
    """
    matched_question = find_closest_question(question)
    if matched_question:
        column = questions_to_columns[matched_question]
        # An indicator column absent from the frame is treated as False
        # instead of raising KeyError.
        available = data_frame.iloc[0].get(column, False)
        return "Yes, it is available." if available else "No, it is not available."
    # Fallback to GPT if no direct match is found
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": question}],
        model="gpt-3.5-turbo",
    )
    return completion.choices[0].message.content.strip()

# Chatbot interaction loop: read questions until the user types a quit word
print("Welcome to the Exam Info Chatbot! Ask me anything about exam provisions. Type 'quit' to exit.")
while True:
    try:
        # strip() so that "quit " or " exit" still ends the session
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted kernel: end the session like an
        # explicit quit instead of surfacing a traceback
        print("\nThank you for using the Chatbot. Goodbye!")
        break
    if not user_input:
        # Nothing was typed; ask again rather than sending an empty question on
        continue
    if user_input.lower() in ['quit', 'exit', 'bye']:
        print("Thank you for using the Chatbot. Goodbye!")
        break
    response = answer_question(user_input, df_encoded)
    print("Chatbot: ", response)
