In [99]:
import os
import openai
import shutil
import pdfplumber
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import DataLoader, Dataset
import torch

#### Define the path to collect and save PDF files from One drive to Local

#### We use onedrive to collect PDFs files from the cloud and save it locally so we can use it later to build the dataset

In [100]:
# Define the OneDrive directory paths
onedrive_path_1 = "/Users/rennersantana/OneDrive - TestReach/Briefs"
onedrive_path_2 = "/Users/rennersantana/OneDrive - TestReach/Briefs/On-demand exam briefs"

#### Define the path where the PDF files are stored

In [101]:
# Path to save the collected PDF files
save_path = "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files"


#### Collect PDF files recursively from the directory set

In [102]:
# Collect PDF files recursively from a directory
def collect_pdf_files(directory):
    pdf_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files


#### Copy PDf files to a specified directory

In [103]:
# Copy PDF files to a specified directory
def copy_pdf_files(pdf_files, save_path):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    for pdf_file in pdf_files:
        shutil.copy(pdf_file, save_path)


#### PDFs are collected from the directories

In [104]:
# Here we collect the PDF files from both directories
pdf_files_1 = collect_pdf_files(onedrive_path_1)
pdf_files_2 = collect_pdf_files(onedrive_path_2)


#### Combine the lists of PDF from the two folders

In [105]:
# Combine the lists of PDF files
pdf_files = pdf_files_1 + pdf_files_2


#### Copy PDF files to a specified directory

In [106]:
# Copy PDF files to the specified directory
copy_pdf_files(pdf_files, save_path)


#### Print the list of collected PDF files and store it locally

In [107]:
# Print the list of collected PDF files
print("PDF files copied to:", save_path)
#for pdf_file in pdf_files:
    #print(pdf_file)


PDF files copied to: /Users/rennersantana/Desktop/AI - ChatBot/PDF_Files


#### After saved the files to the drive of choice the dataset will start to be build

#### Set the directory containing PDF files

In [108]:
# Directory containing PDF files
pdf_directory = "/Users/rennersantana/OneDrive - TestReach/Briefs"

#### Check if the directory exists

In [109]:
# Check if the PDF directory exists; if not, print a message.
if not os.path.exists(pdf_directory):
    print(f"Directory '{pdf_directory}' does not exist.")
    exit()


#### Defining function to collect peace of Informations from PDF files
#### Mapping of original organisation names to anonymize  the name of orgs for privacy and security reasons.

In [110]:
import pdfplumber
import os

# Mapping of original organization names to anonymized identifiers
org_mapping = {
    "ABP Brief 26 April, 2024.pdf": "#001",
    "ABDO 13 May,2024.pdf": "#002",
    "EMA 23 April, 2024.pdf": "#003",
    "CIIA 2nd May 2024.pdf": "#004",
    "PMI 8 May,2024 CPTU2 Exam.pdf": "#005",
    "SRB - Exam Brief_Senior Bank Resolution Expert_SRB AD 2023 006_Profile 2.docx.pdf": "#006"
}

# Here it extracts specific info from a PDF file
def extract_information_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        # Initialize default values for the info we want to extract
        on_screen_calculator = "Not Available"
        on_screen_notepad = "Not Available"
        comfort_breaks = "Not Allowed"
        identification = {
            "Passport": "Not Available",
            "Driver’s License": "Not Available",
            "National ID card": "Not Available",
            "EU ID card": "Not Available",
            "Work ID": "Not Available"
        }
        # Get the filename from the file path
        filename = pdf_file.split("/")[-1]
        # Replace org name with anonymized identifier
        filename_anonymized = org_mapping.get(filename, filename)
        
        # Loop through each page in the PDF to find the info we need
        for page in pdf.pages:
            text = page.extract_text()
            # Check if on-screen calculator is available
            if "On Screen Calculator: Available" in text:
                on_screen_calculator = "Available"
            # Check if on-screen notepad is available
            if "On Screen Notepad: Available" in text:
                on_screen_notepad = "Available"
            # Check if comfort breaks are allowed
            if "Comfort Breaks" in text:
                if any(phrase in text for phrase in ["Allowed", "Y", "YES", "yes"]):
                    comfort_breaks = "Allowed"
                else:
                    comfort_breaks = "Not Allowed"
            # Check the availability of different types of identification
            for item in identification:
                if item in text:
                    start_idx = text.find(item)
                    end_idx = text.find("\n", start_idx)
                    line = text[start_idx:end_idx]
                    if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "Not Allowed", "allowed", "not allowed"]):
                        identification[item] = "Yes" if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "allowed"]) else "No"
        
        # Return the anonymized org name and all the extracted info
        return filename_anonymized, on_screen_calculator, on_screen_notepad, comfort_breaks, identification

# Path to the directory containing the PDF files
pdf_directory = "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files"
# Get a list of all PDF files in the directory
pdf_files = [os.path.join(pdf_directory, filename) for filename in os.listdir(pdf_directory) if filename.endswith(".pdf")]

# Process each PDF file and print the extracted info
for pdf_file in pdf_files:
    info = extract_information_from_pdf(pdf_file)
    print(info)


('IOB May 2024 - Investment.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Not Available', 'National ID card': 'Not Available', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
('ODOW - iStructE Exam Brief + Script.pdf', 'Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Yes'})
('#003', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
('#004', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Yes'})
('#005', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Not Available', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
('#006', '

('IGPI Exam Brief - May 2024 Sessions.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
('WSAL brief 24.04.2024.pdf', 'Available', 'Available', 'Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Not Available', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
('COE 16 May, 2024.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
('EMCDDA Brief 16 April 2024.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
('IGP&I May, 2024.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Not Available', 'Driver’s License': 'Not Available', 'National ID card': 'Not Available', 'EU

('CIIA Brief 4 April 2024.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Yes'})
('Stress Analysis Certification Testing Exam Brief CS030F 2024.docx.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
('EUAA Recruitment - 4th April,2024.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
('ODOW - ISP Exam Brief 2023.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Yes'})
('RCSI 15 April,24 MFD2-Enhanced Validation.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Not Availab

#### Anonymizing the data to hide organisation name

In [111]:
import pdfplumber

# Define a mapping between original organization names and anonymized identifiers
org_mapping = {
    "ABP Brief 26 April, 2024.pdf": "#001",
    "ABDO 13 May,2024.pdf": "#002",
    "EMA 23 April, 2024.pdf": "#003",
    "CIIA 2nd May 2024.pdf": "#004",
    "PMI 8 May,2024 CPTU2 Exam.pdf": "#005",
    "SRB - Exam Brief_Senior Bank Resolution Expert_SRB AD 2023 006_Profile 2.docx.pdf": "#006"
    
}

# Defining the function to collect specific information from PDF files
def extract_information_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        # Initialize variables to store extracted information
        on_screen_calculator = "Not Available"
        on_screen_notepad = "Not Available"
        comfort_breaks = "Not Allowed"
        identification = {
            "Passport": "Not Available",
            "Driver’s License": "Not Available",
            "National ID card": "Not Available",
            "EU ID card": "Not Available",
            "Work ID": "Not Available"
        }
        # Get the filename (org name) from the path
        filename = pdf_file.split("/")[-1]
        # Replace org name with anonymized identifier
        filename_anonymized = org_mapping.get(filename, filename)
        # Loop through each page of the PDF to extract information
        for page in pdf.pages:
            text = page.extract_text()
            # Extract on-screen calculator information
            if "On Screen Calculator: Available" in text:
                on_screen_calculator = "Available"
            # Extract on-screen notepad information
            if "On Screen Notepad: Available" in text:
                on_screen_notepad = "Available"
            # Extract comfort breaks information
            if "Comfort Breaks" in text:
                if any(phrase in text for phrase in ["Allowed", "Y", "YES", "yes"]):
                    comfort_breaks = "Allowed"
                else:
                    comfort_breaks = "Not Allowed"
            # Extract identification information
            for item in identification:
                if item in text:
                    start_idx = text.find(item)
                    end_idx = text.find("\n", start_idx)
                    line = text[start_idx:end_idx]
                    if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "Not Allowed", "allowed", "not allowed"]):
                        identification[item] = "Yes" if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "allowed"]) else "No"
        # Return the extracted information along with anonymized org name
        return filename_anonymized, on_screen_calculator, on_screen_notepad, comfort_breaks, identification

# Example usage:
pdf_files = [
    "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files/ABP Brief 26 April, 2024.pdf",
    "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files/ABDO 13 May,2024.pdf",
    "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files/EMA 23 April, 2024.pdf",
    "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files/CIIA 2nd May 2024.pdf",
    "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files/PMI 8 May,2024 CPTU2 Exam.pdf",
    "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files/SRB - Exam Brief_Senior Bank Resolution Expert_SRB AD 2023 006_Profile 2.docx.pdf"
]

for pdf_file in pdf_files:
    info = extract_information_from_pdf(pdf_file)
    print(info)


('#001', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Not Available', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
('#002', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Not Available', 'National ID card': 'Not Available', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
('#003', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
('#004', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Yes'})
('#005', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Not Available', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
('#006', 'Not Available', 'Available',

#### We initialise lists to store extracted information

In [112]:
# Initialize lists to store extracted information
filename_list = []
calculator_list = []
notepad_list = []
comfort_breaks_list = []
passport_list = []
drivers_license_list = []
national_id_card_list = []
eu_id_card_list = []
work_id_list = []

#### Loop through each PDF file in directory so we can save the dataset later

In [113]:
def extract_information_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        # Initialize variables to store extracted information
        on_screen_calculator = "Not Available"
        on_screen_notepad = "Not Available"
        comfort_breaks = "Not Allowed"
        identification = {
            "Passport": "Not Available",
            "Driver’s License": "Not Available",
            "National ID card": "Not Available",
            "EU ID card": "Not Available",
            "Work ID": "Not Available"
        }
        # Get the filename (org name) from the path
        filename = pdf_file.split("/")[-1]
        # Replace org name with anonymized identifier
        filename_anonymized = org_mapping.get(filename, filename)
        # Loop through each page of the PDF to extract information
        for page in pdf.pages:
            text = page.extract_text()
            # Extract on-screen calculator information
            if "On Screen Calculator: Available" in text:
                on_screen_calculator = "Available"
            # Extract on-screen notepad information
            if "On Screen Notepad: Available" in text:
                on_screen_notepad = "Available"
            # Extract comfort breaks information
            if "Comfort Breaks" in text:
                if any(phrase in text for phrase in ["Allowed", "Y", "YES", "yes"]):
                    comfort_breaks = "Allowed"
                else:
                    comfort_breaks = "Not Allowed"
            # Extract identification information
            for item in identification:
                if item in text:
                    start_idx = text.find(item)
                    end_idx = text.find("\n", start_idx)
                    line = text[start_idx:end_idx]
                    if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "Not Allowed", "allowed", "not allowed"]):
                        identification[item] = "Yes" if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "allowed"]) else "No"
        # Return the extracted information along with anonymized org name
        return filename_anonymized, on_screen_calculator, on_screen_notepad, comfort_breaks, identification


#### Create a DataFrame to store extracted information and save the dataset

In [114]:
# Loop through each PDF file in the directory
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        pdf_file = os.path.join(pdf_directory, filename)
        try:
            # Extract information from the current PDF file
            info = extract_information_from_pdf(pdf_file)
            print(f"Processed file: {filename}, Info: {info}")
            
            # Append extracted information to lists
            filename_list.append(info[0])
            calculator_list.append(info[1])
            notepad_list.append(info[2])
            comfort_breaks_list.append(info[3])
            passport_list.append(info[4]["Passport"])
            drivers_license_list.append(info[4]["Driver’s License"])
            national_id_card_list.append(info[4]["National ID card"])
            eu_id_card_list.append(info[4]["EU ID card"])
            work_id_list.append(info[4]["Work ID"])
        except Exception as e:
            print(f"Error processing file {filename}: {e}")

# Create a DataFrame to store the extracted information
data = {
    "Filename": filename_list,
    "On Screen Calculator": calculator_list,
    "On Screen Notepad": notepad_list,
    "Comfort Breaks": comfort_breaks_list,
    "Passport": passport_list,
    "Driver’s License": drivers_license_list,
    "National ID card": national_id_card_list,
    "EU ID card": eu_id_card_list,
    "Work ID": work_id_list
}
df = pd.DataFrame(data)


Processed file: IOB May 2024 - Investment.pdf, Info: ('IOB May 2024 - Investment.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Not Available', 'National ID card': 'Not Available', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
Processed file: ODOW - iStructE Exam Brief + Script.pdf, Info: ('ODOW - iStructE Exam Brief + Script.pdf', 'Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Yes'})
Processed file: EMA 23 April, 2024.pdf, Info: ('#003', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
Processed file: CIIA 2nd May 2024.pdf, Info: ('#004', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Yes'})
Proce

Processed file: IOB May 2024 - Loans.pdf, Info: ('IOB May 2024 - Loans.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Not Available', 'National ID card': 'Not Available', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
Processed file: FOM 13 May,2024 Diploma in Occupational Medicine.pdf, Info: ('FOM 13 May,2024 Diploma in Occupational Medicine.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Not Available', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
Processed file: ERCEA 30 April, 2024.pdf, Info: ('ERCEA 30 April, 2024.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
Processed file: FSEM Brief 7 May,2024.pdf, Info: ('FSEM Brief 7 May,2024.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': '

Processed file: CISI Spanish MCQ Brief 2023.pdf, Info: ('CISI Spanish MCQ Brief 2023.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Yes'})
Processed file: CIPS Brief May 2024.pdf, Info: ('CIPS Brief May 2024.pdf', 'Available', 'Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
Processed file: COE brief 24.04.2024.pdf, Info: ('COE brief 24.04.2024.pdf', 'Not Available', 'Not Available', 'Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Yes', 'Work ID': 'Not Available'})
Processed file: F4E Brief 2024.pdf, Info: ('F4E Brief 2024.pdf', 'Not Available', 'Not Available', 'Not Allowed', {'Passport': 'Yes', 'Driver’s License': 'Yes', 'National ID card': 'Yes', 'EU ID card': 'Not Available', 'Work ID': 'Not Available'})
Processed fi

#### Once information is collected it is stored in the path of our choice

In [115]:
# Define the path to save the CSV file
csv_file_path = "/Users/rennersantana/Desktop/AI - ChatBot/extracted_information.csv"

#### Save DataFrame to a csv file

In [116]:
# Save the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)

#### Print confirmation file was saved in the specific path

In [117]:
# Print confirmation message
print("Extracted information saved to:", csv_file_path)

Extracted information saved to: /Users/rennersantana/Desktop/AI - ChatBot/extracted_information.csv


#### we load the data, reads the original CSV file containing exam-relate information.


#### We standardize text to ensure text data in specific columns is consistently formatted.


#### One-Hot Enconding so we can convert categorical text data into a format suitable for machine learning by creating binary columns.

#### We save the Preprocessed data and writes the transformed data to a new CSV file for further use.


#### We catch and print any errors that occur during the process











In [118]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load SpaCy's English tokenizer
nlp = spacy.load("en_core_web_sm")

try:
    # Load the data
    df = pd.read_csv("/Users/rennersantana/Desktop/AI - ChatBot/extracted_information.csv")
    print("Data loaded successfully:")
    #print(df.head())

    # Standardizing text data
    df['On Screen Calculator'] = df['On Screen Calculator'].str.title()
    df['On Screen Notepad'] = df['On Screen Notepad'].str.title()
    df['Comfort Breaks'] = df['Comfort Breaks'].str.title()
    #print("\nText data standardized. Here's a preview:")
    #print(df.head())

    # Encoding categorical data using one-hot encoding
    df_encoded = pd.get_dummies(df, columns=[
        'On Screen Calculator', 'On Screen Notepad', 'Comfort Breaks',
        'Passport', 'Driver’s License', 'National ID card', 'EU ID card', 'Work ID'
    ])
    #print("\nCategorical data encoded. Here's a preview of the new columns:")
    #print(df_encoded.columns)

    # Save the preprocessed data
    df_encoded.to_csv("/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv", index=False)
    print("\nPreprocessed data saved to '/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv'")
except Exception as e:
    print("An error occurred:", e)


Data loaded successfully:

Preprocessed data saved to '/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv'


#### spacy: A library for advanced natural language processing in Python. Here it's used to load a pre-trained English tokenizer with lemmatization.

#### sklearn: A machine learning library. Specifically, TfidfVectorizer converts text data into numerical format (TF-IDF), and cosine_similarity measures similarity between vectors.

#### openai: Used to interact with the OpenAI GPT-3.5 API for generating detailed responses.

#### re: The regular expression library used for pattern matching.
#### nlp: Loads SpaCy's English tokenizer for processing text data.
#### client: Initializes the OpenAI API client with the provided API key.


In [119]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
import re

# Load SpaCy's English tokenizer with lemmatization
nlp = spacy.load("en_core_web_sm")

# Set your OpenAI API key
client = OpenAI(api_key='your_api_key')

# Load the preprocessed data
df_encoded = pd.read_csv("/Users/rennersantana/Desktop/AI - ChatBot/preprocessed_extracted_information.csv")

# Define a function to preprocess questions with lemmatization
def preprocess_text(text):
    return " ".join([token.lemma_ for token in nlp(text.lower())])

# Setup vectorization and NLP processing
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
questions_to_columns = {
    # On Screen Calculator Availability
    "is an on-screen calculator available during the exam?": "On Screen Calculator_Available",
    "can i use an on-screen calculator during the exam?": "On Screen Calculator_Available",
    "will there be an on-screen calculator available for the test?": "On Screen Calculator_Available",
    "is the on-screen calculator provided?": "On Screen Calculator_Available",
    "do we have access to an on-screen calculator in the exam?": "On Screen Calculator_Available",
    
    # On Screen Notepad Availability
    "is there an on-screen notepad available?": "On Screen Notepad_Available",
    "can i use an on-screen notepad during the exam?": "On Screen Notepad_Available",
    "will an on-screen notepad be provided during the test?": "On Screen Notepad_Available",
    "is the on-screen notepad accessible during the exam?": "On Screen Notepad_Available",
    "do we have access to an on-screen notepad?": "On Screen Notepad_Available",
    
    # Comfort Breaks Allowed
    "are comfort breaks allowed during the exam?": "Comfort Breaks_Allowed",
    "can i take a comfort break during the test?": "Comfort Breaks_Allowed",
    "is it possible to have comfort breaks during the exam?": "Comfort Breaks_Allowed",
    "will comfort breaks be permitted during the exam?": "Comfort Breaks_Allowed",
    "are we allowed to have comfort breaks?": "Comfort Breaks_Allowed",
    
    # Passport Requirement
    "do i need a passport to take the exam?": "Passport_Yes",
    "is a passport required for the test?": "Passport_Yes",
    "will i need to bring a passport for the exam?": "Passport_Yes",
    "is a passport necessary for the exam?": "Passport_Yes",
    "do i have to bring my passport to the exam?": "Passport_Yes",
    
    # Driver’s License as Identification
    "can i use my driver’s license as identification for the exam?": "Driver’s License_Yes",
    "is a driver’s license acceptable as id for the test?": "Driver’s License_Yes",
    "will a driver’s license be sufficient as identification for the exam?": "Driver’s License_Yes",
    "can my driver’s license be used for identification purposes?": "Driver’s License_Yes",
    "is it okay to use my driver’s license for the exam id?": "Driver’s License_Yes",
    
    # National ID Card Requirement
    "do i need a national id card to take the exam?": "National ID card_Yes",
    "is a national id card required for the test?": "National ID card_Yes",
    "will i need to bring my national id card for the exam?": "National ID card_Yes",
    "is a national id card necessary for the exam?": "National ID card_Yes",
    "do i have to bring my national id card to the exam?": "National ID card_Yes",
    
    # EU ID Card as Identification
    "can i use my eu id card as identification for the exam?": "EU ID card_Yes",
    "is an eu id card acceptable as id for the test?": "EU ID card_Yes",
    "will an eu id card be sufficient as identification for the exam?": "EU ID card_Yes",
    "can my eu id card be used for identification purposes?": "EU ID card_Yes",
    "is it okay to use my eu id card for the exam id?": "EU ID card_Yes",
    
    # Work ID as Identification
    "is a work id an acceptable form of identification for the exam?": "Work ID_Yes",
    "can i use my work id for the test?": "Work ID_Yes",
    "will a work id be sufficient as identification for the exam?": "Work ID_Yes",
    "is it possible to use a work id for the exam?": "Work ID_Yes",
    "do we need to bring a work id for the test?": "Work ID_Yes"
}
questions = list(questions_to_columns.keys())
processed_questions = [preprocess_text(question) for question in questions]
question_vectors = vectorizer.fit_transform(processed_questions)

def find_closest_question(user_query, threshold=0.75):
    processed_query = preprocess_text(user_query)
    query_vector = vectorizer.transform([processed_query])
    similarities = cosine_similarity(query_vector, question_vectors)
    max_similarity = max(similarities[0])
    if max_similarity < threshold:
        return None
    return questions[similarities.argmax()]

def answer_question(question, data_frame, org_code):
    matched_question = find_closest_question(question)
    if matched_question:
        column = questions_to_columns[matched_question]
        basic_response = "Yes" if data_frame.iloc[org_code][column] else "No"
        # Use GPT to elaborate on the basic response with context
        org_details = data_frame.iloc[org_code].to_dict()
        full_query = f"{matched_question} Answer: {basic_response}. Here are the details for organization {org_code}: {org_details}. Can you provide more details?"
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": full_query}],
            model="gpt-3.5-turbo",
        )
        return response.choices[0].message.content.strip()
    else:
        # Fallback to GPT if no direct match is found
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": question}],
            model="gpt-3.5-turbo",
        )
        return response.choices[0].message.content.strip()

def contains_org_code(user_input):
    # Check if the input contains an organization code like #001, #002, etc.
    match = re.search(r"#(\d{3})", user_input)
    if match:
        return int(match.group(1))
    return None

# Chatbot interaction loop
print("Welcome to the Exam Info Chatbot! Ask me anything about exam provisions. Type 'quit' to exit.")
pending_question = None
org_code = None
while True:
    user_input = input("You: ")
    if user_input.lower() in ['quit', 'exit', 'bye']:
        print("Thank you for using the Chatbot. Goodbye!")
        break
    if pending_question:
        code = contains_org_code(user_input)
        if code is not None:
            # Process the pending question with the provided organization code
            org_code = code
            response = answer_question(pending_question, df_encoded, org_code)
            print("Chatbot: ", response)
            pending_question = None
        else:
            print("Chatbot: Please specify the organization code (e.g., #001).")
    else:
        code = contains_org_code(user_input)
        if code is not None:
            org_code = code
            response = answer_question(user_input, df_encoded, org_code)
            print("Chatbot: ", response)
        else:
            pending_question = user_input
            print("Chatbot: Please specify the organization code (e.g., #001).")


Welcome to the Exam Info Chatbot! Ask me anything about exam provisions. Type 'quit' to exit.
You: is comfort break allowed ?
Chatbot: Please specify the organization code (e.g., #001).
You: #004
Chatbot:  Yes, here are more details for Organization 4:

- On Screen Calculator: Available
- On Screen Notepad: Not Available
- Comfort Breaks: Not Allowed
- Passport: Yes
- Driver's License: Yes
- National ID card: Not Available
- EU ID card: Not Available
- Work ID: Not Available

Please note that comfort breaks are not allowed during the exam for this organization.
You: is comfort break allowed?
Chatbot: Please specify the organization code (e.g., #001).
You: #001
Chatbot:  Organization 1 does not allow comfort breaks during the exam. They also require candidates to present a valid form of identification such as a passport, driver's license, national ID card, EU ID card, or work ID. Additionally, they provide an on-screen calculator but do not provide an on-screen notepad for taking notes.