In [82]:
import os
import shutil
import pdfplumber
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split



### Define the path to collect and save PDF files from One drive to Local

In [83]:
# Define the OneDrive directory paths
onedrive_path_1 = "/Users/rennersantana/OneDrive - TestReach/Briefs"
onedrive_path_2 = "/Users/rennersantana/OneDrive - TestReach/Briefs/On-demand exam briefs"

### Path to save collected PDF files

In [84]:
# Path to save the collected PDF files
save_path = "/Users/rennersantana/Desktop/AI - ChatBot/PDF_Files"


### Collect PDF files recursively from a directory

In [85]:
# Collect PDF files recursively from a directory
def collect_pdf_files(directory):
    pdf_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files


### Copy PDf files to a specified directory

In [86]:
# Copy PDF files to a specified directory
def copy_pdf_files(pdf_files, save_path):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    for pdf_file in pdf_files:
        shutil.copy(pdf_file, save_path)


### Here we collect the PDf files from both directories

In [87]:
# Here we collect the PDF files from both directories
pdf_files_1 = collect_pdf_files(onedrive_path_1)
pdf_files_2 = collect_pdf_files(onedrive_path_2)


### Combine the lists of PDF

In [88]:
# Combine the lists of PDF files
pdf_files = pdf_files_1 + pdf_files_2


### Copy PDF files to a specified directory

In [89]:
# Copy PDF files to the specified directory
copy_pdf_files(pdf_files, save_path)


### Print the list of collected PDF files

In [90]:
# Print the list of collected PDF files
print("PDF files copied to:", save_path)
#for pdf_file in pdf_files:
    #print(pdf_file)


PDF files copied to: /Users/rennersantana/Desktop/AI - ChatBot/PDF_Files


## Start of the code after copy PDF file localy to a specified directory

### Set the directory containing PDF files

In [91]:
# Directory containing PDF files
pdf_directory = "/Users/rennersantana/OneDrive - TestReach/Briefs"

### Check if the directory exists

In [92]:
# Check if the PDF directory exists; if not, print a message.
if not os.path.exists(pdf_directory):
    print(f"Directory '{pdf_directory}' does not exist.")
    exit()


### Definingh function to collect Specific Information from PDf files

In [93]:
# Defining the function to collect specific information from PDF files
def extract_information_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        # Initialize variables to store extracted information
        on_screen_calculator = "Not Available"
        on_screen_notepad = "Not Available"
        comfort_breaks = "Not Allowed"
        identification = {
            "Passport": "Not Available",
            "Driver’s License": "Not Available",
            "National ID card": "Not Available",
            "EU ID card": "Not Available",
            "Work ID": "Not Available"
        }
        # Loop through each page of the PDF to extract information
        for page in pdf.pages:
            text = page.extract_text()
            # Extract on-screen calculator information
            if "On Screen Calculator: Available" in text:
                on_screen_calculator = "Available"
            # Extract on-screen notepad information
            if "On Screen Notepad: Available" in text:
                on_screen_notepad = "Available"
            # Extract comfort breaks information
            if "Comfort Breaks" in text:
                if any(phrase in text for phrase in ["Allowed", "Y", "YES", "yes"]):
                    comfort_breaks = "Allowed"
                else:
                    comfort_breaks = "Not Allowed"
            # Extract identification information
            for item in identification:
                if item in text:
                    start_idx = text.find(item)
                    end_idx = text.find("\n", start_idx)
                    line = text[start_idx:end_idx]
                    if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "Not Allowed", "allowed", "not allowed"]):
                        identification[item] = "Yes" if any(phrase in line for phrase in ["Y", "YES", "yes", "Allowed", "allowed"]) else "No"
        # Return the extracted information
        return on_screen_calculator, on_screen_notepad, comfort_breaks, identification


### Initialise lists to store extracted information

In [94]:
# Initialize lists to store extracted information
filename_list = []
calculator_list = []
notepad_list = []
comfort_breaks_list = []
passport_list = []
drivers_license_list = []
national_id_card_list = []
eu_id_card_list = []
work_id_list = []

### Loop through each PDf file in directory

In [95]:
# Loop through each PDF file in the directory
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        pdf_file = os.path.join(pdf_directory, filename)
        # Extract information from the current PDF file
        on_screen_calculator, on_screen_notepad, comfort_breaks, identification = extract_information_from_pdf(pdf_file)
        # Append extracted information to lists
        filename_list.append(filename)
        calculator_list.append(on_screen_calculator)
        notepad_list.append(on_screen_notepad)
        comfort_breaks_list.append(comfort_breaks)
        passport_list.append(identification["Passport"])
        drivers_license_list.append(identification["Driver’s License"])
        national_id_card_list.append(identification["National ID card"])
        eu_id_card_list.append(identification["EU ID card"])
        work_id_list.append(identification["Work ID"])


### Create a DataFrame to store extracted information in CSV file

In [96]:
# Create a DataFrame to store the extracted information
data = {
    "Filename": filename_list,
    "On Screen Calculator": calculator_list,
    "On Screen Notepad": notepad_list,
    "Comfort Breaks": comfort_breaks_list,
    "Passport": passport_list,
    "Driver’s License": drivers_license_list,
    "National ID card": national_id_card_list,
    "EU ID card": eu_id_card_list,
    "Work ID": work_id_list
}
df = pd.DataFrame(data)

### Path to save the CSV file once extracted

In [97]:
# Define the path to save the CSV file
csv_file_path = "/Users/rennersantana/Desktop/AI - ChatBot/extracted_information.csv"

### Save DataFrame to a csv file

In [79]:
# Save the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)

### Print confirmation file was saved in the specific path

In [80]:
# Print confirmation message
print("Extracted information saved to:", csv_file_path)

Extracted information saved to: /Users/rennersantana/Desktop/AI - ChatBot/extracted_information.csv


In [81]:
df.head()

Unnamed: 0,Filename,On Screen Calculator,On Screen Notepad,Comfort Breaks,Passport,Driver’s License,National ID card,EU ID card,Work ID
0,"EMA 23 April, 2024.pdf",Available,Available,Not Allowed,Yes,Yes,Yes,Yes,Not Available
1,SRB - Exam Brief_Senior Bank Resolution Expert...,Not Available,Available,Not Allowed,Yes,Yes,Yes,Yes,Not Available
2,FSEM Exam Brief 2024.docx.pdf,Not Available,Not Available,Not Allowed,Yes,Yes,Yes,Yes,Not Available
3,CIPS Brief 2024.pdf,Available,Available,Not Allowed,Yes,Yes,Yes,Yes,Not Available
4,"ABP Brief 26 April, 2024.pdf",Not Available,Not Available,Not Allowed,Yes,Not Available,Yes,Yes,Not Available
