# **IMPORT MODEL**

In [1]:
from keras.models import load_model
import pickle

# Locations of the saved artifacts. Change these to point the notebook at a
# different model / tokenizer / label encoder.
model_path = 'model2.keras'
tokenizer_path = 'tokenizer.pickle'
label_encoder_path = 'label_encoder.pickle'

# Load the saved Keras model from the configured path. The path variable is
# used (rather than repeating the literal) so that editing `model_path` above
# actually changes which model is loaded.
model = load_model(model_path)

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load tokenizer / encoder pickles that come from a trusted source.

# Load the tokenizer
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the label encoder
with open(label_encoder_path, 'rb') as handle:
    label_encoder = pickle.load(handle)


# **TEST MODEL (CSV FILE)**

In [5]:
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

csv_path = 'prediction_data.csv'
output_csv_path = 'prediction_data_with_predictions_test.csv'

# Load the CSV file; the code below reads its "Text" and "Label" columns.
data = pd.read_csv(csv_path)

# Replace missing values FIRST, then cast to str. In the opposite order,
# astype(str) turns NaN into the literal string 'nan' and the subsequent
# fillna('') has nothing left to fill.
data['Text'] = data['Text'].fillna('').astype(str)

# Extract the "Text" column into a list
new_texts = data['Text'].tolist()

# Extract Label data
label_data = data['Label']

# Tokenize and pad/truncate every text to a fixed length of 200 tokens.
# NOTE(review): 200 presumably matches the sequence length used at training
# time - confirm against the training notebook.
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=200, padding='post', truncating='post')

# Evaluate model to build the compiled metrics
# Use a small batch of data to avoid computational overhead if necessary
eval_data = new_padded_sequences[:100]  # Adjust size as needed
eval_labels = label_data[:100].values  # Adjust size as needed
loss, accuracy = model.evaluate(eval_data, eval_labels, verbose=0)
print(f"Model evaluation results - Loss: {loss}, Accuracy: {accuracy}")

# Predict with the model.
# NOTE(review): the explicit sigmoid assumes the model emits raw logits (no
# sigmoid in its output layer) - TODO confirm. If the model already outputs
# probabilities, this would squash them into [0.5, 0.73] and every row would
# pass the 0.5 threshold.
predictions = tf.nn.sigmoid(model.predict(new_padded_sequences)).numpy()
predicted_labels = (predictions > 0.5).astype(int)
predicted_labels = label_encoder.inverse_transform(predicted_labels.flatten())

# Add the predictions to the original DataFrame
data['prediction'] = predicted_labels

# Save the updated DataFrame to a new CSV file
data.to_csv(output_csv_path, index=False)

print("Predictions added to the CSV file successfully.")

# Count matches. Pair values positionally with zip instead of label_data[i]:
# Series[int] is a label lookup, which fails or mis-aligns as soon as the
# DataFrame index is not the default 0..n-1 range.
count = sum(
    1
    for actual, predicted in zip(label_data, predicted_labels)
    if int(actual) == predicted
)

print("Matches: ", str(count), "/", str(len(label_data)))


Model evaluation results - Loss: 0.05521168187260628, Accuracy: 0.9900000095367432
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step
Predictions added to the CSV file successfully.
Matches:  4101 / 4167


# **PDF INPUT**

In [2]:
import fitz
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

PATH_TO_PDF = '230911.pdf'

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one string per page.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        list[str]: ``page.get_text()`` for each page, in page order, so
        index ``i`` holds the text of page ``i + 1``.
    """
    # The context manager closes the document even if text extraction raises
    # part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        return [page.get_text() for page in pdf_document]

# One text per PDF page -> token id sequences -> fixed-width (200) matrix,
# padded and truncated at the end of each sequence.
new_texts = extract_text_from_pdf(PATH_TO_PDF)
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(
    new_sequences,
    maxlen=200,
    padding='post',
    truncating='post',
)

# Score every page, threshold at 0.5, and map the 0/1 flags back through the
# label encoder.
predictions = tf.nn.sigmoid(model.predict(new_padded_sequences)).numpy()
predicted_labels = label_encoder.inverse_transform(
    (predictions > 0.5).astype(int).flatten()
)

# Collect the 1-based page numbers whose decoded label equals 1.
output_list = [
    page_number
    for page_number, label in enumerate(predicted_labels, start=1)
    if label == 1
]

# The message below is Turkish for "Total page count".
print(f"Toplam sayfa sayısı: {len(predicted_labels)}")
print(output_list)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Toplam sayfa sayısı: 232
[6, 7, 9, 10, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 216]


# **URL INPUT**

In [1]:
import fitz
import requests

def download_pdf(url, save_path, timeout=30):
    """Download a file over HTTP and write it to ``save_path``.

    Args:
        url: Address of the PDF to fetch.
        save_path: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        bool: True if the server answered with status 200 and the file was
        written, False otherwise. A status message is printed in both cases.

    Raises:
        requests.RequestException: On connection errors or timeouts (not
        caught here).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Nothing is written on failure, so any existing file at save_path
        # is left untouched.
        print(f"Failed to download PDF: {response.status_code}")
        return False

    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"PDF downloaded successfully and saved as {save_path}")
    return True


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one string per page.

    NOTE(review): a function with this same name is also defined in the
    PDF INPUT section of this notebook; consider keeping a single definition
    (or moving it to a shared module) so the copies cannot drift apart.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        list[str]: ``page.get_text()`` for each page, in page order.
    """
    # The context manager closes the document even if text extraction raises
    # part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_document:
        return [page.get_text() for page in pdf_document]


# Placeholder - replace with the real URL of the PDF to classify.
url = 'PDFURL'
save_path = 'downloaded_file.pdf'

download_pdf(url, save_path)
# Read back the file that was just written. Using `save_path` (instead of
# repeating the literal) keeps download and extraction pointed at the same file.
# NOTE(review): download_pdf only prints a message on a non-200 response, so
# after a failed download this reads whatever file already exists at save_path.
new_texts = extract_text_from_pdf(save_path)
# From here the steps are the same as in the PDF INPUT section:
# tokenize, pad, predict, and list the pages labelled 1.


PDF downloaded successfully and saved as downloaded_file.pdf
