In [None]:
import requests
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import PyPDF2

def fetch_text_from_pdf(pdf_link):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_link: Path of the PDF. Despite the name, the value is passed
            straight to ``open``, so it must be a local file path, not a URL.

    Returns:
        The text of all pages joined with newlines, so that the last word of
        one page is not glued to the first word of the next. A page whose
        extraction yields nothing contributes an empty string. A PDF without
        pages yields "".
    """
    with open(pdf_link, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract inside the `with` block (as before) so the file is still open.
        # `or ""` keeps the join from failing if a page yields None.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # A single join avoids repeated string concatenation and inserts the page
    # separator that plain `+=` was missing.
    return "\n".join(page_texts)

def preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    Steps, in order: drop non-ASCII characters, lowercase, tokenize with
    ``word_tokenize``, then discard English stopwords and punctuation tokens.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Encoding to ASCII with errors='ignore' drops every non-ASCII character.
    # ASCII control characters are NOT removed by this step.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Convert the text to lowercase
    text = text.lower()
    # Tokenize the text into individual words
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    # A token counts as punctuation when *all* of its characters are
    # punctuation. Testing whole-token membership only caught single
    # characters and let multi-character tokens such as "..." or "--" through.
    tokens = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation for char in token)
    ]
    return " ".join(tokens)

# Collect and preprocess data from the PDFs (HR policies and IT rules).
# `corpus` ends up holding one preprocessed string per document.
corpus = []
# List of PDFs for HR policies and IT rules; "pdf_link" is handed to
# fetch_text_from_pdf(), which opens it as a file path.
documents = [
    {"title": "HR Policy 1", "pdf_link": "Sample Employee Handbook - National Council of Nonprofits.pdf"},
    #{"title": "IT Rule 1", "pdf_link": "<URL>"},
    # Add more HR and IT documents to the list
]

for doc in documents:
    pdf_link = doc["pdf_link"]
    text = fetch_text_from_pdf(pdf_link)
    processed_text = preprocess_text(text)
    # append, not extend: extend() iterates its argument, and iterating a
    # string would add every single character as a separate corpus entry.
    corpus.append(processed_text)

# Print the preprocessed data
print(corpus)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set hyperparameters
vocab_size = 10000      # rows of the embedding table; also the softmax width below
embedding_dim = 128
max_seq_length = 50     # every sequence is padded/truncated to this length
lstm_units = 256
output_units = vocab_size

# NOTE(review): X_train, y_train, X_val and y_val are not defined in the cells
# visible here. They must already exist in the kernel, otherwise this cell
# raises NameError on a fresh "Restart & Run All".
# Pad sequences
X_train = pad_sequences(X_train, maxlen=max_seq_length)
X_val = pad_sequences(X_val, maxlen=max_seq_length)

# Build the model
# The input shape is declared with an explicit Input instead of Embedding's
# `input_length` argument, which is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(max_seq_length,)),
    Embedding(vocab_size, embedding_dim),
    LSTM(lstm_units),
    Dense(output_units, activation='softmax')
])

# Compile and train the model
# sparse_categorical_crossentropy expects integer class ids as targets
# (presumably token ids < vocab_size — confirm against how y_* is built).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Save the trained model
model.save('custom_llm_model.h5')