In [15]:
import PyPDF2
import re
import numpy as np
import math
from collections import Counter
from nltk.corpus import stopwords

In [16]:
# Read PDF files
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, each page followed by one space.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: The concatenated page texts; "" when the PDF has no pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Pages must be consumed while the file is still open.
        # `or ""` is a defensive guard: if a page yields no text (None), it
        # contributes an empty string instead of raising TypeError on "+".
        # A single join also avoids rebuilding the string on every page.
        return "".join((page.extract_text() or "") + " " for page in reader.pages)

In [17]:
# Function to preprocess text
def preprocess_text(text):
    """Turn raw text into a list of lowercase, letters-only, non-stop-word tokens.

    Steps: lowercase, delete every character that is not a-z or whitespace,
    split on whitespace, then drop English stop words.

    Args:
        text: The raw document text.

    Returns:
        list[str]: Remaining tokens in their original order (duplicates kept).
    """
    cleanup_pattern = r'[^a-z\s]'

    text = text.lower()
    # Non-letters are deleted, not replaced by a space, so "self-defense"
    # becomes the single token "selfdefense" and "don't" becomes "dont".
    text = re.sub(cleanup_pattern, '', text)
    tokens = text.split()

    # Clean the stop-word list with the same rule as the text. Otherwise
    # entries containing an apostrophe (e.g. "don't") can never match their
    # stripped token ("dont") and contractions slip past the filter.
    stop_words = {
        re.sub(cleanup_pattern, '', word.lower())
        for word in stopwords.words('english')
    }
    return [word for word in tokens if word not in stop_words]

In [18]:
# Compute Term Frequency (TF)
def compute_tf(words):
    """Compute the relative frequency of each distinct token.

    Args:
        words: Sized collection of tokens belonging to one document.

    Returns:
        dict: token -> (occurrences of token) / (total number of tokens).
        Empty when ``words`` is empty.
    """
    occurrences = Counter(words)
    n_terms = len(words)

    frequencies = {}
    for term, hits in occurrences.items():
        frequencies[term] = hits / n_terms
    return frequencies

In [19]:
# Compute Inverse Document Frequency (IDF)
def compute_idf(documents):
    """Compute smoothed inverse document frequency for every token in the corpus.

    For each token the value is ``ln((1 + N) / (1 + df)) + 1`` where ``N`` is
    the number of documents and ``df`` the number of documents that contain
    the token at least once.

    Args:
        documents: Sized collection of documents, each an iterable of tokens.

    Returns:
        dict: token -> IDF value. Empty when there are no tokens.
    """
    n_docs = len(documents)

    # Count, for every token, how many documents contain it. Each document is
    # reduced to a set first so repeated tokens count once and no per-token
    # scan of a token list is needed.
    doc_freq = Counter()
    for doc in documents:
        doc_freq.update(set(doc))

    return {
        word: np.log((1 + n_docs) / (1 + containing_docs)) + 1
        for word, containing_docs in doc_freq.items()
    }

In [20]:
# Compute TF-IDF
def compute_tfidf(tf, idf):
    """Weight each term frequency by the matching inverse document frequency.

    Args:
        tf: dict of token -> term frequency for one document.
        idf: dict of token -> IDF value; must contain every token in ``tf``.

    Returns:
        dict: token -> tf * idf, in the iteration order of ``tf``.

    Raises:
        KeyError: If a token in ``tf`` is missing from ``idf``.
    """
    weights = {}
    for term, frequency in tf.items():
        weights[term] = frequency * idf[term]
    return weights

In [None]:
# Normalization
def normalize_tfidf(tfidf):
    """Scale a TF-IDF vector to unit Euclidean (L2) length.

    Args:
        tfidf: dict of token -> weight.

    Returns:
        dict: A new dict with every weight divided by the vector's L2 norm.
        When the norm is zero (empty or all-zero vector) the input dict
        itself is returned unchanged.
    """
    squared_total = sum(weight ** 2 for weight in tfidf.values())
    length = math.sqrt(squared_total)

    if length != 0:
        return {token: weight / length for token, weight in tfidf.items()}
    # Nothing to scale; avoid dividing by zero.
    return tfidf

In [22]:
# Display top words
def display_top_words(tfidf, top_n=10):
    """Return the highest-weighted (token, weight) pairs of a TF-IDF vector.

    Despite the name, nothing is printed; the caller decides how to show it.

    Args:
        tfidf: dict of token -> weight.
        top_n: Maximum number of pairs to return (default 10).

    Returns:
        list[tuple]: Pairs ordered by descending weight; equal weights keep
        the dict's iteration order.
    """
    ranked = list(tfidf.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [23]:
# Encode documents
def encode_documents(documents):
    """Encode every document as an L2-normalized TF-IDF vector.

    IDF is computed once over the whole collection. Each document then gets
    its own term frequencies, weighted by that shared IDF and normalized.

    Args:
        documents: Sized collection of token lists, one per document.

    Returns:
        list[dict]: One token -> weight dict per document, in input order.
    """
    shared_idf = compute_idf(documents)
    return [
        normalize_tfidf(compute_tfidf(compute_tf(tokens), shared_idf))
        for tokens in documents
    ]

In [24]:
# Input PDFs; their order fixes the document numbering in the results below.
pdf_files = [
    "Cooking_Recipe.pdf",
    "Martial_Arts.pdf",
    "Tennis.pdf",
    "NLP.pdf",
]

# Raw text per file, then the cleaned token list per file.
doc_texts = list(map(extract_text_from_pdf, pdf_files))
doc_words = list(map(preprocess_text, doc_texts))

# One normalized TF-IDF vector per document.
tfidf_vectors = encode_documents(doc_words)

In [25]:
# Print the top-weighted terms of each document, numbering documents from 1.
for i, tfidf in enumerate(tfidf_vectors):
    top_words = display_top_words(tfidf)
    print(f"Top words in Document {i+1}:", top_words)

Top words in Document 1: [('pasta', 0.33004599984777094), ('sauce', 0.33004599984777094), ('simple', 0.22003066656518064), ('fresh', 0.22003066656518064), ('basil', 0.22003066656518064), ('cooking', 0.11001533328259032), ('recipe', 0.11001533328259032), ('tomato', 0.11001533328259032), ('classic', 0.11001533328259032), ('easy', 0.11001533328259032)]
Top words in Document 2: [('martial', 0.39314935196840256), ('arts', 0.39314935196840256), ('sports', 0.39314935196840256), ('selfdefense', 0.19657467598420128), ('combatbased', 0.09828733799210064), ('combine', 0.09828733799210064), ('strength', 0.09828733799210064), ('discipline', 0.09828733799210064), ('practiced', 0.09828733799210064), ('centuries', 0.09828733799210064)]
Top words in Document 3: [('tennis', 0.3359616161831774), ('open', 0.3359616161831774), ('sport', 0.2239744107887849), ('ball', 0.2239744107887849), ('court', 0.2239744107887849), ('making', 0.1429599651341907), ('globally', 0.11198720539439246), ('involves', 0.11198720

In [26]:
# Pool the tokens of all four documents into one flat list.
all_words = [token for tokens in doc_words for token in tokens]

# Score the pooled tokens as a one-document corpus.
# NOTE: with a single document every token has N = 1 and df = 1, so its IDF is
# ln(2/2) + 1 = 1 and the ranking below is effectively normalized term frequency.
combined_tf = compute_tf(all_words)
combined_idf = compute_idf([all_words])
combined_tfidf = compute_tfidf(combined_tf, combined_idf)
normalized_combined_tfidf = normalize_tfidf(combined_tfidf)

# Show the 20 highest-weighted terms of the pooled text.
top_combined = display_top_words(normalized_combined_tfidf, top_n=20)
print("Top 20 words across all documents:", top_combined)

Top 20 words across all documents: [('nlp', 0.24663181183349467), ('martial', 0.19730544946679573), ('arts', 0.19730544946679573), ('sports', 0.19730544946679573), ('making', 0.19730544946679573), ('language', 0.19730544946679573), ('pasta', 0.1479790871000968), ('sauce', 0.1479790871000968), ('tennis', 0.1479790871000968), ('open', 0.1479790871000968), ('simple', 0.09865272473339787), ('requires', 0.09865272473339787), ('fresh', 0.09865272473339787), ('basil', 0.09865272473339787), ('physical', 0.09865272473339787), ('selfdefense', 0.09865272473339787), ('techniques', 0.09865272473339787), ('popular', 0.09865272473339787), ('coordination', 0.09865272473339787), ('mental', 0.09865272473339787)]
