### TF-IDF model

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

folder_path = "../data/txt"

# Number of leading lines dropped from every file (the disclaimer header)
DISCLAIMER_LINES = 24
# Prefix of the marker lines that carry a page number
PAGE_MARKER_PREFIX = "#+ NL-HaNA_"

# Keep regular files only: os.listdir also returns sub-directories, and
# open() would raise on those.
filenames = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, name))
)

# Load documents, line counts and highest page numbers.
# The three lists are parallel to `filenames` (same index = same file):
#   documents   - the text lines of a file joined with single spaces
#   line_counts - number of text lines (marker lines excluded)
#   page_count  - highest page number found in the file's page markers (0 if none)
documents = []
line_counts = []
page_count = []
for filename in filenames:
    with open(os.path.join(folder_path, filename), encoding="utf-8") as f:
        lines = f.readlines()[DISCLAIMER_LINES:]  # skip disclaimer

    text_lines = []
    highest_page_number = 0
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith("#+"):
            # Marker lines never become document text. The page check runs on
            # the same stripped line, so an indented page marker is handled
            # the same way for the text filter and the page scan.
            if line.startswith(PAGE_MARKER_PREFIX):
                # assumes the page number is the first four characters of the
                # fourth "_"-separated field - TODO confirm against the data
                page_number = int(line.split("_")[3][:4])
                highest_page_number = max(highest_page_number, page_number)
            continue
        text_lines.append(line)

    documents.append(" ".join(text_lines))
    line_counts.append(len(text_lines))
    page_count.append(highest_page_number)

# Fit a TF-IDF model on the loaded documents and build the document-term
# matrix (one row per document). max_df=0.9 drops terms that occur in more
# than 90% of the documents; the vocabulary size is left uncapped.
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=None,
    max_df=0.9,
)
tfidf_matrix = vectorizer.fit_transform(documents)

In [16]:
# Show the sparse matrix summary: shape (documents x terms), dtype and number of stored elements
tfidf_matrix

<6888x10900007 sparse matrix of type '<class 'numpy.float64'>'
	with 107660400 stored elements in Compressed Sparse Row format>

In [33]:
# Inspecting some vocabulary features
vocab = vectorizer.vocabulary_

# Check how many words were used
print(f"Actual vocabulary size: {len(vocab)}")

# Print the last few terms in the dictionary's iteration order.
# Slicing relative to the end keeps this cell working whatever the vocabulary
# size turns out to be; fixed positions would silently print nothing as soon
# as the corpus (and therefore the vocabulary size) changes.
N_TERMS_SHOWN = 7
for term in list(vocab)[-N_TERMS_SHOWN:]:
    print(term)

Actual vocabulary size: 10900007
fanaradelste
amtrenk
wangedagte
belatelijx
waarnament
wesendlyjkste
waarsieneje


### Computing cosine similarity

In [18]:
# Pairwise cosine similarity between all rows (documents) of the TF-IDF matrix.
# Passing only one matrix compares it against itself, so entry [i, j] is the
# similarity between document i and document j.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

In [19]:
# Sanity check: the similarity matrix should be square, with one row and one column per document
cosine_sim_matrix.shape

(6888, 6888)

### Checking for (near) duplicates

In [23]:
# Initialising list for the similarly grouped documents
# Each entry is a (filename_i, filename_j) tuple for one near-duplicate pair.
similar_docs = []

# Check for near-duplicates: a pair is reported when its cosine similarity is
# strictly greater than this threshold.
threshold = 0.95

# Guard against stale kernel state: the matrix must have been computed from the
# current `filenames`, otherwise indices would point at the wrong documents.
n_docs = len(filenames)
assert cosine_sim_matrix.shape == (n_docs, n_docs), (
    "cosine_sim_matrix does not match filenames; re-run the cells above"
)

for i in range(n_docs):
    # Only look at columns j > i, so a document is never compared with itself
    # and every pair is reported once. The comparison is done for the whole row
    # at once instead of indexing millions of elements one by one.
    # NOTE: assumes cosine_sim_matrix is a dense array (the default output of
    # cosine_similarity as called above).
    row = cosine_sim_matrix[i, i + 1:]
    for offset in (row > threshold).nonzero()[0]:
        j = i + 1 + int(offset)  # translate the position in the row slice back to a document index
        similarity = row[offset]

        # Differences in size between the two documents, shown as extra context
        line_diff = abs(line_counts[i] - line_counts[j])
        page_diff = abs(page_count[i] - page_count[j])

        # Saving tuple of the documents in list
        similar_docs.append((filenames[i], filenames[j]))
        print(f"Document {filenames[i]} and Document {filenames[j]} are similar with cosine similarity = {similarity:.4f}, page difference {page_diff}, and line difference = {line_diff}")

Document 14571 and Document 19373 are similar with cosine similarity = 0.9727, page difference 0, and line difference = 145
Document 14827 and Document 20132 are similar with cosine similarity = 0.9963, page difference 2, and line difference = 53
Document 14934 and Document 16606 are similar with cosine similarity = 0.9556, page difference 40, and line difference = 1563
Document 15055 and Document 15982 are similar with cosine similarity = 0.9670, page difference 146, and line difference = 726
Document 15105 and Document 21335 are similar with cosine similarity = 0.9771, page difference 424, and line difference = 2155
Document 15217 and Document 16822 are similar with cosine similarity = 0.9567, page difference 90, and line difference = 3962
Document 15248 and Document 15548 are similar with cosine similarity = 0.9522, page difference 90, and line difference = 1796
Document 15378 and Document 19109 are similar with cosine similarity = 0.9506, page difference 18, and line difference = 1

### Copying each pair of documents classified as similar into its own folder

In [32]:
import shutil

# Root folder that receives one sub-folder ("group_<n>") per similar pair
grouped_dir = "../data/TF-IDF grouped similar documents"

# Copy both documents of every pair found above into that pair's own folder.
# The source directory is taken from `folder_path` so it always matches the
# directory the documents were loaded from.
for i, docs in enumerate(similar_docs):
    group_dir = os.path.join(grouped_dir, f"group_{i}")
    os.makedirs(group_dir, exist_ok=True)  # safe to re-run: existing folders are reused

    for doc in docs:
        shutil.copy(os.path.join(folder_path, doc), os.path.join(group_dir, doc))