### TF-IDF model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

# Directory holding the corpus, one plain-text document per file.
folder_path = "../data/txt"

# Number of leading lines in every file that hold the disclaimer and are dropped.
DISCLAIMER_LINES = 24
# Lines that start with this prefix (after stripping whitespace) are left out of the text.
SKIPPED_LINE_PREFIX = "#+"

# Keep regular, non-hidden files only: os.listdir() also returns sub-directories
# (e.g. .ipynb_checkpoints) and hidden files (e.g. .DS_Store), which would either
# make open()/decoding fail or add a junk "document" to the corpus.
filenames = sorted(
    name
    for name in os.listdir(folder_path)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(folder_path, name))
)

# Load documents and line counts; position i in both lists belongs to filenames[i].
documents = []
line_counts = []
for filename in filenames:
    with open(os.path.join(folder_path, filename), encoding="utf-8") as f:
        lines = f.readlines()[DISCLAIMER_LINES:]  # skip disclaimer
    # The file is closed at this point; the remaining work is pure string processing.
    stripped_lines = (line.strip() for line in lines)
    text_lines = [
        line for line in stripped_lines if not line.startswith(SKIPPED_LINE_PREFIX)
    ]
    documents.append(" ".join(text_lines))
    # Counted after filtering, so prefixed lines do not contribute to the line count.
    line_counts.append(len(text_lines))

# TF-IDF vectorization: lowercase the text and cap the vocabulary at 100000 features.
vectorizer = TfidfVectorizer(lowercase=True, max_features=100000)
tfidf_matrix = vectorizer.fit_transform(documents)

In [7]:
# Display the matrix repr (shape, dtype, stored-element count) as the cell output.
tfidf_matrix

<6888x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 74589726 stored elements in Compressed Sparse Row format>

In [26]:
# Look at a sample of the features the vectorizer learned
vocab = vectorizer.vocabulary_

# Report how many entries the vocabulary holds
vocab_size = len(vocab)
print(f"Actual vocabulary size: {vocab_size}")

# Print the first 20 terms, following the dictionary's own iteration order
for position, term in enumerate(vocab):
    if position >= 20:
        break
    print(term)

Actual vocabulary size: 100000
van
de
generale
resolutien
des
casteels
batavia
genomen
in
rade
india
primo
januarij
tot
den
26
februarij
1714
over
diverse


### Computing cosine similarity

In [4]:
# Cosine similarity
# Called with a single argument, so the rows of tfidf_matrix are compared with
# each other; later cells read the result as cosine_sim_matrix[i, j] for
# documents i and j.
# NOTE(review): the result is presumably a dense n_docs x n_docs array -- confirm
# that this fits in memory for the full corpus.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

### Checking for (near) duplicates

In [11]:
# Check for near-duplicates: report every document pair (i < j) whose cosine
# similarity exceeds `threshold` and whose line counts differ by less than
# `max_line_diff`.
threshold = 0.99
max_line_diff = 150  # pairs whose line counts differ by this much or more are not reported

# NOTE(review): assumes cosine_sim_matrix is a dense NumPy array (the default
# return type of cosine_similarity) -- confirm if dense_output is ever changed.
n_docs = len(filenames)
for i in range(n_docs):
    # Only look to the right of the diagonal so each unordered pair is visited
    # once and a document is never compared with itself.
    row = cosine_sim_matrix[i, i + 1:n_docs]
    # Let NumPy pick out the candidates in the row instead of testing roughly
    # n^2 / 2 entries one at a time in Python. Offsets come back in ascending
    # order, so pairs are reported in the same order as a plain nested loop.
    for offset in (row > threshold).nonzero()[0].tolist():
        j = i + 1 + offset
        similarity = row[offset]
        # Check line count similarity
        line_diff = abs(line_counts[i] - line_counts[j])
        if line_diff < max_line_diff:
            print(f"Document {filenames[i]} and Document {filenames[j]} are similar with cosine similarity = {similarity:.4f} and line diff = {line_diff}")

Document 14571 and Document 19373 are similar with cosine similarity = 0.9989 and line diff = 145
Document 14827 and Document 20132 are similar with cosine similarity = 0.9998 and line diff = 53
Document 15192 and Document 20802 are similar with cosine similarity = 0.9927 and line diff = 127
Document 15218 and Document 17965 are similar with cosine similarity = 0.9957 and line diff = 34
Document 15249 and Document 17854 are similar with cosine similarity = 0.9942 and line diff = 104
Document 15765 and Document 18697 are similar with cosine similarity = 0.9938 and line diff = 55
Document 15880 and Document 18128 are similar with cosine similarity = 0.9982 and line diff = 85
Document 15927 and Document 18234 are similar with cosine similarity = 0.9928 and line diff = 70
Document 16808 and Document 20668 are similar with cosine similarity = 0.9967 and line diff = 84
Document 16966 and Document 19545 are similar with cosine similarity = 0.9998 and line diff = 42
Document 17838 and Document