In [37]:
import os
import numpy as np
import math

## Processing Data

In [24]:
## help function that loads book text files into a dictionary
def load_books(folder):
    books = {}
    for filename in os.listdir(folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                books[filename[:-4]] = file.read()
    return books


# Load books from the folder
books = load_books("books")

## For test: books["The Awakening, and Selected Short Stories by Kate Chopin (192)"]

## Building vector

In [None]:
# Convert the book contents into a list for vectorization
contents = list(books.values())

In [23]:
## helper functions to compute TF-IDF
def compute_tf(text):
    tf_text = {}
    words = text.split()
    total_words = len(words)
    for word in words:
        tf_text[word] = tf_text.get(word, 0) + 1
    for word in tf_text:
        tf_text[word] = tf_text[word] / total_words
    return tf_text

def compute_idf(documents):
    idf_dict = {}
    total_documents = len(documents)
    for document in documents:
        for word in set(document.split()):
            idf_dict[word] = idf_dict.get(word, 0) + 1
    for word in idf_dict:
        idf_dict[word] = math.log(total_documents / idf_dict[word])
    return idf_dict

def compute_tfidf(documents):
    tfidf_documents = []
    idf = compute_idf(documents)
    
    for document in documents:
        tf = compute_tf(document)
        tfidf = {}
        for word, tf_value in tf.items():
            tfidf[word] = tf_value * idf.get(word, 0) 
        tfidf_documents.append(tfidf)
    return tfidf_documents

In [31]:
## this is only for testing and checking
tfidf_documents = compute_tfidf(contents)
for doc_index, tfidf_scores in enumerate(tfidf_documents):
    print(f"Document {doc_index + 1}:")
    sorted_tfidf = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_tfidf[:10]:  # print top 10 terms
        print(f"{word}: {score:.4f}")
    print("\n")

Document 1:
Pierre: 0.0059
Natásha: 0.0056
Rostóv: 0.0037
Prince: 0.0029
Kutúzov: 0.0027
Moscow: 0.0023
Andrew: 0.0022
Princess: 0.0021
Sónya: 0.0018
Denísov: 0.0017


Document 2:
[Footnote: 0.0046
Federal: 0.0034
Americans: 0.0021
Union: 0.0019
democratic: 0.0016
Constitution: 0.0013
America: 0.0012
American: 0.0012
State: 0.0012
Government: 0.0012


Document 3:
Clerval: 0.0019
Justine: 0.0016
Felix: 0.0013
Justine,: 0.0012
Elizabeth: 0.0009
Safie: 0.0008
Clerval,: 0.0007
I: 0.0007
Victor,: 0.0007
Frankenstein,: 0.0006


Document 4:
LORD: 0.0127
unto: 0.0082
LORD,: 0.0078
thy: 0.0036
thou: 0.0036
ye: 0.0036
LORD.: 0.0029
hath: 0.0025
saith: 0.0025
Israel: 0.0024


Document 5:
Nora.: 0.0634
Helmer.: 0.0378
Linde.: 0.0275
Krogstad.: 0.0195
Nora: 0.0159
Rank.: 0.0148
Torvald: 0.0073
Helmer: 0.0072
Nora,: 0.0064
Mrs.: 0.0058


Document 6:
Wayne: 0.0246
Fort: 0.0229
Wayne,: 0.0089
Miami: 0.0068
Wayne.: 0.0062
Wells: 0.0060
Harrison: 0.0050
Indians: 0.0049
Detroit: 0.0044
Historical: 0.0044

## Computing Similarity

In [33]:
## Compute the cosine similarity between two TF-IDF dictionaries
def cosine_similarity(v1, v2):
    intersection = set(v1.keys()) & set(v2.keys())
    numerator = sum([v1[x] * v2[x] for x in intersection])
    
    sum1 = sum([v1[x]**2 for x in v1.keys()])
    sum2 = sum([v2[x]**2 for x in v2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [34]:
def recommend_books(user_input, books):
    documents = list(books.values()) + [user_input]
    tfidf_documents = compute_tfidf(documents)
    input_tfidf = tfidf_documents[-1]
    book_tfidfs = tfidf_documents[:-1]
    
    similarities = []
    for index, book_tfidf in enumerate(book_tfidfs):
        sim = cosine_similarity(input_tfidf, book_tfidf)
        similarities.append((list(books.keys())[index], sim))
    
    # Sort by similarity score
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    return similarities[:3]  # return top 3 similar books

## Running the Recommendation System

In [36]:
print("Enter your book preference or a description of what you're interested in:")
user_preference = input()

recommendations = recommend_books(user_preference, books)

# Show recommendation results
print(f"\nRecommended books based on what you input: {user_preference}")
for book, score in recommendations:
    print(f"{book}: {score:.2f}")

Enter your book preference or a description of what you're interested in:

Recommended books based on what you input: I like European History
Beyond Good and Evil by Friedrich Wilhelm Nietzsche (120): 0.03
Democracy in America — Volume 1 by Alexis de Tocqueville (147): 0.02
On Liberty by John Stuart Mill (116): 0.01
