In [1]:
import pandas as pd
import numpy as np
# Load the books table; later cells read its 'book_embedding' and 'name' columns.
df = pd.read_csv("books_embeddings.csv")

In [2]:
def parse_embedding(embedding_str):
    """Turn a comma-separated string of numbers into a 1-D NumPy float array.

    Every comma-delimited token is passed through ``float``, so surrounding
    whitespace is tolerated while an empty or non-numeric token raises
    ``ValueError``.
    """
    tokens = embedding_str.split(',')
    values = list(map(float, tokens))
    return np.array(values)

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_closest_records(record, new_df, n=5):
    """Return the names of the ``n`` rows of ``new_df`` closest to ``record``.

    Closeness is the cosine distance (``1 - cosine_similarity``) between the
    parsed ``'book_embedding'`` of ``record`` and that of every row of
    ``new_df``; all rows are scanned.

    Parameters
    ----------
    record : mapping or pandas.Series
        Must provide a ``'book_embedding'`` entry in the string format
        accepted by ``parse_embedding``.
    new_df : pandas.DataFrame
        Candidate rows; the ``'book_embedding'`` and ``'name'`` columns are read.
    n : int, default 5
        Maximum number of names to return.

    Returns
    -------
    list
        Up to ``n`` ``'name'`` values, nearest first; equal distances keep
        their row order. ``record`` is not filtered out, so when it is itself
        a row of ``new_df`` it is returned like any other row.
    """
    record_vector = parse_embedding(record['book_embedding']).reshape(1, -1)

    distances = []
    names = []

    length = new_df.shape[0]
    # Progress is counted by row position, not by the DataFrame index label:
    # labels are not guaranteed to run 0..length-1 (e.g. on a filtered frame),
    # which made the percentage meaningless in that case.
    for position, (_, row) in enumerate(new_df.iterrows(), start=1):
        other_vector = parse_embedding(row['book_embedding']).reshape(1, -1)

        similarity = cosine_similarity(record_vector, other_vector)[0][0]
        distances.append(1 - similarity)
        names.append(row['name'])

        print(f'Progress: {position / length:.2%}', end='\r')

    # sorted() is stable, so ties stay in row order.
    sorted_indices = sorted(range(len(distances)), key=lambda k: distances[k])
    return [names[i] for i in sorted_indices[:n]]

In [4]:
from sklearn.cluster import KMeans

def create_embeddings_list(df, embeddings=None):
    """Parse every row's ``'book_embedding'`` string and collect the vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``'book_embedding'`` column holds strings accepted by
        ``parse_embedding``.
    embeddings : list, optional
        Existing list to append to (it is mutated in place and returned).
        When omitted, a new list is created.

    Returns
    -------
    list
        ``embeddings`` with one 1-D NumPy array appended per row, in row order.
    """
    if embeddings is None:
        embeddings = []

    length = df.shape[0]
    # Progress is counted by row position rather than by index label, which is
    # not guaranteed to run 0..length-1.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        vector = parse_embedding(row['book_embedding']).flatten()
        embeddings.append(vector)

        print(f'Progress: {position / length:.2%}', end='\r')
    return embeddings


def create_cluster(embeddings, n_clusters = 50):
    """Fit KMeans on the given vectors and return ``(labels, fitted_model)``.

    ``embeddings`` is converted with ``np.array`` and its shape is printed
    before fitting. The model uses ``random_state=42``; the returned labels
    are the model's ``labels_`` attribute.
    """
    matrix = np.array(embeddings)
    print("Embeddings matrix shape:", matrix.shape)

    # fit() returns the estimator itself, so construction and fitting chain.
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(matrix)
    return model.labels_, model


In [5]:
# Parse every row's 'book_embedding' into a vector. The resulting module-level
# `embeddings` list is what find_best_number passes to create_cluster.
embeddings = []
embeddings = create_embeddings_list(df, embeddings)

Progress: 100.00%

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def find_closest_records_clusters(record, new_df, cluster_labels, kmeans, n=5):
    """Nearest-name search limited to the cluster predicted for ``record``.

    The record's parsed ``'book_embedding'`` is assigned a cluster with
    ``kmeans.predict``; only rows of ``new_df`` whose entry in
    ``cluster_labels`` equals that cluster are compared, using cosine
    distance (``1 - cosine_similarity``).

    ``cluster_labels`` is used as a boolean row mask on ``new_df``, so it is
    assumed to hold one label per row in row order.

    Returns up to ``n`` ``'name'`` values, nearest first (ties keep row order).
    """
    query = parse_embedding(record['book_embedding']).reshape(1, -1)

    # Keep only the rows that share the query's predicted cluster.
    target_cluster = kmeans.predict(query)[0]
    in_cluster = np.array(cluster_labels) == target_cluster
    candidates = new_df[in_cluster]

    scored = []
    for _, row in candidates.iterrows():
        candidate = parse_embedding(row['book_embedding']).reshape(1, -1)
        gap = 1 - cosine_similarity(query, candidate)[0][0]
        scored.append((gap, row['name']))

    # Stable sort on the distance only, so equal distances keep row order.
    scored.sort(key=lambda pair: pair[0])
    return [name for _, name in scored[:n]]

def find_closest_records_clusters2(record, new_df, cluster_labels, kmeans, n=5, n_clusters_to_search=2):
    """Nearest-name search over the record's cluster and its closest neighbours.

    The record's cluster is predicted with ``kmeans.predict``. All centroids
    are then ranked by cosine similarity to *that cluster's centroid* (not to
    the record vector), and the top ``n_clusters_to_search + 1`` clusters are
    searched. The centroid's similarity to itself is the maximum possible, so
    the record's own cluster is expected to be among them.

    Rows of ``new_df`` whose entry in ``cluster_labels`` is one of the
    selected clusters are compared with the record by cosine distance
    (``1 - cosine_similarity``). ``cluster_labels`` is used as a row mask, so
    it is assumed to hold one label per row in row order.

    Returns up to ``n`` ``'name'`` values, nearest first (ties keep row order).
    """
    query = parse_embedding(record['book_embedding']).reshape(1, -1)
    home_cluster = kmeans.predict(query)[0]

    home_centroid = kmeans.cluster_centers_[home_cluster].reshape(1, -1)
    centroid_similarities = cosine_similarity(home_centroid, kmeans.cluster_centers_)[0]

    # Negating the similarities makes argsort list the most similar centroids first.
    search_clusters = np.argsort(-centroid_similarities)[:n_clusters_to_search + 1]
    candidates = new_df[np.isin(cluster_labels, search_clusters)]

    scored = []
    for _, row in candidates.iterrows():
        candidate = parse_embedding(row['book_embedding']).reshape(1, -1)
        gap = 1 - cosine_similarity(query, candidate)[0][0]
        scored.append((gap, row['name']))

    # Stable sort on the distance only, then keep the n nearest.
    scored.sort(key=lambda pair: pair[0])
    return [name for _, name in scored[:n]]

In [7]:
def find_records(indexes, n):
    """Collect exhaustive-search neighbours for several query rows.

    For each positional row number in ``indexes`` the module-level ``df`` row
    is looked up with ``df.iloc`` and passed to ``find_closest_records``
    against the whole of ``df``. The per-query name lists are concatenated in
    query order into one flat list, which is printed and returned.
    """
    result = []
    for position in indexes:
        result.extend(find_closest_records(df.iloc[position], df, n))
    print(result)
    return result

In [8]:
def write_to_file(result, path='records.txt'):
    """Write each item of ``result`` on its own line of a text file.

    Parameters
    ----------
    result : iterable
        Items to write; each is formatted with ``str`` semantics.
    path : str, default 'records.txt'
        Destination file. It is overwritten if it already exists.

    Prints ``Done`` once the file has been written.
    """
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # non-ASCII characters in the items, which would raise UnicodeEncodeError.
    with open(path, 'w', encoding='utf-8') as fp:
        # An f-string formats any item (including tuples), unlike "%s" % item.
        fp.writelines(f"{item}\n" for item in result)
    print('Done')

In [None]:
def how_many_match(indexes, cluster_labels, kmeans, result, n=20, n_clusters_to_search=5):
    """Score both cluster-restricted searches against reference names.

    For every positional row number in ``indexes`` (looked up in the
    module-level ``df``), the ``n`` nearest names are retrieved with
    ``find_closest_records_clusters`` and with
    ``find_closest_records_clusters2``. A retrieved name counts as a match
    when it occurs anywhere in ``result``.

    Parameters
    ----------
    indexes : sequence of int
        Positional row numbers of the query rows.
    cluster_labels, kmeans
        Labels and fitted model forwarded to the two search functions.
    result : iterable
        Reference names (hashable), e.g. the flat list from ``find_records``.
    n : int, default 20
        Names requested per query; also the per-query denominator.
    n_clusters_to_search : int, default 5
        Forwarded to ``find_closest_records_clusters2``.

    Returns
    -------
    tuple of float
        ``(accuracy, accuracy_grouped)``: matches divided by ``n`` and then by
        the number of queries. ``(0.0, 0.0)`` when ``indexes`` is empty.
    """
    query_count = len(indexes)
    if query_count == 0:
        # Nothing to evaluate; avoids dividing by zero below.
        return 0.0, 0.0

    clusters_result = []
    clusters_grouped_result = []
    for i in indexes:
        clusters_result += find_closest_records_clusters(
            df.iloc[i], df, cluster_labels, kmeans, n)
        clusters_grouped_result += find_closest_records_clusters2(
            df.iloc[i], df, cluster_labels, kmeans, n, n_clusters_to_search)
    print(clusters_result)
    print(clusters_grouped_result)

    # Set membership instead of scanning the reference list for every name.
    # NOTE(review): the reference is pooled over all queries, so a name
    # retrieved for one query also counts if it is a reference neighbour of a
    # different query — confirm that this is the intended metric.
    reference = set(result)
    matches_number = sum(1 for entry in clusters_result if entry in reference)
    matches2_number = sum(1 for entry in clusters_grouped_result if entry in reference)

    accuracy = matches_number / n
    accuracy2 = matches2_number / n
    return accuracy / query_count, accuracy2 / query_count


In [10]:
def find_best_number(numbers, indexes, result):
    """Try several cluster counts and report which one scores best.

    For each value in ``numbers`` a KMeans model is fitted on the module-level
    ``embeddings`` via ``create_cluster`` and scored with ``how_many_match``.

    Parameters
    ----------
    numbers : iterable of int
        Candidate cluster counts.
    indexes : sequence of int
        Query row numbers forwarded to ``how_many_match``.
    result : list
        Reference names forwarded to ``how_many_match``.

    Returns
    -------
    tuple
        ``(best_number1, best_number2, best_accuracy, best_accuracy_grouped,
        accuracies, accuracies2, cluster_labels_best, kmeans_best)``.
        ``best_number1`` / ``best_number2`` are the cluster counts with the
        highest single-cluster / grouped accuracy (first one wins ties), or
        ``-1`` if no run scored above 0. ``cluster_labels_best`` and
        ``kmeans_best`` belong to the ``best_number1`` run and are ``None``
        when there is no such run.
    """
    best_accuracy = best_accuracy_grouped = 0
    best_number1 = -1
    best_number2 = -1
    # Pre-bound so the return below cannot hit an unbound local when `numbers`
    # is empty or no run beats the initial accuracy of 0.
    cluster_labels_best = kmeans_best = None
    accuracies = []
    accuracies2 = []
    for i in numbers:
        cluster_labels, kmeans = create_cluster(embeddings, n_clusters=i)
        accuracy, accuracy_grouped = how_many_match(indexes, cluster_labels, kmeans, result)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_number1 = i
            cluster_labels_best, kmeans_best = cluster_labels, kmeans
        if accuracy_grouped > best_accuracy_grouped:
            best_accuracy_grouped = accuracy_grouped
            best_number2 = i
        accuracies.append(accuracy)
        accuracies2.append(accuracy_grouped)

    return (best_number1, best_number2, best_accuracy, best_accuracy_grouped,
            accuracies, accuracies2, cluster_labels_best, kmeans_best)

In [None]:
# Candidate cluster counts to try, and positional row numbers (used with
# df.iloc) of the query books.
# NOTE(review): a later recorded output prints `numbers` as
# [40, 50, 55, 65, 75], which does not match this list — this cell appears to
# have been edited after that run; confirm which values are intended.
numbers = [50, 55, 65]
indexes = [354, 1234, 4532, 9809, 9816, 3245]

In [12]:
# Exhaustive-search reference: the 20 nearest names for each query row,
# flattened into one list, then saved one name per line.
result = find_records(indexes= indexes, n = 20)
write_to_file(result)

['Operation Redemption: A Vision of Hope in an Age of Turmoil', "From mourning to morning: Healing in America's time of crisis", 'The Journey: How to live by Faith in an Uncertain World', 'Foundations for Christian Education in an Era of Change', 'Catch the Fire: The Toronto Blessing an Experience of Renewal and Revival', 'The Thorn In the Flesh: Hope for All Who Struggle With Impossible Conditions', 'Future Hope: A Jewish Christian Look at the End of the World', 'The Peacemaker: A Biblical Guide to Resolving Personal Conflict', "Revolution in World Missions: One Man's Journey to Change a Generation", "Tribulation to triumph: A mandate for today's church", "Hope in Times of Trouble (Seeking Answers in Life's Struggles)", "Hope When the World Falls Apart: Daniel and Revelation's Message of Hope", 'The Darkness and the Dawn: Empowered by the Tragedy and Triumph of the Cross', 'The Gift: A World Solution to Hunger and Poverty', 'Pueblo Intrigue: A Journey of Faith', 'Beyond Jungle Walls: 

In [13]:
# Fit KMeans once per entry in `numbers` and score each fit against `result`;
# the labels/model of the best single-cluster run are kept for the cells below.
num1, num2, accuracy1, accuracy2, accuracies, accuracies2, cluster_labels_best, kmeans_best = find_best_number(numbers=numbers, indexes=indexes, result = result)

Embeddings matrix shape: (103063, 2771)
['Operation Redemption: A Vision of Hope in an Age of Turmoil', 'The Journey: How to live by Faith in an Uncertain World', 'Catch the Fire: The Toronto Blessing an Experience of Renewal and Revival', 'The Thorn In the Flesh: Hope for All Who Struggle With Impossible Conditions', 'Future Hope: A Jewish Christian Look at the End of the World', 'The Peacemaker: A Biblical Guide to Resolving Personal Conflict', "Tribulation to triumph: A mandate for today's church", "Hope in Times of Trouble (Seeking Answers in Life's Struggles)", "Hope When the World Falls Apart: Daniel and Revelation's Message of Hope", 'The Darkness and the Dawn: Empowered by the Tragedy and Triumph of the Cross', 'Pueblo Intrigue: A Journey of Faith', 'THE STORY OF OLD GLORY: Cornerstones Of Freedom', 'Voices from the Heart: Inspiration for a Compassionate Future', 'The Incredible Patience of God: Along the Road to Spiritual Maturity', "A Beacon in the Darkness : Reflecting God's

In [17]:
# Count how many rows were assigned to each cluster label in the best run.
unique, counts = np.unique(cluster_labels_best, return_counts=True)
cluster_distribution = dict(zip(unique, counts))
print("Number of books in each cluster:", cluster_distribution)

Number of books in each cluster: {np.int32(0): np.int64(1947), np.int32(1): np.int64(4574), np.int32(2): np.int64(2019), np.int32(3): np.int64(3180), np.int32(4): np.int64(1219), np.int32(5): np.int64(6236), np.int32(6): np.int64(2471), np.int32(7): np.int64(3249), np.int32(8): np.int64(1584), np.int32(9): np.int64(1100), np.int32(10): np.int64(2237), np.int32(11): np.int64(2606), np.int32(12): np.int64(2602), np.int32(13): np.int64(2215), np.int32(14): np.int64(1056), np.int32(15): np.int64(2526), np.int32(16): np.int64(1839), np.int32(17): np.int64(5048), np.int32(18): np.int64(2337), np.int32(19): np.int64(3688), np.int32(20): np.int64(3187), np.int32(21): np.int64(3416), np.int32(22): np.int64(2384), np.int32(23): np.int64(999), np.int32(24): np.int64(6374), np.int32(25): np.int64(1897), np.int32(26): np.int64(2470), np.int32(27): np.int64(2363), np.int32(28): np.int64(1078), np.int32(29): np.int64(2155), np.int32(30): np.int64(2413), np.int32(31): np.int64(2430), np.int32(32): np.

In [21]:
# Summary: best cluster count per search variant, the matching accuracies,
# the candidate counts, and the per-candidate scores of both variants.
print(num1,num2)
print(accuracy1, accuracy2)
print(numbers)
print(accuracies)
print(accuracies2)

40 40
0.7916666666666666 0.9
[40, 50, 55, 65, 75]
[0.7916666666666666, 0.75, 0.7583333333333333, 0.75, 0.7000000000000001]
[0.9, 0.8583333333333334, 0.8166666666666668, 0.7916666666666666, 0.7833333333333333]


In [24]:
# Persist the chosen labels (single 'Cluster' column, no index column) and the
# fitted KMeans model so they can be reloaded without refitting.
import joblib
cluster_labels_df = pd.DataFrame(cluster_labels_best, columns=['Cluster'])
cluster_labels_df.to_csv('cluster_labels_best.csv', index=False)
joblib.dump(kmeans_best, 'kmeansbest_model.joblib')


['kmeansbest_model.joblib']

In [29]:
# Spot check: the 10 nearest names for row 9809, searching only its own cluster.
find_closest_records_clusters(df.iloc[9809], df, cluster_labels_best, kmeans_best, 10)

['Writing Was Everything',
 "The Style's the Man: Reflections on Proust, Fitzgerald, Wharton, Vidal, and Others",
 'Jafsie and John Henry: Essays',
 'Walt Whitman: The Song of Himself',
 'Penchants and Places: Essays and Criticism',
 "Why Kerouac Matters: The Lessons of On the Road (They're Not What You Think)",
 'An American Procession',
 'The Best American Spiritual Writing 2006 (The Best American Series)',
 'Walk On The Wild Side',
 'The Accidental Masterpiece: On the Art of Life and Vice Versa']

In [30]:
# Re-score the kept best labels/model against the exhaustive-search reference.
accuracy, accuracy_grouped = how_many_match(indexes, cluster_labels_best, kmeans_best, result)

['Operation Redemption: A Vision of Hope in an Age of Turmoil', 'The Journey: How to live by Faith in an Uncertain World', 'Catch the Fire: The Toronto Blessing an Experience of Renewal and Revival', 'The Thorn In the Flesh: Hope for All Who Struggle With Impossible Conditions', 'Future Hope: A Jewish Christian Look at the End of the World', 'The Peacemaker: A Biblical Guide to Resolving Personal Conflict', "Tribulation to triumph: A mandate for today's church", "Hope in Times of Trouble (Seeking Answers in Life's Struggles)", "Hope When the World Falls Apart: Daniel and Revelation's Message of Hope", 'The Darkness and the Dawn: Empowered by the Tragedy and Triumph of the Cross', 'Pueblo Intrigue: A Journey of Faith', 'THE STORY OF OLD GLORY: Cornerstones Of Freedom', 'Voices from the Heart: Inspiration for a Compassionate Future', 'The Incredible Patience of God: Along the Road to Spiritual Maturity', "A Beacon in the Darkness : Reflecting God's Light in Today's World", "Armed and Dan

In [34]:
# Single-cluster accuracy first, then multi-cluster ("grouped") accuracy.
print(accuracy, accuracy_grouped)

0.7916666666666666 0.9
