# Evaluating Recommender Systems for Digital Library Datasets

## Content Based Algorithms
## Comparison/Evaluation based on Metrics/Properties

In [44]:
import pandas as pd
from IPython.display import display, HTML

In [46]:
# Catalogue of the techniques compared in this notebook, keyed by table heading.
f_ext = {
    "Feature Extraction Methods": [
        "TF-IDF", "LSA", "Word2Vec", "Doc2Vec", "BERT",
        "BoW", "BM25", "LDA", "FastText", "GloVe",
    ]
}
sim_m = {
    "Similarity and Distance Measures": [
        "Cosine Similarity", "Euclidean Distance", "Jaccard Similarity",
        "Manhattan Distance", "Pearson Correlation", "Bray-Curtis Distance",
        "Canberra Distance", "Minkowski Distance", "Mahalanobis Distance",
        "Wasserstein Distance",
    ]
}

# One table per catalogue, each with a 1-based "No." counter as first column.
d_ext = pd.DataFrame(f_ext)
d_ext.insert(0, "No.", range(1, len(d_ext) + 1))
d_sim = pd.DataFrame(sim_m)
d_sim.insert(0, "No.", range(1, len(d_sim) + 1))

# Render both tables side by side inside a flex container.
ext_table_html = d_ext.to_html(index=False)
sim_table_html = d_sim.to_html(index=False)
display(HTML(f"""
<div style="display: flex; justify-content: space-around;">
    <div>{ext_table_html}</div>
    <div>{sim_table_html}</div>
</div>
"""))


No.,Feature Extraction Methods
1,TF-IDF
2,LSA
3,Word2Vec
4,Doc2Vec
5,BERT
6,BoW
7,BM25
8,LDA
9,FastText
10,GloVe

No.,Similarity and Distance Measures
1,Cosine Similarity
2,Euclidean Distance
3,Jaccard Similarity
4,Manhattan Distance
5,Pearson Correlation
6,Bray-Curtis Distance
7,Canberra Distance
8,Minkowski Distance
9,Mahalanobis Distance
10,Wasserstein Distance


### Evaluation Metrics/Properties:
- Prediction Accuracy
    - Ratings Prediction Accuracy ? (ratings)
    - **Usage Prediction (feedback)**
    - **Ranking Measures**
- **Coverage**
- **Confidence**
- Trust
- **Novelty**
- Serendipity
- **Diversity**
- Utility
- Risk
- Robustness
- Privacy
- Adaptability
- Scalability


### Variations of algorithms

## TF-IDF and Cosine Similarity

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [35]:
# Read the semicolon-delimited book catalogue, then preview its first ten rows.
df = pd.read_csv('Books/books.csv', sep=';')
df_head = df.head(10)
df_head

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group
7,0671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,Random House
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner


In [11]:

# Sample dataset: six movies described by four parallel lists (title,
# description, genres, actors); position i in every list refers to the same movie.
data = {
    'title': [
        'The Matrix', 
        'The Matrix Reloaded', 
        'The Matrix Revolutions', 
        'Inception', 
        'Interstellar', 
        'The Prestige'
    ],
    'description': [
        'A computer hacker learns about the true nature of reality and his role in the war against its controllers.',
        'Neo and his allies race against time before the machines come to destroy Zion.',
        'The human city of Zion defends itself against the massive invasion of the machines.',
        'A thief who steals corporate secrets through dream-sharing technology is given an inverse task.',
        'A team of explorers travels through a wormhole in space in an attempt to save humanity.',
        'Two stage magicians engage in a battle to create the ultimate illusion.'
    ],
    'genres': [
        'Action, Sci-Fi', 
        'Action, Sci-Fi', 
        'Action, Sci-Fi', 
        'Sci-Fi, Thriller', 
        'Sci-Fi, Drama', 
        'Drama, Mystery'
    ],
    'actors': [
        'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss',
        'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss',
        'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss',
        'Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen Page',
        'Matthew McConaughey, Anne Hathaway, Jessica Chastain',
        'Hugh Jackman, Christian Bale, Scarlett Johansson'
    ]
}

# NOTE: this rebinds `df`; the books frame loaded from 'Books/books.csv' in the
# earlier cell is no longer reachable under that name after this line runs.
df = pd.DataFrame(data)

# Combine Features
def combine_features(row):
    """Join a row's description, genres and actors into one space-separated string.

    ``row`` is any object indexable by the keys 'description', 'genres' and
    'actors' (in this notebook: a DataFrame row supplied by ``df.apply``).
    """
    parts = (row['description'], row['genres'], row['actors'])
    return "{} {} {}".format(*parts)

# Build one text document per movie by applying combine_features row by row.
df['combined_features'] = df.apply(combine_features, axis=1)

# Compute TF-IDF matrix (one row per movie; English stop words are removed)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Calculate cosine similarity matrix: entry [i, j] compares movie i with movie j,
# in the row order of `df`.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


def get_recommendations(title, cosine_sim=cosine_sim, top_n=3):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``; the first match is used.
    cosine_sim : array-like, optional
        Square pairwise-similarity matrix whose rows and columns follow the
        row order of ``df``. Defaults to the module-level ``cosine_sim`` as it
        existed when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 3).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles from ``df['title']``, most similar first.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df['title']``.
    """
    # The similarity matrix is positional, so resolve the title to a row
    # position instead of an index label.
    matches = [pos for pos, value in enumerate(df['title']) if value == title]
    if not matches:
        raise IndexError(f"Title {title!r} not found in df['title'].")
    idx = matches[0]

    # Rank every *other* movie. The queried movie is removed by position rather
    # than by assuming it sorts first: duplicate or identical entries tie with
    # it at the top score and could otherwise push it into the results.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in sim_scores[:top_n]]
    return df['title'].iloc[movie_indices]


# Demo: query the recommender for one title and print the returned Series.
movie_title = "The Matrix"
recommendations = get_recommendations(movie_title)
print(f"Recommendations for '{movie_title}':", recommendations, sep="\n")


Recommendations for 'The Matrix':
2    The Matrix Revolutions
1       The Matrix Reloaded
4              Interstellar
Name: title, dtype: object


## LSA and Cosine Similarity

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Sample book metadata: five placeholder books with an id, a title and a
# one-sentence description; position i in every list refers to the same book.
# NOTE: `data` is rebound here (an earlier cell used the same name for the
# movie sample).
data = {
    'BookID': [1, 2, 3, 4, 5],
    'Title': ['Book A', 'Book B', 'Book C', 'Book D', 'Book E'],
    'Description': [
        'A tale of adventure and mystery in a fantastical world.',
        'A romantic drama set in the heart of the city.',
        'A science fiction novel exploring space and time.',
        'A thrilling mystery with unexpected twists.',
        'A heartwarming story about love and friendship.'
    ]
}


books_df = pd.DataFrame(data)

# Preprocess the text: TF-IDF over the descriptions with English stop words
# removed and `max_features=5000`.
# NOTE: this rebinds `tfidf_matrix`, which the TF-IDF cell above also assigns.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(books_df['Description'])

# Apply LSA (Latent Semantic Analysis): project the TF-IDF rows onto 2 SVD
# components; random_state is fixed so reruns give the same projection.
# NOTE(review): with only 2 components the recorded scores below are 1.00 or
# ~0.00 - presumably too few dimensions to separate the books; confirm.
lsa = TruncatedSVD(n_components=2, random_state=42)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

# Compute cosine similarity between books (rows follow the order of books_df)
similarity_matrix = cosine_similarity(lsa_matrix)

# recommend books based on a given book title
def recommend_books(title, top_n=3):
    """Return up to ``top_n`` books most similar to ``title`` with their scores.

    The title is looked up in ``books_df['Title']`` (first match wins) and the
    remaining books are ranked by the matching row of ``similarity_matrix``.

    Parameters
    ----------
    title : str
        Exact book title to search for.
    top_n : int, optional
        Maximum number of recommendations to return (default 3).

    Returns
    -------
    list of (str, float) or str
        ``(title, similarity)`` pairs, most similar first. When ``title`` is
        unknown, a human-readable message string is returned instead; that
        behaviour is kept so existing callers see no change.
    """
    titles = books_df['Title'].tolist()
    if title not in titles:
        return f"Book '{title}' not found in the database."

    # Row position, not index label: similarity_matrix is addressed positionally.
    book_index = titles.index(title)

    # Drop the queried book by position instead of assuming it sorts first.
    # Other books can tie with it at the top score (the low-rank LSA space
    # produces exact 1.00 similarities), which would otherwise make the
    # function recommend the queried book itself.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[book_index])
        if pos != book_index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [(titles[pos], score) for pos, score in candidates[:top_n]]

# Test
book_title = 'Book A'
recommended_books = recommend_books(book_title, top_n=3)

# Header line first, then one bullet per (title, score) pair.
print(f"Books similar to '{book_title}':")
for rec_title, rec_score in recommended_books:
    print(f"- {rec_title} (similarity score: {rec_score:.2f})")


Books similar to 'Book A':
- Book D (similarity score: 1.00)
- Book C (similarity score: 0.00)
- Book E (similarity score: -0.00)


## Word2Vec

In [2]:
from gensim.models import Word2Vec
import numpy as np

def train_word2vec(data):
    """Train a Word2Vec model on the whitespace-tokenised ``data['description']`` texts.

    The model is built with 100-dimensional vectors, a context window of 5,
    ``min_count=1`` and 4 worker threads, and is returned to the caller.
    """
    tokenised = []
    for text in data['description']:
        tokenised.append(text.split())

    settings = dict(vector_size=100, window=5, min_count=1, workers=4)
    return Word2Vec(tokenised, **settings)

def recommend_word2vec(book_title, data, model, top_n=5):
    """Recommend books whose averaged Word2Vec description vector is closest to a given book.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    model
        Trained model exposing ``model.wv`` (word -> vector, supports ``in``)
        and ``model.vector_size``.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    def vectorize(text):
        # Mean of the known word vectors; all-zero vector when no word is known.
        vectors = [model.wv[word] for word in text.split() if word in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    # Work with row positions throughout so a non-default DataFrame index
    # (for example after filtering) cannot mis-address rows.
    titles = data['title'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    query_pos = matches[0]

    # Vectorise every description once and score them in a single call.
    doc_vectors = np.array([vectorize(desc) for desc in data['description']])
    scores = cosine_similarity(doc_vectors[query_pos:query_pos + 1], doc_vectors)[0]

    # Exclude the queried row by position; relying on it ranking first breaks
    # when another description is identical and ties with it.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != query_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]


## Doc2Vec

In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_doc2vec(data):
    """Train a Doc2Vec model on the whitespace-tokenised ``data['description']`` texts.

    Each description becomes a ``TaggedDocument`` tagged with its position in
    the column. The model is built with 100-dimensional vectors, a window of 5,
    ``min_count=1`` and 4 worker threads, and is returned to the caller.
    """
    tagged = []
    for position, text in enumerate(data['description']):
        tagged.append(TaggedDocument(text.split(), [position]))

    settings = dict(vector_size=100, window=5, min_count=1, workers=4)
    return Doc2Vec(tagged, **settings)

def recommend_doc2vec(book_title, data, model, top_n=5):
    """Recommend books whose inferred Doc2Vec vector is closest to a given book.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    model
        Trained model exposing ``infer_vector(list_of_tokens)``.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    # Work with row positions so a non-default DataFrame index cannot
    # mis-address rows.
    titles = data['title'].tolist()
    descriptions = data['description'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    query_pos = matches[0]

    # Infer each description exactly once and reuse the queried row's vector
    # as the query, then score everything in a single call.
    doc_vectors = [model.infer_vector(desc.split()) for desc in descriptions]
    scores = cosine_similarity([doc_vectors[query_pos]], doc_vectors)[0]

    # Exclude the queried row by position instead of assuming it ranks first.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != query_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]


## BERT

In [35]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.20.3-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Downloading sentence_transformers-3.3

In [37]:
from sentence_transformers import SentenceTransformer

def recommend_bert(book_title, data, model=None, top_n=5):
    """Recommend books whose sentence embedding is closest to a given book.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    model : optional
        Object exposing ``encode(list_of_str)`` that returns one embedding per
        text. When omitted, ``SentenceTransformer('all-MiniLM-L6-v2')`` is
        constructed on every call; pass a pre-built model to avoid that.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    # Validate the title before constructing the model or encoding anything.
    titles = data['title'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    query_pos = matches[0]

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The queried description is already part of the batch, so its embedding
    # is reused instead of being encoded a second time.
    embeddings = model.encode(data['description'].tolist())
    similarities = cosine_similarity([embeddings[query_pos]], embeddings)[0]

    # Exclude the queried row by position instead of assuming it ranks first.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != query_pos),
        key=lambda pos: similarities[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]


## BoW

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

def recommend_bow(book_title, data, top_n=5):
    """Recommend books whose bag-of-words description is closest to a given book.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    # Work with row positions so a non-default DataFrame index cannot
    # mis-address rows of the count matrix.
    titles = data['title'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    idx = matches[0]

    count_vectorizer = CountVectorizer(stop_words='english')
    bow_matrix = count_vectorizer.fit_transform(data['description'])

    # Only the queried row is needed, so compare that single row against the
    # corpus instead of materialising the full n x n similarity matrix.
    scores = cosine_similarity(bow_matrix[idx:idx + 1], bow_matrix)[0]

    # Exclude the queried row by position instead of assuming it ranks first.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]


## BM25

In [31]:
pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [33]:
from rank_bm25 import BM25Okapi

def recommend_bm25(book_title, data, top_n=5):
    """Recommend books ranked by the BM25 score of their description against a given book.

    The queried book's whitespace-tokenised description is used as the BM25
    query against all descriptions in ``data``.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` titles, highest score first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    # Work with row positions so a non-default DataFrame index cannot
    # mis-address rows.
    titles = data['title'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    query_pos = matches[0]

    tokenized_corpus = [desc.split() for desc in data['description']]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(tokenized_corpus[query_pos])

    # BM25 is a relevance score, not a similarity with a guaranteed maximum at
    # the query document, so the queried row is removed by position rather
    # than by dropping whatever happens to rank first.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != query_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]


## LDA

In [50]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def lda_recommend(data, book_title, num_topics=10, top_n=5, random_state=None):
    """Recommend books whose LDA topic distribution is closest to a given book.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    num_topics : int, optional
        Number of LDA topics to fit (default 10).
    top_n : int, optional
        Maximum number of titles to return (default 5).
    random_state : optional
        Forwarded to ``LdaModel``; pass a fixed value for repeatable results.
        The default ``None`` leaves seeding to the library.

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    # Work with row positions so a non-default DataFrame index cannot
    # mis-address rows of the topic matrix.
    titles = data['title'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    book_idx = matches[0]

    # Preprocessing: tokenize descriptions on whitespace
    tokenized_descriptions = [desc.split() for desc in data['description']]

    # Create a dictionary and bag-of-words corpus for LDA
    dictionary = Dictionary(tokenized_descriptions)
    corpus = [dictionary.doc2bow(text) for text in tokenized_descriptions]

    # Train the LDA model
    lda = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=random_state,
    )

    # Fill a dense (documents x topics) matrix by topic id. The sparse
    # per-document lists may omit very-low-probability topics, so stacking
    # them directly can produce rows of different length.
    topic_vectors = np.zeros((len(corpus), num_topics))
    for row, bow in enumerate(corpus):
        for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0):
            topic_vectors[row, topic_id] = prob

    # Compute cosine similarity of the queried row against every row
    similarities = cosine_similarity(topic_vectors[book_idx:book_idx + 1], topic_vectors)[0]

    # Exclude the queried row by position instead of assuming it ranks first.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != book_idx),
        key=lambda pos: similarities[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]

# Example usage
import pandas as pd

# Five toy books used by the example calls; `data` and `book_title` are also
# read by the FastText and GloVe example lines further down.
sample_titles = ['Book1', 'Book2', 'Book3', 'Book4', 'Book5']
sample_descriptions = [
    'A story about friendship and adventure.',
    'A thrilling mystery novel with unexpected twists.',
    'A guide to understanding the basics of quantum physics.',
    'A romantic tale set in the beautiful Italian countryside.',
    'An action-packed fantasy with dragons and knights.',
]
data = pd.DataFrame({'title': sample_titles, 'description': sample_descriptions})

book_title = 'Book1'
lda_titles = lda_recommend(data, book_title)
print("LDA Recommendations:", lda_titles)


LDA Recommendations: ['Book2', 'Book5', 'Book4', 'Book3']


## FastText

In [53]:
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def fasttext_recommend(data, book_title, top_n=5):
    """Recommend books whose averaged FastText description vector is closest to a given book.

    A FastText model is trained from scratch on the descriptions in ``data``
    on every call (100 dimensions, window 5, ``min_count=1``, 10 epochs).

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    # Work with row positions so a non-default DataFrame index cannot
    # mis-address rows of the document-vector matrix. Checked before training
    # so an unknown title fails fast.
    titles = data['title'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    book_idx = matches[0]

    # Preprocessing: tokenize descriptions on whitespace
    tokenized_descriptions = [desc.split() for desc in data['description']]

    # Train the FastText model
    model = FastText(tokenized_descriptions, vector_size=100, window=5, min_count=1, epochs=10)

    # Get document vectors by averaging word embeddings
    def get_document_vector(tokens):
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    doc_vectors = np.array([get_document_vector(tokens) for tokens in tokenized_descriptions])

    # Compute cosine similarity of the queried row against every row
    similarities = cosine_similarity(doc_vectors[book_idx:book_idx + 1], doc_vectors)[0]

    # Exclude the queried row by position instead of assuming it ranks first.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != book_idx),
        key=lambda pos: similarities[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]

# Example usage: reuses the `data` frame and `book_title` assigned in the LDA
# example cell, so that cell must have been run first.
print("FastText Recommendations:", fasttext_recommend(data, book_title))


FastText Recommendations: ['Book2', 'Book4', 'Book3', 'Book5']


## GloVe

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_glove_embeddings(file_path):
    """Load whitespace-separated word vectors from a text file into a dict.

    Each non-empty line is expected to hold a token followed by its vector
    components, e.g. ``cat 0.5 1.5``.

    Parameters
    ----------
    file_path : str
        Path to the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each token to a ``float32`` numpy array of its components.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and lines with a token but no components; the
            # former would otherwise raise IndexError on values[0].
            if len(values) < 2:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

def glove_recommend(data, book_title, glove_path='glove.6B.100d.txt', top_n=5):
    """Recommend books whose averaged GloVe description vector is closest to a given book.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide 'title' and 'description' columns.
    book_title : str
        Exact title to look up in ``data['title']``; the first match is used.
    glove_path : str, optional
        Path handed to ``load_glove_embeddings``.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``data['title']``.
    """
    # Resolve the title to a row position before the embeddings file is read,
    # so an unknown title fails without paying for the load.
    titles = data['title'].tolist()
    matches = [pos for pos, value in enumerate(titles) if value == book_title]
    if not matches:
        raise IndexError(f"Title {book_title!r} not found in data['title'].")
    book_idx = matches[0]

    # Load pre-trained GloVe embeddings
    embeddings_index = load_glove_embeddings(glove_path)

    # Take the vector length from the loaded file so the all-zero fallback
    # matches it; 100 is kept only as the fallback for an empty file.
    if embeddings_index:
        embedding_dim = len(next(iter(embeddings_index.values())))
    else:
        embedding_dim = 100

    # Preprocessing: tokenize descriptions on whitespace
    tokenized_descriptions = [desc.split() for desc in data['description']]

    # Get document vectors by averaging GloVe word embeddings
    def get_document_vector(tokens):
        vectors = [embeddings_index[word] for word in tokens if word in embeddings_index]
        return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

    doc_vectors = np.array([get_document_vector(tokens) for tokens in tokenized_descriptions])

    # Compute cosine similarity of the queried row against every row
    similarities = cosine_similarity(doc_vectors[book_idx:book_idx + 1], doc_vectors)[0]

    # Exclude the queried row by position instead of assuming it ranks first.
    ranked = sorted(
        (pos for pos in range(len(titles)) if pos != book_idx),
        key=lambda pos: similarities[pos],
        reverse=True,
    )
    return [titles[pos] for pos in ranked[:top_n]]

# Example usage: reuses the `data` frame and `book_title` assigned in the LDA
# example cell. The path below is a placeholder and must point at a real
# embeddings file before this cell can run.
glove_path = 'path_to_glove/glove.6B.100d.txt'  # Update with the actual path
print("GloVe Recommendations:", glove_recommend(data, book_title, glove_path))
