# Evaluating Recommender Systems for Digital Library Datasets

## Content-Based Algorithms
## Comparison/Evaluation based on Metrics/Properties

In [3]:
import pandas as pd
from IPython.display import display, HTML

In [5]:
def _numbered_table(columns):
    """Return a DataFrame built from `columns` with a leading 1-based "No." column."""
    table = pd.DataFrame(columns)
    table.insert(0, "No.", range(1, len(table) + 1))
    return table


# Techniques surveyed in this notebook, one single-column mapping per table.
f_ext = {
    "Feature Extraction Methods": [
        "TF-IDF", "LSA", "Word2Vec", "Doc2Vec", "BERT", "BoW", "BM25",
    ]
}
sim_m = {
    "Similarity and Distance Measures": [
        "Cosine Similarity", "Euclidean Distance", "Jaccard Similarity",
        "Manhattan Distance", "Pearson Correlation", "Hamming Distance",
    ]
}

d_ext = _numbered_table(f_ext)
d_sim = _numbered_table(sim_m)

# Render both tables side by side inside a flex container.
display(HTML(f"""
<div style="display: flex; justify-content: space-around;">
    <div>{d_ext.to_html(index=False)}</div>
    <div>{d_sim.to_html(index=False)}</div>
</div>
"""))


No.,Feature Extraction Methods
1,TF-IDF
2,LSA
3,Word2Vec
4,Doc2Vec
5,BERT
6,BoW
7,BM25

No.,Similarity and Distance Measures
1,Cosine Similarity
2,Euclidean Distance
3,Jaccard Similarity
4,Manhattan Distance
5,Pearson Correlation
6,Hamming Distance


### Evaluation Metrics/Properties:
- Prediction Accuracy
    - Ratings Prediction Accuracy (ratings) — *open question: is this applicable here?*
    - Usage Prediction (feedback) — *open question: is this applicable here?*
    - **Ranking Measures**
- Coverage
- **Confidence**
- Trust
- **Novelty**
- Serendipity
- **Diversity**
- Utility
- Risk
- Robustness
- Privacy
- Adaptability
- Scalability


## TF-IDF and Cosine Similarity

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [35]:
# Load the semicolon-separated books catalogue.
# NOTE(review): the preview rendered under this cell shows an "Unnamed: 0" column,
# which suggests the file's first column is a saved row index. Passing
# `index_col=0` would drop it; it is kept here so the frame's columns do not change.
df = pd.read_csv('Books/books.csv', sep=';')

# Preview the first ten rows (the bare name on the last line is the cell output).
df_head = df.head(10)
df_head

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group
7,0671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,Random House
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner


In [11]:

# Sample dataset: one (title, description, genres, actors) record per movie.
_MOVIE_FIELDS = ('title', 'description', 'genres', 'actors')
_MOVIE_RECORDS = [
    (
        'The Matrix',
        'A computer hacker learns about the true nature of reality and his role in the war against its controllers.',
        'Action, Sci-Fi',
        'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss',
    ),
    (
        'The Matrix Reloaded',
        'Neo and his allies race against time before the machines come to destroy Zion.',
        'Action, Sci-Fi',
        'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss',
    ),
    (
        'The Matrix Revolutions',
        'The human city of Zion defends itself against the massive invasion of the machines.',
        'Action, Sci-Fi',
        'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss',
    ),
    (
        'Inception',
        'A thief who steals corporate secrets through dream-sharing technology is given an inverse task.',
        'Sci-Fi, Thriller',
        'Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen Page',
    ),
    (
        'Interstellar',
        'A team of explorers travels through a wormhole in space in an attempt to save humanity.',
        'Sci-Fi, Drama',
        'Matthew McConaughey, Anne Hathaway, Jessica Chastain',
    ),
    (
        'The Prestige',
        'Two stage magicians engage in a battle to create the ultimate illusion.',
        'Drama, Mystery',
        'Hugh Jackman, Christian Bale, Scarlett Johansson',
    ),
]

# Pivot the records into the column-oriented mapping (field -> list of values)
# that the DataFrame constructor consumes; column order follows _MOVIE_FIELDS.
data = {
    field: list(values)
    for field, values in zip(_MOVIE_FIELDS, zip(*_MOVIE_RECORDS))
}

df = pd.DataFrame(data)

# Combine Features
def combine_features(row):
    """Join a row's description, genres and actors into one space-separated string.

    `row` is anything indexable by the keys 'description', 'genres' and 'actors'
    (here: a DataFrame row passed by `df.apply(..., axis=1)`). Each value is
    rendered with `format()`, exactly as an f-string placeholder would.
    """
    text_fields = (row['description'], row['genres'], row['actors'])
    return ' '.join(format(value) for value in text_fields)

# One text blob per movie, built from its description, genres and cast.
df['combined_features'] = df.apply(combine_features, axis=1)

# Vectorise the blobs with TF-IDF, ignoring English stop words.
# NOTE(review): with the vectorizer's default token pattern, punctuation acts as a
# separator, so labels such as 'Sci-Fi' and multi-part actor names are presumably
# indexed as separate word tokens rather than as whole labels — confirm this is
# the intended matching granularity.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Pairwise cosine similarity between every pair of movies; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)


def get_recommendations(title, cosine_sim=cosine_sim, top_n=3):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``; the first matching row is used.
    cosine_sim : array-like, optional
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``df``. Defaults to the matrix that was bound to ``cosine_sim`` when
        this function was defined.
    top_n : int, default 3
        Maximum number of recommendations to return. Negative values are
        treated as 0.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar. The queried movie itself is
        never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # Work with row *positions*: both `cosine_sim[...]` and `.iloc` are positional,
    # so using index labels would only be correct for a default 0..n-1 index.
    positions = (df['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        # Same exception type the bare `[0]` lookup used to raise, with a usable message.
        raise IndexError(f"Title {title!r} not found in the dataset.")
    movie_pos = int(positions[0])

    scores = cosine_sim[movie_pos]

    # Exclude the queried movie by position instead of assuming it sorts first.
    # The sort is stable, so an earlier row that ties with the self-similarity
    # score would otherwise be dropped in its place and the query would be
    # returned as its own recommendation.
    candidates = [pos for pos in range(len(scores)) if pos != movie_pos]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    return df['title'].iloc[candidates[:max(top_n, 0)]]


# Demo: look up recommendations for the first Matrix film.
movie_title = "The Matrix"
recommendations = get_recommendations(movie_title)

# A single print with a newline separator emits the header line followed by
# the Series, exactly as two consecutive prints would.
print(f"Recommendations for '{movie_title}':", recommendations, sep="\n")


Recommendations for 'The Matrix':
2    The Matrix Revolutions
1       The Matrix Reloaded
4              Interstellar
Name: title, dtype: object


## LSA and Cosine Similarity

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Sample book metadata: five placeholder books, each with a one-line blurb.
_BOOK_BLURBS = {
    'Book A': 'A tale of adventure and mystery in a fantastical world.',
    'Book B': 'A romantic drama set in the heart of the city.',
    'Book C': 'A science fiction novel exploring space and time.',
    'Book D': 'A thrilling mystery with unexpected twists.',
    'Book E': 'A heartwarming story about love and friendship.',
}

# Column-oriented view of the blurbs; BookID is a 1-based running number.
data = {
    'BookID': list(range(1, len(_BOOK_BLURBS) + 1)),
    'Title': list(_BOOK_BLURBS),
    'Description': list(_BOOK_BLURBS.values()),
}

books_df = pd.DataFrame(data)

# Tunables for the LSA pipeline.
_MAX_VOCABULARY = 5000  # value passed as TfidfVectorizer's max_features
_LSA_COMPONENTS = 2     # number of components kept by the truncated SVD
_SVD_SEED = 42          # random_state handed to the SVD

# Turn the descriptions into TF-IDF vectors, ignoring English stop words.
# NOTE: this rebinds `tfidf_matrix`, which the TF-IDF section above also assigns.
vectorizer = TfidfVectorizer(max_features=_MAX_VOCABULARY, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(books_df['Description'])

# LSA (Latent Semantic Analysis): project the TF-IDF vectors onto a few components.
lsa = TruncatedSVD(random_state=_SVD_SEED, n_components=_LSA_COMPONENTS)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

# Book-to-book cosine similarity in the reduced space.
# NOTE(review): the scores printed further down are 1.00 / 0.00 / -0.00, so with
# only two components on five short blurbs the similarities look degenerate —
# worth revisiting the component count on real data.
similarity_matrix = cosine_similarity(lsa_matrix)

# Recommend books based on a given book title
def recommend_books(title, top_n=3):
    """Return the books most similar to `title` as ``(title, score)`` pairs.

    Parameters
    ----------
    title : str
        Exact value to look up in ``books_df['Title']``; the first match is used.
    top_n : int, default 3
        Maximum number of recommendations. Negative values are treated as 0.

    Returns
    -------
    list of (str, float) or str
        Pairs ordered from most to least similar, never including the queried
        book itself. For an unknown title the function returns a message
        string instead of a list (kept for backward compatibility), so callers
        that iterate over the result should check for that case.
    """
    # Row *positions* are used throughout: `similarity_matrix[...]` and `.iloc`
    # are positional, so index labels would only work for a default 0..n-1 index.
    matches = (books_df['Title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return f"Book '{title}' not found in the database."
    book_pos = int(matches[0])

    # Drop the queried book by position rather than assuming it sorts first.
    # Another book can tie with the self-similarity score (the sample output
    # shows a 1.00 neighbour); because the sort is stable, an earlier tied row
    # would otherwise be skipped and the query returned as its own match.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[book_pos])
        if pos != book_pos
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    titles = books_df['Title']
    return [(titles.iloc[pos], score) for pos, score in candidates[:max(top_n, 0)]]

# Test: print the three books most similar to 'Book A' together with their scores.
book_title = 'Book A'
recommended_books = recommend_books(book_title, top_n=3)
print(f"Books similar to '{book_title}':")
# Each entry is a (title, similarity) pair; scores are shown with two decimals.
# NOTE: recommend_books returns a plain message string for an unknown title, in
# which case this unpacking loop would fail. 'Book A' is in the sample data, so
# that path is not exercised here.
for title, score in recommended_books:
    print(f"- {title} (similarity score: {score:.2f})")


Books similar to 'Book A':
- Book D (similarity score: 1.00)
- Book C (similarity score: 0.00)
- Book E (similarity score: -0.00)
