To start, here are several helpful packages to load.

In [None]:
#Useful libraries
import re
import ast
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Load the two source catalogues from CSV.
# File locations first ...
books_file = 'datasets/books_rs/books.csv'
movies_file = 'datasets/books_rs/movies.csv'

# ... then one DataFrame per catalogue.
df_books = pd.read_csv(books_file)
df_movies = pd.read_csv(movies_file)

3. Data Preprocessing. 
Let's make the two datasets homogeneous. First, reordering.
Now we separate the ratings from the actual content, before removing the unnecessary attributes.

In [None]:
# Tag every row with the catalogue it came from
df_books['Type'] = 'book'
df_movies['Type'] = 'movie'

# Split each catalogue into a ratings view (title + vote columns) and a
# content view (everything except the vote columns)
rating_cols = ['Title', 'Vote Count', 'Vote Average']

df_books_ratings = df_books.loc[:, rating_cols]
df_books_content = df_books.drop(columns=['Vote Count', 'Vote Average'])

df_movies_ratings = df_movies.loc[:, rating_cols]
df_movies_content = df_movies.drop(columns=['Vote Count', 'Vote Average'])

4. Combining Datasets. 
Merge the books and movies datasets into single DataFrames for content and ratings.


In [None]:
# Stack books on top of movies; ignore_index=True gives each combined frame
# a fresh 0..n-1 index
cross_content = pd.concat(objs=[df_books_content, df_movies_content], ignore_index=True)
cross_rating = pd.concat(objs=[df_books_ratings, df_movies_ratings], ignore_index=True)

# Title -> content type lookup table
title_type_df = cross_content.loc[:, ['Title', 'Type']]

5. Processing Categories. 
Convert the 'Genres' string into a list of categories for each item.

In [None]:
# Process 'Genres' into 'categories_list'
def process_categories(categories_str):
    """Turn a raw genre value into a list of clean, lower-cased category names.

    Parameters
    ----------
    categories_str : str, list, tuple or missing value
        Either a Python-literal sequence written as text (e.g. "['Action', 'Drama']"),
        a delimiter-separated string (e.g. "Action; Drama, Comedy"), an
        already-parsed list/tuple, or a missing value (None / NaN).

    Returns
    -------
    list of str
        Stripped, lower-cased category names in their original order.
        Empty entries (from trailing or doubled delimiters, or an empty
        string) are dropped. Missing input yields an empty list.
    """
    # Already-parsed sequences are normalised directly; checking this first
    # also keeps pd.isnull from being called on a sequence, where it returns
    # an array instead of a single boolean.
    if isinstance(categories_str, (list, tuple)):
        raw_categories = categories_str
    elif pd.isnull(categories_str):
        return []
    else:
        text = str(categories_str)
        try:
            # Try to parse the string as a Python literal
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a literal: treated as plain delimited text below
            parsed = None
        if isinstance(parsed, (list, tuple)):
            raw_categories = parsed
        else:
            # Split by common delimiters if not a sequence literal
            raw_categories = re.split(r'[;,]', text)

    cleaned = (str(cat).strip().lower() for cat in raw_categories)
    return [cat for cat in cleaned if cat]

# Parse every 'Genres' value into its list of normalised category names
cross_content['categories_list'] = cross_content['Genres'].map(process_categories)

6. Text Preprocessing.

b. Now we condense all the non-title attributes into one TAGS column.

c. Now we preprocess the tags column to remove all non-essential characteristics, cleaning and standardizing the text data for better performance.

In [None]:
# Shared text-cleaning resources, built once and reused by every call
word_pattern = re.compile(r'\b\w+\b')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lower-case `text`, drop English stop words and stem what remains.

    Returns the surviving stems joined by single spaces.
    """
    tokens = word_pattern.findall(text.lower())
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

d. Combining 'tags' and 'categories_list'

In [None]:
# Build the raw 'tags' text from the descriptive columns, then clean it
cross_content['tags'] = cross_content[['Author', 'Plot', 'Type']].fillna('').agg(' '.join, axis=1)
cross_content['tags'] = cross_content['tags'].apply(preprocess_text)

# Clean the categories as one text per item.
# preprocess_text already returns a single space-separated string; joining that
# string again would put a space between every character, and the resulting
# one-letter tokens are discarded by TfidfVectorizer's default token pattern,
# so the genres would never contribute to the similarity.
cross_content['categories_list'] = cross_content['categories_list'].apply(
    lambda x: preprocess_text(' '.join(x))
)

# Combine 'tags' and 'categories_list' into 'combined_tags'
cross_content['combined_tags'] = cross_content['tags'] + ' ' + cross_content['categories_list']

e. Before splitting the data, we need to ensure that each item in cross_content has its corresponding 'Vote Average' for prediction and evaluation.

In [None]:
# Attach each item's 'Vote Average' to 'cross_content' for prediction.
# cross_content and cross_rating were concatenated from the same rows in the
# same order, so the rating is copied by index label instead of merged on
# 'Title': a title-keyed merge pairs every row with every rating that shares
# its title, which duplicates items and can attach another item's rating.
# NOTE(review): assumes neither frame has been re-indexed since the concat — confirm.
cross_content = cross_content.assign(**{'Vote Average': cross_rating['Vote Average']})

Now the cross_content dataframe is clean, with a title key and a tags string representative of the content.

7. Splitting Data.
Divide the data into training and testing sets to evaluate the recommendation system.

In [None]:
# Hold out 20% of the items for evaluation; the fixed random_state keeps the
# partition identical from run to run
X_cross_train, X_cross_test = train_test_split(
    cross_content,
    test_size=0.2,
    random_state=42,
)

8. TF-IDF vectorization.
Convert textual data into numerical vectors using TF-IDF for similarity calculations.

In [None]:
# Vectorise 'combined_tags' with word-level uni-, bi- and tri-grams.
# NOTE(review): the vectorizer is fitted on every item, test split included —
# confirm this is intended for the evaluation below.
tf = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
tfidf_matrix = tf.fit_transform(cross_content['combined_tags'])

# Slice out the rows of each split. The split's index labels are used as row
# positions here — presumably cross_content has a default 0..n-1 index; verify.
test_tfidf_matrix = tfidf_matrix[X_cross_test.index]
train_tfidf_matrix = tfidf_matrix[X_cross_train.index]

9. Computing Cosine Similarities.
Calculate the similarity between each test item and all training items.

In [None]:
# Pairwise similarity: one row per test item, one column per training item
cosine_sim_test_train = cosine_similarity(X=test_tfidf_matrix, Y=train_tfidf_matrix)

10. Evaluation Metrics.

Evaluate the performance of the recommender system using standard metrics.
    Initialize (TP), (FP), (FN), (TN).
    Loop through each test item:
    Retrieve the test item's categories, skipping items without categories.
    Compute similarity scores to all training items.
    Select top k recommended items.
    For each recommended item, check if it shares categories with the test item.
    Update TP and FP counts accordingly.
    Calculate FN and TN based on the total number of relevant and non-relevant items.
    After processing all test items, compute Precision, Recall, Accuracy, and F1 Score.

In [None]:
# Get the ratings from the training set (as floats so NaN can mark "unrated")
train_ratings = X_cross_train['Vote Average'].to_numpy(dtype=float)

k = 10  # Number of similar contents to consider

# Get indices of top k similar train items for each test item
top_k_indices = np.argsort(-cosine_sim_test_train, axis=1)[:, :k]

# Get top k similarities and ratings
top_k_similarities = np.take_along_axis(cosine_sim_test_train, top_k_indices, axis=1)
top_k_ratings = train_ratings[top_k_indices]

# Neighbours without a rating must not take part in the weighted average:
# a single NaN would otherwise turn the whole prediction into NaN.
rated_mask = ~np.isnan(top_k_ratings)
rated_weights = np.where(rated_mask, top_k_similarities, 0.0)
rated_values = np.where(rated_mask, top_k_ratings, 0.0)

# Compute predicted ratings as the similarity-weighted mean of the rated neighbours
numerators = np.sum(rated_weights * rated_values, axis=1)
denominators = np.sum(rated_weights, axis=1)

# Handle zero denominators
predicted_ratings = np.divide(numerators, denominators, out=np.zeros_like(numerators), where=denominators != 0)
# Replace zero denominators with the plain mean of the rated top-k neighbours
# (NaN when none of the neighbours has a rating)
rated_counts = rated_mask.sum(axis=1)
mean_top_k_ratings = np.sum(rated_values, axis=1) / np.where(rated_counts != 0, rated_counts, np.nan)
predicted_ratings = np.where(denominators != 0, predicted_ratings, mean_top_k_ratings)

# Get actual ratings
actual_ratings = X_cross_test['Vote Average'].to_numpy(dtype=float)

# Exclude items where the actual rating is NaN or no prediction could be made,
# so the metric functions never receive NaN
valid_indices = ~np.isnan(actual_ratings) & ~np.isnan(predicted_ratings)
actual_ratings = actual_ratings[valid_indices]
predicted_ratings = predicted_ratings[valid_indices]


11. Computing metrics.
After processing all test items, we compute the overall metrics.
Precision: Proportion of recommended items that are relevant.
Recall: Proportion of relevant items that are recommended.
F1 Score: Harmonic mean of Precision and Recall.
Mean Squared Error (MSE): Measures the average squared difference between actual and predicted ratings.
Root Mean Squared Error (RMSE): Square root of MSE, providing error in the same units as the ratings.

In [None]:
# Classification view: an item is "positive" when its rating exceeds the threshold
threshold = 5
actual_binary = np.greater(actual_ratings, threshold).astype(int)
predicted_binary = np.greater(predicted_ratings, threshold).astype(int)

accuracy = accuracy_score(y_true=actual_binary, y_pred=predicted_binary)
precision = precision_score(y_true=actual_binary, y_pred=predicted_binary)
recall = recall_score(y_true=actual_binary, y_pred=predicted_binary)
f1 = f1_score(y_true=actual_binary, y_pred=predicted_binary)

# Regression view: squared error on the raw ratings, and its square root
mse = mean_squared_error(y_true=actual_ratings, y_pred=predicted_ratings)
rmse = np.sqrt(mse)

12. Recommendation Function.

Try to find the index of the item with the specified title.
Compute cosine similarity between the item's TF-IDF vector and all items.
Sort the similarity scores in descending order.
Exclude the item itself from recommendations.
Retrieve the titles of the top k recommended items.

In [None]:
# Function to get recommendations for a content title
def get_recommendations(content_title, k):
    """Return the titles of the k items most similar to the given title.

    Similarity is the cosine similarity between the item's row of
    `tfidf_matrix` and every row of `tfidf_matrix`, whose rows follow the
    row order of `cross_content`.

    Parameters
    ----------
    content_title : str
        Exact 'Title' value to look up in `cross_content`. If several rows
        share the title, the first one is used.
    k : int
        Number of recommendations to return.

    Returns
    -------
    list of str
        Titles ordered from most to least similar, excluding the queried
        row itself. Empty (after printing a message) when the title is
        not present.
    """
    # Locate the row by POSITION rather than by index label, because the
    # result is used to address tfidf_matrix rows and with .iloc below.
    title_matches = (cross_content['Title'] == content_title).to_numpy()
    if not title_matches.any():
        print(f"Content titled '{content_title}' not found.")
        return []
    content_index = int(title_matches.argmax())  # first matching row

    # Compute cosine similarity between the content and all contents
    sim_scores = cosine_similarity(tfidf_matrix[content_index], tfidf_matrix).flatten()

    # Sort the positions by similarity in descending order
    sim_scores_indices = sim_scores.argsort()[::-1]

    # Drop the content itself, then keep the k most similar contents
    top_indices = sim_scores_indices[sim_scores_indices != content_index][:k]

    # Get the titles of the recommended contents
    recommended_titles = cross_content.iloc[top_indices]['Title'].tolist()

    return recommended_titles

13. Printing Recommendations.
Generate and display recommendations for a specific content title. Sort the recommendations by 'Vote Average' and 'Vote Count' for better results.


In [None]:
def print_content_type(titles):
    """Print every title followed by its content type in parentheses.

    The type is looked up in the module-level `title_type_df`; when several
    rows share a title, the first matching row decides the type.
    """
    display_names = {'book': 'Book', 'movie': 'Movie'}

    for title in titles:
        matching_types = title_type_df.loc[title_type_df['Title'] == title, 'Type']

        # Guard clause: nothing known about this title
        if matching_types.empty:
            print(f"{title} (Content Type Not Found)")
            continue

        type_label = display_names.get(matching_types.values[0], 'Unknown Content Type')
        print(f"{title} ({type_label})")

14. Running the algorithm on an example title.

In [None]:
content_title = "The Avengers"
recommendations = get_recommendations(content_title, k)

# Order the recommended titles: highest 'Vote Average' first, ties broken by
# the higher 'Vote Count'
recommended_rows = cross_rating[cross_rating['Title'].isin(recommendations)]
sorted_recommendations = (
    recommended_rows
    .sort_values(by=['Vote Average', 'Vote Count'], ascending=False)
    .loc[:, 'Title']
    .tolist()
)

print_content_type(sorted_recommendations)

15. Printing validation metrics. 

In [None]:
# Evaluation of the recommender system: print each metric to four decimals
report_labels = ['Accuracy', 'Precision', 'F1 Score', 'Recall',
                 'Mean Squared Error', 'Root Mean Squared Error']
for report_label, metric_value in zip(report_labels, [accuracy, precision, f1, recall, mse, rmse]):
    print(f"{report_label}: {metric_value:.4f}")

metrics = ['accuracy', 'precision', 'f1', 'recall', 'mse', 'rmse']
values = [accuracy, precision, f1, recall, mse, rmse]

# Bar chart with one bar per metric
plt.bar(x=metrics, height=values)
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.title('Metrics Plot')
plt.show()