To start, here are several helpful packages to load.

In [None]:
#Useful libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
# Locations of the two raw CSV catalogues (books and movies)
books_file = 'datasets/books_rs/books.csv'
movies_file = 'datasets/books_rs/movies.csv'

# Read each catalogue into its own DataFrame (books first, then movies)
df_books, df_movies = (pd.read_csv(csv_path) for csv_path in (books_file, movies_file))

3. Data Preprocessing. 
Let's make the two datasets homogeneous. First we reorder the columns so that both datasets follow the same layout.
Now we separate the ratings from the actual content, before removing the unnecessary attributes.

In [None]:
# Keep only the attributes used downstream, in a fixed order.
# reindex() (unlike plain selection) creates any missing column filled with NaN.
df_books = df_books.reindex(columns=['title', 'categories', 'authors', 'description', 'ratings_count', 'average_rating', 'published_year', 'subtitle', 'isbn13'])
df_movies = df_movies.reindex(columns=['Series_Title', 'Genre', 'Director', 'Overview', 'No_of_Votes', 'IMDB_Rating', 'Released_Year'])

# Merge title and subtitle for books as "title : subtitle".
# The f-string keeps a non-string subtitle from raising; a missing subtitle adds nothing.
df_books['title'] = df_books['title'] + df_books['subtitle'].map(
    lambda subtitle: f' : {subtitle}' if pd.notnull(subtitle) else ''
)

# Drop the columns that are not needed any more
df_books = df_books.drop(['published_year', 'subtitle', 'isbn13'], axis=1, errors='ignore')
df_movies = df_movies.drop(['Released_Year'], axis=1, errors='ignore')

# Rename the movie columns to the book column names.
# An explicit mapping (instead of positional assignment to .columns) cannot
# silently mislabel a column if the column order ever changes.
df_movies = df_movies.rename(columns={
    'Series_Title': 'title',
    'Genre': 'categories',
    'Director': 'authors',
    'Overview': 'description',
    'No_of_Votes': 'ratings_count',
    'IMDB_Rating': 'average_rating',
})

# Add a column to identify the type of content
df_books['content_type'] = 'book'
df_movies['content_type'] = 'movie'

# Split each catalogue into a ratings frame and a content frame.
# .copy() makes the ratings frames independent objects, so the assignment
# below does not trigger pandas' chained-assignment (SettingWithCopy) warning.
df_books_ratings = df_books[['title', 'ratings_count', 'average_rating']].copy()
df_books_content = df_books.drop(['ratings_count', 'average_rating'], axis=1, errors='ignore')

df_movies_ratings = df_movies[['title', 'ratings_count', 'average_rating']].copy()
df_movies_content = df_movies.drop(['ratings_count', 'average_rating'], axis=1, errors='ignore')

# Normalize the books ratings by doubling them.
# NOTE(review): presumably book ratings are out of 5 and IMDB ratings out of 10 - confirm.
df_books_ratings['average_rating'] = df_books_ratings['average_rating'] * 2





4. Combining Datasets. 
Merge the books and movies datasets into single DataFrames for content and ratings.


In [None]:
# Stack the book rows and the movie rows into one content frame;
# ignore_index=True renumbers the rows 0..n-1.
cross_content = pd.concat(
    objs=[df_books_content, df_movies_content],
    axis=0,
    ignore_index=True,
)

# Do the same for the ratings
cross_rating = pd.concat(
    objs=[df_books_ratings, df_movies_ratings],
    axis=0,
    ignore_index=True,
)

# Two-column frame used later to look up the content type of a title
title_type_df = cross_content.loc[:, ['title', 'content_type']]

5. Processing Categories. 
Convert the 'categories' string into a list of categories for each item.

In [None]:
# Process 'categories' field into a list before creating 'tags'
def process_categories(categories_str):
    """Normalise a raw ``categories`` value into a flat list of lowercase names.

    Accepted inputs:
      * a list/tuple (possibly nested) - flattened, each item stringified;
      * a missing value (``None`` / ``NaN``) - gives ``[]``;
      * a string that is a Python list literal, e.g. ``"['Drama', 'Crime']"``;
      * any other value - converted to ``str`` and split on ``,`` or ``;``.

    Every name is stripped of surrounding whitespace and lowercased; blank
    names (e.g. produced by a trailing comma) are dropped.

    Returns:
        list[str]: the cleaned category names, in input order.
    """
    import ast  # only needed for the list-literal case

    def _clean(values):
        """Flatten nested lists/tuples and return stripped, lowercased, non-blank strings."""
        cleaned = []
        for value in values:
            if isinstance(value, (list, tuple)):
                cleaned.extend(_clean(value))
            else:
                name = str(value).strip().lower()
                if name:
                    cleaned.append(name)
        return cleaned

    # Containers must be checked BEFORE pd.isnull: for a list, pd.isnull returns
    # a boolean array, and using that array in an `if` raises ValueError as soon
    # as the list has more than one element.
    if isinstance(categories_str, (list, tuple)):
        return _clean(categories_str)
    if pd.isnull(categories_str):
        return []

    categories_str = str(categories_str)

    # If the string looks like a list literal, try to parse it as one
    if categories_str.startswith('[') and categories_str.endswith(']'):
        try:
            parsed = ast.literal_eval(categories_str)
        except (ValueError, SyntaxError):
            parsed = None  # not a valid literal: fall back to plain splitting
        if isinstance(parsed, (list, tuple)):
            return _clean(parsed)

    # Split by comma or semicolon
    return _clean(re.split(',|;', categories_str))

# Parse every raw 'categories' value into a list of category names
cross_content['categories_list'] = cross_content['categories'].map(process_categories)



6. Text Preprocessing. 
a. Ensure 'categories_list' is a flat list of strings without nested lists.

In [None]:
# Function to flatten any nested lists in 'categories_list'
def flatten_categories(categories):
    """Return a flat list of strings from an arbitrarily nested list.

    Items that are themselves lists are expanded recursively; every other
    item is converted with ``str``. Order is preserved.
    """
    flattened = []
    for entry in categories:
        flattened += flatten_categories(entry) if isinstance(entry, list) else [str(entry)]
    return flattened

# Make sure every 'categories_list' entry is a flat list of strings
cross_content['categories_list'] = cross_content['categories_list'].map(flatten_categories)

b. Now we condense all the non-title attributes into a single TAGS column.

In [None]:
# Build the 'tags' text of each row by joining its non-null author,
# description and content-type values with '; '. 'categories_list' is kept.
cross_content['tags'] = cross_content.apply(
    lambda record: '; '.join(
        str(record[field])
        for field in ('authors', 'description', 'content_type')
        if field in cross_content.columns and pd.notnull(record[field])
    ),
    axis=1,
)


c. Now we preprocess the tags column, removing non-essential characteristics (letter case, punctuation, stop words, word endings) so that the text is clean and standardized for better performance.

In [None]:
# Download stop words if running for the first time!!!
#nltk.download('stopwords')

# Shared NLTK helpers, filled in on the first preprocess_text() call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(stop_words, stemmer)``, building both only on first use."""
    if not _TEXT_RESOURCES:
        # Build both before storing either, so a failure (e.g. the stop-word
        # corpus has not been downloaded yet) leaves the cache empty and the
        # next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        _TEXT_RESOURCES['stop_words'] = stop_words
        _TEXT_RESOURCES['stemmer'] = stemmer
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


def preprocess_text(text):
    """Clean and standardise one piece of text.

    Steps, in order: lowercase, remove every character that is neither a word
    character nor whitespace, drop English stop words, stem each remaining
    word with the Porter stemmer.

    The stop-word set and the stemmer are created once and reused, instead of
    being rebuilt on every call (this function is applied to every row and to
    every individual category).

    Args:
        text (str): the text to clean.

    Returns:
        str: the stemmed words joined by single spaces.
    """
    stop_words, stemmer = _get_text_resources()

    # 1. Lowercase
    text = text.lower()

    # 2. Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # 3. Remove stop words and 4. stem what is left
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)

d. Combining 'tags' and 'categories_list'

In [None]:
# Clean the free-text tags, then clean each category name individually
cross_content['tags'] = cross_content['tags'].map(preprocess_text)
cross_content['categories_list'] = cross_content['categories_list'].map(
    lambda cats: [preprocess_text(cat) for cat in cats]
)

# 'combined_tags' = the cleaned tag words followed by the cleaned categories,
# all separated by single spaces
cross_content['combined_tags'] = cross_content.apply(
    lambda row: ' '.join([*row['tags'].split(), *row['categories_list']]),
    axis=1,
)

Now the cross_content dataframe is clean, with a title key and a tags string representative of the content.

7. Splitting Data.
Divide the data into training and testing sets to evaluate the recommendation system.

In [None]:
# Hold out 20% of the items for testing; the fixed seed makes the split repeatable
X_cross_train, X_cross_test = train_test_split(
    cross_content,
    random_state=42,
    test_size=0.2,
)

# Index labels of the rows that ended up in each part
train_indices, test_indices = X_cross_train.index, X_cross_test.index


8. TF-IDF vectorization.
Convert textual data into numerical vectors using TF-IDF for similarity calculations.

In [None]:
# Word-level TF-IDF over unigrams, bigrams and trigrams, fitted on all items.
# NOTE(review): this vectorises 'tags', not 'combined_tags' - presumably so the
# categories used as ground truth in the evaluation stay out of the features; confirm.
tf = TfidfVectorizer(min_df=0.0, ngram_range=(1, 3), analyzer='word')
tfidf_matrix = tf.fit_transform(cross_content['tags'])

# Rows of the matrix belonging to the training and test items.
# The index labels are used as row positions here, which relies on
# cross_content having a default 0..n-1 index.
train_tfidf_matrix, test_tfidf_matrix = (
    tfidf_matrix[row_ids] for row_ids in (train_indices, test_indices)
)

9. Computing Cosine Similarities.
Calculate the similarity between each test item and all training items.

In [None]:
# Similarity matrix: one row per test item, one column per training item
cosine_sim_test_train = cosine_similarity(test_tfidf_matrix, train_tfidf_matrix)

# NumPy copies of the index labels, convenient for array-based indexing
train_indices_array, test_indices_array = (
    labels.to_numpy() for labels in (train_indices, test_indices)
)



10. Evaluation Metrics.

Evaluate the performance of the recommender system using standard metrics.
    Initialize (TP), (FP), (FN), (TN).
    Loop through each test item:
    Retrieve the test item's categories, skipping items without categories.
    Compute similarity scores to all training items.
    Select top k recommended items.
    For each recommended item, check if it shares categories with the test item.
    Update TP and FP counts accordingly.
    Calculate FN and TN based on the total number of relevant and non-relevant items.
    After processing all test items, compute Precision, Recall, Accuracy, and F1 Score.

In [None]:
k = 10  # Number of similar contents to recommend

# Confusion-matrix cells, summed over all evaluated test items
total_TP = 0
total_FP = 0
total_FN = 0
total_TN = 0

num_train_items = len(train_indices)
num_test_items = len(test_indices)


def _category_set(categories):
    """Return the categories of one item as a set, ignoring blank strings."""
    # A blank string is not a real category; keeping it would let two items
    # "share a category" through '' alone.
    return {cat for cat in flatten_categories(categories) if cat}


# Category set of every training item, in the same order as the columns of
# cosine_sim_test_train. Built once here instead of being recomputed over the
# whole training set for every test item.
train_category_sets = [
    _category_set(cats)
    for cats in cross_content.loc[train_indices_array, 'categories_list']
]

for i, test_idx in enumerate(test_indices_array):
    # Categories of the test item
    test_categories = _category_set(cross_content.loc[test_idx, 'categories_list'])

    # Skip items with no categories: relevance cannot be judged for them
    if not test_categories:
        continue

    # Similarity of this test item to every training item
    sim_scores = cosine_sim_test_train[i]

    # Positions (within the training set) of the k most similar items, best first
    top_k_indices_in_train = sim_scores.argsort()[-k:][::-1]

    # A recommendation is a true positive when it shares at least one category
    # with the test item, otherwise a false positive
    TP = sum(1 for pos in top_k_indices_in_train if train_category_sets[pos] & test_categories)
    FP = len(top_k_indices_in_train) - TP

    # Relevant = training items sharing at least one category with the test item.
    # FN are relevant items that were not recommended; TN are non-relevant
    # items that were not recommended.
    total_relevant_items = sum(1 for cats in train_category_sets if cats & test_categories)
    total_non_relevant_items = num_train_items - total_relevant_items
    FN = total_relevant_items - TP
    TN = total_non_relevant_items - FP

    total_TP += TP
    total_FP += FP
    total_FN += FN
    total_TN += TN

# Compute metrics (each falls back to 0 when its denominator is 0)
precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0
recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0
accuracy = (total_TP + total_TN) / (total_TP + total_FP + total_FN + total_TN) if (total_TP + total_FP + total_FN + total_TN) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0



11. Recommendation Function.

Try to find the index of the item with the specified title.
Compute cosine similarity between the item's TF-IDF vector and all items.
Sort the similarity scores in descending order.
Exclude the item itself from recommendations.
Retrieve the titles of the top k recommended items.

In [None]:
# Function to get recommendations for a content title
def get_recommendations(content_title, k):
    """Return the titles of the ``k`` items most similar to ``content_title``.

    Similarity is the cosine similarity between the TF-IDF row of the first
    item whose title equals ``content_title`` and the rows of all items.
    The queried item itself is excluded from the result.

    Args:
        content_title (str): exact title to look up in ``cross_content``.
        k (int): number of recommendations wanted.

    Returns:
        list: up to ``k`` titles, most similar first. Empty when the title is
        not found (a message is printed) or when ``k`` is not positive.
    """
    # Row POSITIONS (not index labels) of the matching titles: the TF-IDF
    # matrix and .iloc below are both addressed by position, so this stays
    # correct even if cross_content does not have a 0..n-1 index.
    matches = (cross_content['title'] == content_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"Content titled '{content_title}' not found.")
        return []
    content_pos = int(matches[0])

    # A negative k would otherwise be read as "all but the last |k|" by the slice below
    if k is not None and k <= 0:
        return []

    # Cosine similarity between the content and all contents
    sim_scores = cosine_similarity(tfidf_matrix[content_pos], tfidf_matrix).flatten()

    # Positions sorted by decreasing similarity
    ranked_positions = sim_scores.argsort()[::-1]

    # Top k positions, skipping the content itself
    top_positions = [pos for pos in ranked_positions if pos != content_pos][:k]

    # Titles of the recommended contents
    return cross_content.iloc[top_positions]['title'].tolist()

12. Printing Recommendations.


In [None]:

# Title to query, and its k most similar items
content_title = "Fight Club"
recommendations = get_recommendations(content_title=content_title, k=k)

def print_content_type(titles):
    """Print each title followed by its content type in parentheses.

    The type is read from the first row of ``title_type_df`` whose title
    matches; titles without a matching row are reported as not found.
    """
    for title in titles:
        matching_types = title_type_df[title_type_df['title'] == title]
        if matching_types.empty:
            print(f"{title} (Content Type Not Found)")
            continue

        kind = matching_types['content_type'].values[0]
        if kind == 'book':
            line = f"{title} (Book)"
        elif kind == 'movie':
            line = f"{title} (Movie)"
        else:
            line = f"{title} (Unknown Content Type)"
        print(line)


# Rating rows whose title was recommended, best average rating first and,
# for equal averages, most ratings first
recommended_ratings = cross_rating.loc[cross_rating['title'].isin(recommendations)]
sorted_recommendations = recommended_ratings.sort_values(
    by=['average_rating', 'ratings_count'],
    ascending=[False, False],
)['title'].tolist()

print_content_type(sorted_recommendations)


13. Printing validation metrics. 

In [None]:
# Evaluation of the recommender system, one metric per line, 4 decimals each
metric_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Recall: {recall:.4f}",
]
print("\n".join(metric_lines))