To start, here's several helpful packages to load

In [1]:
#Useful libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
#Importing datasets
#Books
books_file = 'datasets/books_rs/books.csv'
df_books = pd.read_csv(books_file)

#Movies
movies_file = 'datasets/books_rs/movies.csv'
df_movies = pd.read_csv(movies_file)

Let's make the two datasets omogeneus.  First reordering.
Now we separate the ratings from the actual content, before removing the unnecessary attributes.

In [3]:
#Reindexing columns
df_books = df_books.reindex(columns=['title', 'categories', 'authors', 'description', 'ratings_count', 'average_rating', 'published_year', 'subtitle', 'isbn13'])
df_movies = df_movies.reindex(columns=['Series_Title', 'Genre', 'Director', 'Overview', 'No_of_Votes', 'IMDB_Rating', 'Released_Year'])

#Merge title and subtitle columns for books
df_books['title'] = df_books['title'] + df_books['subtitle'].apply(lambda x: ' : ' + x if pd.notnull(x) else '')

#Drop unused columns
df_books = df_books.drop(['published_year', 'subtitle', 'isbn13'], axis=1, errors='ignore')
df_movies = df_movies.drop(['Released_Year'], axis=1, errors='ignore')

#Rename the columns to be the same as the books
df_movies.columns = ['title', 'categories', 'authors', 'description', 'ratings_count', 'average_rating']

#Add a column to identify the type of content
df_books['content_type'] = 'book' 
df_movies['content_type'] = 'movie'

#Save separated ratings and content dataframes for books and movies
df_books_ratings = df_books[['title', 'ratings_count', 'average_rating']]
df_books_content = df_books.drop(['ratings_count', 'average_rating'], axis=1, errors='ignore')

df_movies_ratings = df_movies[['title', 'ratings_count', 'average_rating']]
df_movies_content = df_movies.drop(['ratings_count', 'average_rating'], axis=1, errors='ignore')

#Normalize the books ratings
df_books_ratings.loc[:, 'average_rating'] = df_books_ratings['average_rating'] * 2





Now we can merge the two content dataframes, keeping an eye on the dimensions.

In [4]:
print("df_books_content dimensions:", df_books_content.shape)
print("df_movies_content dimensions:", df_movies_content.shape)
cross_content = pd.concat([df_books_content, df_movies_content])
print("cross_content dimensions:", cross_content.shape)
#And do the same fot the ratings
cross_rating = pd.concat([df_books_ratings, df_movies_ratings])


title_type_df = cross_content[['title', 'content_type']]


df_books_content dimensions: (1000, 5)
df_movies_content dimensions: (1000, 5)
cross_content dimensions: (2000, 5)


Now we condense all the non title attributes in one TAGS column.

In [5]:
cross_content['tags'] = cross_content.apply(lambda row: '; '.join([str(row[col]) for col in cross_content.columns if col != 'title']), axis=1)
cross_content = cross_content.drop(['categories', 'authors', 'description', 'content_type'], axis=1, errors='ignore')

Now we apply preprocessing to clean the tags column to remove all non essential caracteristics.

In [6]:
# Download stop words if running for the first time!!!
#nltk.download('stopwords')

def preprocess_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # 3. Remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    # 4. Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    # Join words back to a single string
    return ' '.join(words)

cross_content['tags'] = cross_content['tags'].apply(preprocess_text)


Now the cross_content dataframe is clean with a title key and a tags string rappresentative of the content. 

In [7]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0.0)
tfidf_matrix = tf.fit_transform(cross_content['tags'])

Now use cosine similarity to compare the vectors.

In [8]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_recommendations(content_title, k):
    # Get the index of the content with the given title
    content_index = cross_content[cross_content['title'] == content_title].index[0]

    # Get the similarity scores for the content_index
    sim_scores = list(enumerate(cosine_sim[content_index]))

    # Sort the similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top k+1 similar contents (including the content itself)
    top_indices = [i[0] for i in sim_scores[1:k+1]]

    # Get the titles of the recommended contents
    recommended_titles = cross_content.iloc[top_indices]['title'].tolist()

    return recommended_titles

k = 8  # Number of similar contents to recommend
content_title = "The Avengers"
recommendations = get_recommendations(content_title, 20)

def print_content_type(titles):
    for title in titles:
        content_type = title_type_df.loc[title_type_df['title'] == title, 'content_type'].values[0]
        if content_type == 'book':
            print(f"{title} (Book)")
        elif content_type == 'movie':
            print(f"{title} (Movie)")
        else:
            print(f"{title} (Unknown Content Type)")

# Example usage:
print_content_type(recommendations)

Wizard's First Rule (Book)
The Pillars of Creation (Book)
Naked Empire (Book)
Debt of Bones (Book)
Chainfire (Book)
The Light Fantastic (Book)
Ilse Witch (Book)
Magic Kingdom for Sale : Sold (Book)
Magician : Master (Book)
Forever Odd (Book)
Monsters, Inc. (Movie)
Home Alone (Movie)
The Satanic Verses : A Novel (Book)
The Naked and the Dead : 50th Anniversary Edition, With a New Introduction by the Author (Book)
Terminator 2: Judgment Day (Movie)
Idi i smotri (Movie)
Stone of Farewell (Book)
A Clash of Kings (Book)
Job, a Comedy of Justice (Book)
Ballet Shoes (Book)


As seen the titles recommended are not too relevant because average ratings and the number of ratings are not taken into consideration yet.

The final step saving fuck all.

In [9]:
cross_content.to_csv('cross_content.csv', index=False)