# Content-based filtering on dummy data


In [24]:
# Content based filtering on dummy data

## Data Preparation
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.metrics.pairwise import cosine_similarity

# Toy catalogue: six items, each tagged with exactly one genre.
# The two columns are spelled out separately, then combined into `data`.
item_id_column = [1, 2, 3, 4, 5, 6]
genre_column = ["Action", "Comedy", "Action", "Drama", "Comedy", "Action"]
data = dict(item_id=item_id_column, genre=genre_column)

# One row per item; the default RangeIndex (0..5) is the positional index.
df = pd.DataFrame(data)
df


Unnamed: 0,item_id,genre
0,1,Action
1,2,Comedy
2,3,Action
3,4,Drama
4,5,Comedy
5,6,Action


In [25]:
# Turn each item's genre label into a TF-IDF vector.
# (The text being vectorized is the `genre` column, not free-form descriptions.)
genre_corpus = df["genre"]
tfidf = TfidfVectorizer()

# fit_transform learns the vocabulary and returns a sparse matrix of shape
# (# of docs, # of unique words); it is densified only for display.
tfidf_matrix = tfidf.fit_transform(genre_corpus)
tfidf_matrix.toarray()


array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [18]:
## User Profile Creation
# Simulated feedback: the user liked the rows at positions 0 and 2,
# i.e. item_id 1 and item_id 3. These are row positions, not item ids.
liked_items = [0, 2]

## The profile is the column-wise mean of the liked items' TF-IDF rows.
# Averaging the sparse rows yields a 1-row np.matrix rather than an ndarray.
liked_vectors = tfidf_matrix[liked_items]
user_profile = liked_vectors.mean(axis=0)

user_profile

matrix([[1., 0., 0.]])

In [None]:
## Recommendation Generation

# Score every item against the user profile with cosine similarity.
# The profile is converted to a plain ndarray before being handed to scikit-learn.
profile_2d = np.array(user_profile)
similarity_row = cosine_similarity(profile_2d, tfidf_matrix)

# Collapse the (1, n_items) result into a flat vector: one score per item row.
cosine_similarities = similarity_row.flatten()
cosine_similarities


array([1., 0., 1., 0., 0., 1.])

In [23]:
# Get top N recommendations
N = 1

# Items the user already liked score highest against a profile built from
# them, so they are masked out before ranking; otherwise any N > 1 would
# "recommend" items the user has already liked. Work on a copy so that
# `cosine_similarities` is left untouched and this cell stays re-runnable.
candidate_scores = cosine_similarities.copy()
candidate_scores[liked_items] = -np.inf

# Rank best-first with a stable sort so ties resolve deterministically
# (lowest row position first). Slicing from the front avoids the
# `argsort()[-N:]` pitfall where N == 0 returns every item ([-0:] == [0:]).
ranked_indices = np.argsort(-candidate_scores, kind="stable")

# Never return more rows than there are unseen candidates, and treat a
# non-positive N as "no recommendations".
n_candidates = len(candidate_scores) - len(set(liked_items))
top_n_indices = ranked_indices[: max(0, min(N, n_candidates))]
recommended_items = df.iloc[top_n_indices]

recommended_items


Unnamed: 0,item_id,genre
5,6,Action
