In [None]:
# Install necessary libraries (if needed)
!pip install scikit-learn
!pip install surprise
!pip install pandas
!pip install numpy
!pip install scikit-learn


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2463300 sha256=d1a83b2eb96f483253585f6203f8feb3025909d5db95e585dc52e40546eeee49
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installi

## Load Dataset

In [None]:
import pandas as pd

# Read the raw CSV exports; paths are relative to the notebook's working directory.
events = pd.read_csv('events.csv')
item_properties1 = pd.read_csv('item_properties_part1.csv')
item_properties2 = pd.read_csv('item_properties_part2.csv')
category_tree = pd.read_csv('category_tree.csv')

# The item properties arrive as two files; stack them into a single frame.
# ignore_index=True renumbers the rows 0..n-1 so every row label is unique.
# Without it each part keeps its own 0-based labels, and label-based lookups
# (.loc) or index alignment on the combined frame become ambiguous.
item_properties = pd.concat([item_properties1, item_properties2], ignore_index=True)

print("Data Loaded Successfully!")
print("Events shape:", events.shape)
print("Item Properties shape:", item_properties.shape)


Data Loaded Successfully!
Events shape: (2756101, 5)
Item Properties shape: (1500393, 7)


## Basic Preprocessing

In [None]:
# Interaction log: keep only the rows whose event type is 'view', and only the
# three columns (who, what, when).
events = events.loc[
    lambda frame: frame['event'] == 'view',
    ['visitorid', 'itemid', 'timestamp'],
]

# Item metadata: keep only the rows describing one of the listed properties,
# reduced to the (item, property, value) triple.
item_properties = item_properties.loc[
    lambda frame: frame['property'].isin(['categoryid', 'available', 'brand']),
    ['itemid', 'property', 'value'],
]

# Reshape long -> wide: one row per item, one column per property. When an
# item has several rows for the same property the first one encountered wins.
# Items that lack a value in any resulting column are then discarded.
item_features = (
    item_properties
    .pivot_table(index='itemid', columns='property', values='value', aggfunc='first')
    .dropna()
)

print("Preprocessing Done!")
item_features.head()


Preprocessing Done!


property,available,categoryid
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1
83.0,0,619
294.0,0,1007
320.0,0,929
721.0,0,977
1154.0,0,209


## Content-Based Filtering Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 'categoryid' is the only descriptive attribute used here, so each item's
# "document" is simply its category id rendered as text.
item_features['combined_features'] = item_features['categoryid'].astype(str)

# TfidfVectorizer's default token pattern only keeps tokens of two or more
# word characters, so a single-character category id (e.g. "7") would be
# thrown away and that item would end up with an all-zero vector (similar to
# nothing). Accept tokens of any length so every category id is represented.
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = tfidf.fit_transform(item_features['combined_features'])

# Pairwise cosine similarity between all items. Row/column i corresponds to
# the i-th row of item_features. NOTE: the result is a dense
# n_items x n_items array, so memory grows quadratically with the item count.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Content-Based Filtering model is ready!")



Content-Based Filtering model is ready!


## Recommendation Function (Content-Based)

In [None]:
# Lookup table from item id to row position: the Series *values* are the item
# ids taken from item_features.index, and the Series' own default 0..n-1 index
# is each item's row position in item_features (and therefore in cosine_sim,
# which was computed from item_features in the same row order).
indices = pd.Series(item_features.index)

def recommend_items_content(item_id, num_recommendations=5):
    """Return the ids of the items most similar to ``item_id``.

    Similarity is read from the module-level ``cosine_sim`` matrix; row
    positions are resolved through ``indices`` and mapped back to item ids via
    ``item_features.index``.

    Parameters
    ----------
    item_id
        Id of the query item, compared by equality against ``indices``.
    num_recommendations : int, default 5
        Maximum number of item ids to return.

    Returns
    -------
    list
        Up to ``num_recommendations`` item ids, most similar first (ties keep
        their row order). The query item itself is never included. If
        ``item_id`` is unknown, the single-element list ``["Item not found."]``
        is returned.
    """
    matches = indices[indices == item_id].index
    if len(matches) == 0:
        return ["Item not found."]
    idx = matches[0]

    # Exclude the query item by position rather than by "drop the first sorted
    # entry": when several items tie with the query (e.g. every item in the
    # same category scores 1.0) the query is not guaranteed to sort first, so
    # slicing off element 0 could drop a real candidate and recommend the
    # query item to itself.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # list.sort is stable, so equally similar items keep their row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [position for position, _ in candidates[:num_recommendations]]
    return item_features.index[top_positions].tolist()

# Demo: ask for the default number of recommendations for item id 83.
# The bare call on the last line is rendered as the cell's output.
print("Content-based Recommendations for a sample item:")
recommend_items_content(item_id=83)


Content-based Recommendations for a sample item:


[10674.0, 74755.0, 78337.0, 96206.0, 167782.0]

## Load Dataset: Events

In [None]:
import pandas as pd

# Reload the raw interaction log and derive the implicit-feedback table in one
# chain. Every step returns a new frame, so the `rating` column is never
# written onto a filtered slice of another frame (the pattern that can trigger
# pandas' SettingWithCopyWarning).
events = (
    pd.read_csv('events.csv')
    # Keep only 'view' events to simplify the signal.
    .loc[lambda frame: frame['event'] == 'view']
    # Implicit feedback: every view counts as a rating of 1.
    .assign(rating=1)
    # Keep only the columns used downstream, in this order.
    .loc[:, ['visitorid', 'itemid', 'rating', 'timestamp']]
)

print(events.head())
print("Events data loaded:", events.shape)


   visitorid  itemid  rating      timestamp
0     257597  355908       1  1433221332117
1     992329  248676       1  1433224214164
2     111016  318965       1  1433221999827
3     483717  253185       1  1433221955914
4     951259  367447       1  1433221337106
Events data loaded: (2664312, 4)


## Filter active users and popular items

In [None]:
# Visitor ids that occur in at least 10 rows of `events`.
active_users = (
    events['visitorid']
    .value_counts()
    .loc[lambda counts: counts >= 10]
    .index
)

# Item ids that occur in at least 20 rows of `events`.
popular_items = (
    events['itemid']
    .value_counts()
    .loc[lambda counts: counts >= 20]
    .index
)

# Keep a row only when BOTH its visitor and its item passed the thresholds.
# Both counts were taken on the unfiltered `events`, so a visitor or item may
# end up with fewer rows than its threshold in the filtered result.
filtered_events = events[
    events['visitorid'].isin(active_users)
    & events['itemid'].isin(popular_items)
]

print("Filtered events shape:", filtered_events.shape)


Filtered events shape: (395302, 4)


## Create the User-Item Matrix

In [None]:
# Wide interaction table: one row per visitor, one column per item, cell =
# aggregated `rating` for that pair (pivot_table's default aggregation), and 0
# where the visitor never interacted with the item.
# NOTE: this is a dense visitors x items frame, so its size grows with the
# product of the two counts.
user_item_matrix = pd.pivot_table(
    filtered_events,
    values='rating',
    index='visitorid',
    columns='itemid',
    fill_value=0,
)

print("User-Item matrix created:", user_item_matrix.shape)


User-Item matrix created: (20576, 24479)


## Build and Train the KNN Collaborative Model

In [None]:
from sklearn.neighbors import NearestNeighbors

# User-based nearest-neighbour index: each row of user_item_matrix (one
# visitor) is a point, neighbours are ranked by cosine distance, and the
# search is exhaustive ('brute'). `fit` returns the estimator itself, so the
# construction and fitting are written as one expression.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute').fit(user_item_matrix)

print("Collaborative Filtering model trained!")


Collaborative Filtering model trained!


## Make Recommendations for a User

In [None]:
def recommend_items_collaborative(user_id, num_recommendations=5):
    """Recommend items that a user's nearest neighbours interacted with.

    Uses the module-level ``user_item_matrix`` (visitors x items) and the
    ``model_knn`` neighbour index fitted on it.

    Parameters
    ----------
    user_id
        A label of ``user_item_matrix.index``.
    num_recommendations : int, default 5
        Maximum number of items to return; also the number of neighbours
        consulted.

    Returns
    -------
    list
        Up to ``num_recommendations`` distinct item ids the user has a 0 for
        and at least one neighbour has a positive value for, ordered by
        neighbour rank and then by column order. Returns
        ``["User not found."]`` when ``user_id`` is not in the matrix, and an
        empty list when ``num_recommendations`` is not positive.
    """
    if user_id not in user_item_matrix.index:
        return ["User not found."]
    if num_recommendations <= 0:
        return []

    user_row = user_item_matrix.loc[user_id]
    user_vector = user_row.values.reshape(1, -1)

    # Ask for one extra neighbour because the user is normally their own
    # nearest neighbour. Cap at the number of rows: kneighbors rejects requests
    # for more neighbours than there are fitted samples.
    n_neighbors = min(num_recommendations + 1, len(user_item_matrix))
    _, neighbor_positions = model_knn.kneighbors(user_vector, n_neighbors=n_neighbors)

    neighbor_ids = user_item_matrix.index[neighbor_positions.flatten()].tolist()
    # Filter the user out instead of list.remove(user_id): when other users tie
    # at distance 0 (identical interaction rows) the user may be missing from
    # the returned neighbours, and remove() would raise ValueError.
    similar_users = [uid for uid in neighbor_ids if uid != user_id][:num_recommendations]

    # Items the target user has not interacted with; computed once, not per neighbour.
    not_seen = (user_row == 0).to_numpy()

    recommended_items = []
    for similar_user in similar_users:
        neighbor_saw = (user_item_matrix.loc[similar_user] > 0).to_numpy()
        recommended_items.extend(user_item_matrix.columns[not_seen & neighbor_saw].tolist())

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(recommended_items))[:num_recommendations]

# Demo: take the first visitor id in the matrix (any id present in
# user_item_matrix.index works) and print that visitor's recommendations.
example_user_id = user_item_matrix.index[0]
print(f"Recommended Items for user {example_user_id}:")
print(recommend_items_collaborative(user_id=example_user_id))


Recommended Items for user 54:
