Table Of Contents

>[Import Libraries](#scrollTo=xEjZ1jJTj2tn)

>[Create a Custom Dataset](#scrollTo=-vy-GCG9h_8_)

>[Baseline: Prefiltering & Collaborative Filtering](#scrollTo=_rQSQ3TUkb0C)

>[Suggested Methode (Markov Model)](#scrollTo=rgFZyhxKuDJ1)

>[Comparison between two approaches](#scrollTo=HXOBCX8fuhBx)



# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from collections import defaultdict
import time

# Create a Custom Dataset

In [2]:
random_seed_custom = 56

In [39]:
random.seed(random_seed_custom)
np.random.seed(random_seed_custom)

num_users = 5_000
num_items = 350
num_ratings = 5_0000

user_ids = np.random.randint(1, num_users + 1, num_ratings)
item_ids = np.random.randint(1, num_items + 1, num_ratings)
ratings = np.random.randint(1, 6, num_ratings)

time_of_day = np.random.choice(["Morning", "AfterNoon", "Night"], num_ratings)
mood = np.random.choice(["Happy", "Sad", "Angry"], num_ratings)
whether = np.random.choice(["Rainy", "Sunny"], num_ratings)
device_type = np.random.choice(["Mobile", "Laptop", "TV"], num_ratings)

synthetic_data = pd.DataFrame({
    "user_id": user_ids,
    "item_id": item_ids,
    "rating": ratings,
    "time_of_day": time_of_day,
    "device_type": device_type,
    "mood": mood,
    "whether": whether
})

synthetic_data

Unnamed: 0,user_id,item_id,rating,time_of_day,device_type,mood,whether
0,2533,24,5,AfterNoon,Mobile,Happy,Sunny
1,400,184,2,Morning,Mobile,Happy,Rainy
2,3265,163,2,AfterNoon,Mobile,Sad,Rainy
3,1260,169,3,Night,Laptop,Sad,Sunny
4,1147,282,4,Night,Laptop,Sad,Sunny
...,...,...,...,...,...,...,...
49995,1731,149,4,Night,TV,Happy,Sunny
49996,1281,137,1,Night,Mobile,Happy,Sunny
49997,1743,262,3,Morning,Laptop,Sad,Sunny
49998,4228,328,4,Night,TV,Sad,Rainy


# Baseline: Prefiltering & Collaborative Filtering

In [40]:
# Prefiltering methode like (Afternoon, Charger, Angry, whether)
filtered_data = synthetic_data[(synthetic_data["time_of_day"] == "Morning") &
                               (synthetic_data["device_type"] == "Laptop") &
                               (synthetic_data["mood"] == "Angry") &
                               (synthetic_data["whether"] == "Sunny")]

# Creating Item-User Matrix for Collaboration Filtering
user_item_matrix = filtered_data.pivot(index="user_id", columns="item_id", values="rating").fillna(0)
sparse_matrix = csr_matrix(user_item_matrix)

user_similarity = cosine_similarity(sparse_matrix)

user_sim_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

print("Users Similarity")
user_sim_df.head(10)

Users Similarity


user_id,1,7,8,12,22,29,32,35,37,38,...,4923,4931,4940,4943,4949,4966,4973,4980,4981,4997
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
32,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find Similar Users

In [41]:
def find_similarity(target):
    if target in user_sim_df.index:
        similar_users = user_sim_df[target].sort_values(ascending=False)
        print(f"5 Users most similar to user {target}:")
        return similar_users.head(5)
    else:
        return f"User {target} does not exist in the similarity matrix."

In [45]:
find_similarity(8)

5 Users most similar to user 8:


Unnamed: 0_level_0,8
user_id,Unnamed: 1_level_1
8,1.0
2583,1.0
3963,0.83205
1,0.0
3433,0.0


- `Now, we know similar users to user_id = 8. If this user for example buy a laptop in a Rainy Night with Angry mood, we can suggest the laptop in a similar conditions for user_id = 2583.`

In [46]:
find_similarity(38)

5 Users most similar to user 38:


Unnamed: 0_level_0,38
user_id,Unnamed: 1_level_1
38,1.0
1156,0.970143
1776,0.242536
4468,0.242536
4601,0.189389


In [47]:
find_similarity(12)

5 Users most similar to user 12:


Unnamed: 0_level_0,12
user_id,Unnamed: 1_level_1
12,1.0
583,1.0
1,0.0
3426,0.0
3358,0.0


# Suggested Methode (Markov Model)

In [52]:
user_sequences = synthetic_data.groupby("user_id")["item_id"].apply(list)

transition_matrix = defaultdict(lambda: defaultdict(int))

for sequence in user_sequences:
    for i in range(len(sequence) - 1):
        transition_matrix[sequence[i]][sequence[i + 1]] += 1

for movie, transitions in transition_matrix.items():
    total = sum(transitions.values())
    for next_movie in transitions:
        transition_matrix[movie][next_movie] /= total

def predict_next_movie(current_movie):
    if current_movie in transition_matrix:
        return max(transition_matrix[current_movie], key=transition_matrix[current_movie].get)
    return None

test_movie = 78
predicted_movie = predict_next_movie(test_movie)
print(f"After watching movie {test_movie}, the model suggest to watch movie {predicted_movie}")

After watching movie 78, the model suggest to watch movie 270


# Comparison between two approaches

In [53]:
def evaluate_models():
    start_time = time.time()
    baseline_results = user_sim_df.mean().mean()
    baseline_time = time.time() - start_time

    start_time = time.time()
    markov_results = np.mean([len(seq) for seq in user_sequences])
    markov_time = time.time() - start_time

    print(f"Approach 1: Prefiltered & Collaboration Filtering(Mean Similarity) {baseline_results:.3f}, time= {baseline_time:.4f} seconds")
    print(f"Approach 2: Morkov Model(Mean Sequence) {markov_results:.3f}, time= {markov_time:.4f} seconds")

evaluate_models()


Approach 1: Prefiltered & Collaboration Filtering(Mean Similarity) 0.004, time= 0.0060 seconds
Approach 2: Morkov Model(Mean Sequence) 10.002, time= 0.0021 seconds
