<a href="https://colab.research.google.com/github/Durgasai26/Machine-learning/blob/main/INT423_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


In [3]:

events = pd.read_csv('/content/events.csv')
category_tree = pd.read_csv('/content/category_tree.csv')
item_properties_part1 = pd.read_csv('/content/item_properties_part1.csv')
item_properties_part2 = pd.read_csv('/content/item_properties_part2.csv')

print(events.head())
print(category_tree.head())
print(item_properties_part1.head())
print(item_properties_part2.head())


       timestamp  visitorid event    itemid  transactionid
0  1433221332117     257597  view  355908.0            NaN
1  1433224214164     992329  view  248676.0            NaN
2  1433221999827     111016  view  318965.0            NaN
3  1433221955914     483717  view  253185.0            NaN
4  1433221337106     951259  view  367447.0            NaN
   categoryid  parentid
0        1016     213.0
1         809     169.0
2         570       9.0
3        1691     885.0
4         536    1691.0
       timestamp  itemid    property                            value
0  1435460400000  460429  categoryid                             1338
1  1441508400000  206783         888          1116713 960601 n277.200
2  1439089200000  395014         400  n552.000 639502 n720.000 424566
3  1431226800000   59481         790                       n15360.000
4  1431831600000  156781         917                           828513
       timestamp  itemid property            value
0  1433041200000  183478      5

In [4]:
events_sampled = events.sample(frac=0.01, random_state=42)
print(f"Sampled data shape: {events_sampled.shape}")

events_filtered = events_sampled[events_sampled['event'].isin(['view', 'purchase'])]

interaction_counts = events_filtered.groupby(['visitorid', 'itemid']).size().reset_index(name='count')

user_ids = interaction_counts['visitorid'].astype('category').cat.codes
item_ids = interaction_counts['itemid'].astype('category').cat.codes
counts = interaction_counts['count']


Sampled data shape: (25760, 5)


In [5]:
user_item_matrix_sparse = coo_matrix((counts, (user_ids, item_ids)))
print(f"Shape of the sparse user-item interaction matrix: {user_item_matrix_sparse.shape}")


Shape of the sparse user-item interaction matrix: (23316, 18329)


In [6]:
svd = TruncatedSVD(n_components=10)
user_item_matrix_svd = svd.fit_transform(user_item_matrix_sparse)

predicted_ratings = np.dot(user_item_matrix_svd, svd.components_)
print(f"Shape of the predicted ratings matrix: {predicted_ratings.shape}")


Shape of the predicted ratings matrix: (23316, 18329)


In [7]:
item_properties = pd.concat([item_properties_part1, item_properties_part2], ignore_index=True)

grouped = item_properties.groupby(['itemid', 'property'])['value'].first().reset_index()

itemid_codes = grouped['itemid'].astype('category').cat.codes
property_codes = grouped['property'].astype('category').cat.codes

values = pd.to_numeric(grouped['value'], errors='coerce').fillna(0)

sparse_matrix = coo_matrix((values, (itemid_codes, property_codes)))

sparse_matrix_csr = sparse_matrix.tocsr()
print(f"Shape of the sparse item feature matrix: {sparse_matrix_csr.shape}")


Shape of the sparse item feature matrix: (414826, 1079)


In [8]:
item_similarity_matrix = cosine_similarity(sparse_matrix_csr.T)

item_similarity_sparse = csr_matrix(item_similarity_matrix)

print("Similarity of the first item with others:")
print(item_similarity_matrix[0])


Similarity of the first item with others:
[1.         0.         0.         ... 0.         0.0075008  0.03089526]


In [9]:
def get_recommendations(user_id, n=10):
    user_interactions = events_filtered[events_filtered['visitorid'] == user_id]

    user_item_ids = user_interactions['itemid'].unique()

    user_item_codes = pd.Categorical(user_item_ids).codes

    user_similarity_scores = np.zeros(item_similarity_sparse.shape[0])

    for item_code in user_item_codes:
        user_similarity_scores += item_similarity_sparse[item_code].toarray().flatten()

    recommended_item_indices = np.argsort(user_similarity_scores)[::-1][:n]

    recommended_item_ids = pd.Categorical.from_codes(recommended_item_indices, categories=item_properties['itemid'].unique()).categories

    return recommended_item_ids

user_id = 257597
recommendations = get_recommendations(user_id, n=10)
print(f"Top 10 recommended items for user {user_id}: {recommendations}")


Top 10 recommended items for user 257597: Index([460429, 206783, 395014,  59481, 156781, 285026,  89534, 264312, 229370,
        98113,
       ...
       114033,  45965, 135880, 232427, 315727,  88405, 298244, 224712, 371645,
       414806],
      dtype='int64', length=414826)


In [10]:
user_mapping = {user_id: idx for idx, user_id in enumerate(events_filtered['visitorid'].unique())}


In [11]:
import pandas as pd
import numpy as np


popular_items = events_filtered.groupby('itemid').size().sort_values(ascending=False).index[:100]

def hybrid_recommendation(user_id, n=10, alpha=0.5):
    if user_id not in user_mapping:
        print(f"User ID {user_id} not found in the predicted ratings matrix. Returning default recommendations.")
        return popular_items[:n]
    user_index = user_mapping[user_id]
    collaborative_scores = predicted_ratings[user_index]

    item_similarity_scores = np.zeros_like(collaborative_scores)

    user_interactions = events_filtered[events_filtered['visitorid'] == user_id]
    user_item_ids = user_interactions['itemid'].unique()
    user_item_codes = pd.Categorical(user_item_ids).codes

    for item_code in user_item_codes:
        item_similarity_scores += item_similarity_sparse[item_code].toarray().flatten()

    hybrid_scores = alpha * collaborative_scores + (1 - alpha) * item_similarity_scores

    recommended_item_indices = np.argsort(hybrid_scores)[::-1][:n]
    recommended_item_ids = pd.Categorical.from_codes(recommended_item_indices, categories=item_properties['itemid'].unique()).categories

    return recommended_item_ids

user_id = 257597
try:
    recommendations_hybrid = hybrid_recommendation(user_id, n=10, alpha=0.5)
    print(f"Hybrid top 10 recommended items for user {user_id}: {recommendations_hybrid}")
except ValueError as e:
    print(e)


User ID 257597 not found in the predicted ratings matrix. Returning default recommendations.
Hybrid top 10 recommended items for user 257597: Index([187946.0, 461686.0, 370653.0, 384302.0, 219512.0,  96924.0,   7943.0,
       111530.0, 400946.0, 335975.0],
      dtype='float64', name='itemid')


In [12]:
def post_process_recommendations(user_id, recommendations, events_filtered, n=10):
    user_interactions = events_filtered[events_filtered['visitorid'] == user_id]
    user_item_ids = user_interactions['itemid'].unique()

    filtered_recommendations = [item for item in recommendations if item not in user_item_ids]

    if len(filtered_recommendations) < n:
        recent_interactions = user_interactions.sort_values(by='timestamp', ascending=False)
        recent_items = recent_interactions['itemid'].head(n - len(filtered_recommendations))
        filtered_recommendations.extend([item for item in recent_items if item not in filtered_recommendations])

    filtered_recommendations = filtered_recommendations[:n]

    return filtered_recommendations

processed_recommendations = post_process_recommendations(user_id, recommendations_hybrid, events_filtered, n=10)
print(f"Processed top 10 recommendations for user {user_id}: {processed_recommendations}")


Processed top 10 recommendations for user 257597: [187946.0, 461686.0, 370653.0, 384302.0, 219512.0, 96924.0, 7943.0, 111530.0, 400946.0, 335975.0]


In [13]:
popular_items = events_filtered.groupby('itemid').size().sort_values(ascending=False).index[:100]


In [14]:
# User ID for which we want the recommendations
user_id = 257597

# Step 1: Generate Hybrid Recommendations
try:
    recommendations_hybrid = hybrid_recommendation(user_id, n=10, alpha=0.5)
    print(f"Hybrid top 10 recommended items for user {user_id}: {recommendations_hybrid}")
except ValueError as e:
    print(e)

# Step 2: Post-process to remove already interacted items (views or purchases)
processed_recommendations = post_process_recommendations(user_id, recommendations_hybrid, events_filtered, n=10)
print(f"Processed top 10 recommendations for user {user_id}: {processed_recommendations}")


User ID 257597 not found in the predicted ratings matrix. Returning default recommendations.
Hybrid top 10 recommended items for user 257597: Index([187946.0, 461686.0, 370653.0, 384302.0, 219512.0,  96924.0,   7943.0,
       111530.0, 400946.0, 335975.0],
      dtype='float64', name='itemid')
Processed top 10 recommendations for user 257597: [187946.0, 461686.0, 370653.0, 384302.0, 219512.0, 96924.0, 7943.0, 111530.0, 400946.0, 335975.0]


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity



events = pd.read_csv('/content/events.csv')
category_tree = pd.read_csv('/content/category_tree.csv')
item_properties_part1 = pd.read_csv('/content/item_properties_part1.csv')
item_properties_part2 = pd.read_csv('/content/item_properties_part2.csv')

print(events.head())
print(category_tree.head())
print(item_properties_part1.head())
print(item_properties_part2.head())


events_sampled = events.sample(frac=0.01, random_state=42)
print(f"Sampled data shape: {events_sampled.shape}")

events_filtered = events_sampled[events_sampled['event'].isin(['view', 'purchase'])]

interaction_counts = events_filtered.groupby(['visitorid', 'itemid']).size().reset_index(name='count')

user_ids = interaction_counts['visitorid'].astype('category').cat.codes
item_ids = interaction_counts['itemid'].astype('category').cat.codes
counts = interaction_counts['count']


user_item_matrix_sparse = coo_matrix((counts, (user_ids, item_ids)))
print(f"Shape of the sparse user-item interaction matrix: {user_item_matrix_sparse.shape}")


svd = TruncatedSVD(n_components=10)
user_item_matrix_svd = svd.fit_transform(user_item_matrix_sparse)

predicted_ratings = np.dot(user_item_matrix_svd, svd.components_)
print(f"Shape of the predicted ratings matrix: {predicted_ratings.shape}")


item_properties = pd.concat([item_properties_part1, item_properties_part2], ignore_index=True)

grouped = item_properties.groupby(['itemid', 'property'])['value'].first().reset_index()

itemid_codes = grouped['itemid'].astype('category').cat.codes
property_codes = grouped['property'].astype('category').cat.codes

values = pd.to_numeric(grouped['value'], errors='coerce').fillna(0)

sparse_matrix = coo_matrix((values, (itemid_codes, property_codes)))

sparse_matrix_csr = sparse_matrix.tocsr()
print(f"Shape of the sparse item feature matrix: {sparse_matrix_csr.shape}")


item_similarity_matrix = cosine_similarity(sparse_matrix_csr.T)

item_similarity_sparse = csr_matrix(item_similarity_matrix)

print("Similarity of the first item with others:")
print(item_similarity_matrix[0])


def get_recommendations(user_id, n=10):
    user_interactions = events_filtered[events_filtered['visitorid'] == user_id]

    user_item_ids = user_interactions['itemid'].unique()

    user_item_codes = pd.Categorical(user_item_ids).codes

    user_similarity_scores = np.zeros(item_similarity_sparse.shape[0])

    for item_code in user_item_codes:
        user_similarity_scores += item_similarity_sparse[item_code].toarray().flatten()

    recommended_item_indices = np.argsort(user_similarity_scores)[::-1][:n]

    recommended_item_ids = pd.Categorical.from_codes(recommended_item_indices, categories=item_properties['itemid'].unique()).categories

    return recommended_item_ids

user_id = 257597
recommendations = get_recommendations(user_id, n=10)
print(f"Top 10 recommended items for user {user_id}: {recommendations}")


user_mapping = {user_id: idx for idx, user_id in enumerate(events_filtered['visitorid'].unique())}


import pandas as pd
import numpy as np


popular_items = events_filtered.groupby('itemid').size().sort_values(ascending=False).index[:100]

def hybrid_recommendation(user_id, n=10, alpha=0.5):
    if user_id not in user_mapping:
        print(f"User ID {user_id} not found in the predicted ratings matrix. Returning default recommendations.")
        return popular_items[:n]
    user_index = user_mapping[user_id]
    collaborative_scores = predicted_ratings[user_index]

    item_similarity_scores = np.zeros_like(collaborative_scores)

    user_interactions = events_filtered[events_filtered['visitorid'] == user_id]
    user_item_ids = user_interactions['itemid'].unique()
    user_item_codes = pd.Categorical(user_item_ids).codes

    for item_code in user_item_codes:
        item_similarity_scores += item_similarity_sparse[item_code].toarray().flatten()

    hybrid_scores = alpha * collaborative_scores + (1 - alpha) * item_similarity_scores

    recommended_item_indices = np.argsort(hybrid_scores)[::-1][:n]
    recommended_item_ids = pd.Categorical.from_codes(recommended_item_indices, categories=item_properties['itemid'].unique()).categories

    return recommended_item_ids

user_id = 257597
try:
    recommendations_hybrid = hybrid_recommendation(user_id, n=10, alpha=0.5)
    print(f"Hybrid top 10 recommended items for user {user_id}: {recommendations_hybrid}")
except ValueError as e:
    print(e)


def post_process_recommendations(user_id, recommendations, events_filtered, n=10):
    user_interactions = events_filtered[events_filtered['visitorid'] == user_id]
    user_item_ids = user_interactions['itemid'].unique()

    filtered_recommendations = [item for item in recommendations if item not in user_item_ids]

    if len(filtered_recommendations) < n:
        recent_interactions = user_interactions.sort_values(by='timestamp', ascending=False)
        recent_items = recent_interactions['itemid'].head(n - len(filtered_recommendations))
        filtered_recommendations.extend([item for item in recent_items if item not in filtered_recommendations])

    filtered_recommendations = filtered_recommendations[:n]

    return filtered_recommendations

processed_recommendations = post_process_recommendations(user_id, recommendations_hybrid, events_filtered, n=10)
print(f"Processed top 10 recommendations for user {user_id}: {processed_recommendations}")


popular_items = events_filtered.groupby('itemid').size().sort_values(ascending=False).index[:100]



       timestamp  visitorid event    itemid  transactionid
0  1433221332117     257597  view  355908.0            NaN
1  1433224214164     992329  view  248676.0            NaN
2  1433221999827     111016  view  318965.0            NaN
3  1433221955914     483717  view  253185.0            NaN
4  1433221337106     951259  view  367447.0            NaN
   categoryid  parentid
0        1016     213.0
1         809     169.0
2         570       9.0
3        1691     885.0
4         536    1691.0
       timestamp  itemid    property                            value
0  1435460400000  460429  categoryid                             1338
1  1441508400000  206783         888          1116713 960601 n277.200
2  1439089200000  395014         400  n552.000 639502 n720.000 424566
3  1431226800000   59481         790                       n15360.000
4  1431831600000  156781         917                           828513
       timestamp  itemid property            value
0  1433041200000  183478      5