In [6]:
!pip install numpy==1.23.5
!pip install scipy==1.10.1
!pip install seaborn==0.12.2
!pip install scikit-surprise --no-binary :all:



In [2]:
# Installing surprise
!pip install scikit-surprise

# Importing necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from google.colab import files
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Loading dataset
drive.mount('/content/drive')
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/BUSA710/OnlineRetail.csv', encoding='latin-1')

# Dropping rows with missing CustomerID or Description
data = data.dropna(subset=["CustomerID", "Description"])

# Removing canceled orders (InvoiceNo starting with 'C')
data = data[~data["InvoiceNo"].astype(str).str.startswith("C")]

# Converting CustomerID to int
data["CustomerID"] = data["CustomerID"].astype(int)

Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["CustomerID"] = data["CustomerID"].astype(int)


In [3]:
# Capping Quantity to 5 for a 1–5 rating scale
data["rating"] = data["Quantity"].clip(upper=5)

# Previewing cleaned data
data_cleaned = data[["CustomerID", "StockCode", "Description", "rating"]]
data_cleaned.head()

# Preparing data for model
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(data_cleaned[["CustomerID", "StockCode", "rating"]], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.2)

# Training model using SVD
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7d0ea41ee950>

In [4]:
# Predicting a rating
pred = model.predict(uid=17850, iid='84029G')
print(f"Predicted rating for user 17850 and product 84029G: {pred.est:.2f}")

# Recommending top-N products to a user
def get_top_n_recommendations(user_id, model, data, n=5):
    product_ids = data["StockCode"].unique()
    predictions = []
    for pid in product_ids:
        pred = model.predict(user_id, pid)
        predictions.append((pid, pred.est))
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    return top_n

top_recs = get_top_n_recommendations(17850, model, data_cleaned)
print("Top recommendations for user 17850:")
for pid, score in top_recs:
    desc = data_cleaned[data_cleaned["StockCode"] == pid]["Description"].iloc[0]
    print(f"{desc} (Predicted Rating: {score:.2f})")

Predicted rating for user 17850 and product 84029G: 4.97
Top recommendations for user 17850:
WHITE HANGING HEART T-LIGHT HOLDER (Predicted Rating: 5.00)
CREAM CUPID HEARTS COAT HANGER (Predicted Rating: 5.00)
HAND WARMER UNION JACK (Predicted Rating: 5.00)
ASSORTED COLOUR BIRD ORNAMENT (Predicted Rating: 5.00)
STARS GIFT TAPE  (Predicted Rating: 5.00)


In [5]:
# Creating a list of the top 100 most frequently purchased products
popular_products = (
    data_cleaned.groupby("StockCode")
    .agg({'Description': 'first', 'rating': 'count'})
    .sort_values('rating', ascending=False)
    .head(100)
    .reset_index()
)

# Saving it as a CSV file
popular_products.to_csv("popular_products.csv", index=False)

# Downloading the file to your computer
from google.colab import files
files.download("popular_products.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>