In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the transaction export from the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="online_retail.csv")
df.head(n=5)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
# Print column dtypes and non-null counts to spot missing values before cleaning.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479703 entries, 0 to 479702
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    479703 non-null  object 
 1   StockCode    479703 non-null  object 
 2   Description  478300 non-null  object 
 3   Quantity     479703 non-null  int64  
 4   InvoiceDate  479703 non-null  object 
 5   UnitPrice    479702 non-null  float64
 6   CustomerID   361381 non-null  float64
 7   Country      479702 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 29.3+ MB


In [4]:
# Narrow the frame to the three columns the user-item matrix is built from:
# who bought (CustomerID), what (StockCode) and how many (Quantity).
df = df[
    [
        'CustomerID',
        'StockCode',
        'Quantity',
    ]
]
df.head(n=5)


Unnamed: 0,CustomerID,StockCode,Quantity
0,17850.0,85123A,6
1,17850.0,71053,6
2,17850.0,84406B,8
3,17850.0,84029G,6
4,17850.0,84029E,6


In [5]:
# Discard rows with any missing field, then keep only rows whose
# Quantity is strictly positive.
df = (
    df
    .dropna()
    .loc[lambda frame: frame['Quantity'] > 0]
)


In [6]:
# CustomerID was parsed as float64 because the raw column contained NaNs.
# With the NaN rows already dropped it can be cast to int.
# `.assign` builds a new frame instead of writing a column into the result of
# the earlier boolean filter, which avoids pandas' SettingWithCopyWarning and
# keeps this cell idempotent when re-run.
df = df.assign(CustomerID=df['CustomerID'].astype(int))


In [7]:
# Build the customer x stock-code matrix: each cell is the total Quantity a
# customer bought of that stock code, with 0 where no purchase was recorded.
# NOTE(review): the resulting columns include codes such as POST, BANK CHARGES,
# DOT, M and C2, which look like fees/adjustments rather than products --
# confirm and consider filtering them out before computing similarities.
user_item_matrix = pd.pivot_table(
    df,
    values='Quantity',
    index='CustomerID',
    columns='StockCode',
    aggfunc='sum',
    fill_value=0,
)

user_item_matrix.head(n=5)


StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
12350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,7


In [8]:
# Pairwise cosine similarity between every pair of rows (customers) of the
# user-item matrix; the result is a square array in the same row order.
user_similarity = cosine_similarity(X=user_item_matrix)


In [9]:
# Wrap the raw similarity array in a DataFrame labelled by customer ID on both
# axes so similarities can be looked up with `.loc[customer_id]`.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    columns=user_item_matrix.index,
    index=user_item_matrix.index,
)


In [11]:
def recommend_products(customer_id, top_n=5, n_neighbors=5, exclude_purchased=False):
    """Recommend stock codes for a customer from their most similar customers.

    The customer's nearest neighbours are looked up in the notebook-level
    ``user_similarity_df``; the neighbours' rows of ``user_item_matrix`` are
    summed per stock code and the highest totals are returned.

    Parameters
    ----------
    customer_id :
        Label of the customer in ``user_similarity_df`` / ``user_item_matrix``.
    top_n : int, default 5
        Number of stock codes to return.
    n_neighbors : int, default 5
        Number of most-similar customers whose purchases are aggregated.
    exclude_purchased : bool, default False
        When True, stock codes the customer already has a positive quantity
        for are removed from the result.

    Returns
    -------
    pandas.Series
        Summed neighbour quantities indexed by stock code, sorted descending,
        at most ``top_n`` entries.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in ``user_similarity_df``.
    """
    if customer_id not in user_similarity_df.index:
        raise KeyError(
            f"customer_id {customer_id!r} is not present in user_similarity_df"
        )

    similar_users = (
        user_similarity_df.loc[customer_id]
        # Remove the customer by label rather than by assuming they sort
        # first: another customer with similarity 1.0 can tie with them.
        .drop(labels=customer_id)
        # Stable sort so tied similarities resolve deterministically.
        .sort_values(ascending=False, kind="mergesort")
        .head(n_neighbors)
    )

    recommended_products = user_item_matrix.loc[similar_users.index].sum()

    if exclude_purchased:
        already_bought = user_item_matrix.loc[customer_id] > 0
        recommended_products = recommended_products[~already_bought]

    return (
        recommended_products
        .sort_values(ascending=False, kind="mergesort")
        .head(top_n)
    )


In [12]:
# Demo: take the first customer ID in the user-item matrix and show the
# default recommendations for them.
sample_customer = user_item_matrix.index[0]
recommend_products(customer_id=sample_customer)


Unnamed: 0_level_0,0
StockCode,Unnamed: 1_level_1
23167,202
23166,164
22962,72
47566,62
23165,52
