In [2]:
# Import the core data-handling and plotting libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the three source CSV files into DataFrames in a single unpacking:
# customers, products and transactions, in that order.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/Customers.csv",
        "/content/Products.csv",
        "/content/Transactions.csv",
    )
)





In [5]:
# Show the column labels of the customers table (sanity check after loading)
customers.columns

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')

In [6]:
# Show the column labels of the products table (sanity check after loading)
products.columns

Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')

In [7]:
# Show the column labels of the transactions table (sanity check after loading)
transactions.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')

In [8]:
# Convert the date columns from strings to pandas datetime values.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
# Write the parsed values back to TransactionDate itself. The transactions
# table has no SignupDate column, so assigning to that name would only add a
# mislabelled duplicate and leave TransactionDate as unparsed strings.
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

Lookalike Model

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Build the customer x product matrix of purchased quantities.
# Step 1: total Quantity for every (CustomerID, ProductID) pair.
quantity_by_pair = transactions.groupby(['CustomerID', 'ProductID']).agg({'Quantity': 'sum'})
# Step 2: pivot ProductID into columns; pairs that never occurred become 0.
customer_product = quantity_by_pair.unstack(fill_value=0)
# Step 3: drop the outer 'Quantity' column level so columns are plain ProductIDs.
customer_product.columns = customer_product.columns.droplevel()

In [10]:
# Standardise each product column (zero mean, unit variance) so that no single
# product dominates the similarity computation.
scaler = StandardScaler()
scaler.fit(customer_product)  # learn per-column mean and standard deviation
customer_product_scaled = scaler.transform(customer_product)
customer_product_scaled

array([[-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441,  6.11596653, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       ...,
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396]])

In [12]:
# Compute the pairwise cosine similarity between all rows (customers) of the
# scaled matrix. Entry [i, j] is the similarity of customer i to customer j,
# so the result is square with one row and one column per customer.
similarity_matrix = cosine_similarity(customer_product_scaled)
similarity_matrix

array([[ 1.        , -0.04882928, -0.06147586, ..., -0.03838458,
         0.34902818, -0.06724444],
       [-0.04882928,  1.        , -0.03569919, ..., -0.00165323,
        -0.03453342, -0.04030028],
       [-0.06147586, -0.03569919,  1.        , ..., -0.03180765,
        -0.04766667, -0.04790728],
       ...,
       [-0.03838458, -0.00165323, -0.03180765, ...,  1.        ,
        -0.00988126,  0.365046  ],
       [ 0.34902818, -0.03453342, -0.04766667, ..., -0.00988126,
         1.        , -0.05334491],
       [-0.06724444, -0.04030028, -0.04790728, ...,  0.365046  ,
        -0.05334491,  1.        ]])

In [13]:
# Set up the lookalike search: an empty mapping that will collect each
# customer's matches, and the customer IDs in the row order of customer_product.
lookalikes = dict()
customer_ids = customer_product.index.tolist()

In [14]:
# For each of the first 20 customers, record the 3 most similar other customers
# as (customer_id, similarity_score) tuples. Row i of similarity_matrix lines up
# with customer_ids[i] because both derive from customer_product's row order.
for i, customer_id in enumerate(customer_ids[:20]):
    ranked_indices = similarity_matrix[i].argsort()[::-1]  # most similar first
    # Remove the customer's own index explicitly rather than assuming it sorts
    # first: a tie at 1.0 (another customer with a matching purchase pattern) or
    # float rounding can rank someone else at position 0, which would otherwise
    # drop that true match and list the customer as their own lookalike.
    similar_indices = ranked_indices[ranked_indices != i][:3]  # Top 3 similar customers
    similar_scores = similarity_matrix[i][similar_indices]
    # Store scores as plain Python floats so the tuples stringify as
    # "('C0001', 0.42)" rather than "('C0001', np.float64(0.42))" under NumPy 2.
    similar_customers = [(customer_ids[idx], float(score)) for idx, score in zip(similar_indices, similar_scores)]
    lookalikes[customer_id] = similar_customers

In [16]:
# Tabulate the results: one row per customer, one column per lookalike rank,
# each cell holding a (customer_id, score) tuple.
lookalikes_df = pd.DataFrame.from_dict(
    lookalikes,
    orient='index',
    columns=['Lookalike_1', 'Lookalike_2', 'Lookalike_3'],
)
# Persist the table; the index (source customer) is written under 'CustomerID'.
lookalikes_df.to_csv(path_or_buf='Lookalike.csv', index_label='CustomerID')
lookalikes_df

Unnamed: 0,Lookalike_1,Lookalike_2,Lookalike_3
C0001,"(C0194, 0.40492753118932373)","(C0104, 0.3740015051203949)","(C0020, 0.36660865634533324)"
C0002,"(C0030, 0.40461685378594003)","(C0091, 0.3837780302090941)","(C0071, 0.3201579810580884)"
C0003,"(C0181, 0.4775717980039302)","(C0134, 0.47101615387801016)","(C0144, 0.42379990716450344)"
C0004,"(C0070, 0.3519014889798181)","(C0175, 0.31609789792660986)","(C0132, 0.27959855424498276)"
C0005,"(C0096, 0.48745613929263837)","(C0023, 0.47025182492905426)","(C0055, 0.3820996241448576)"
C0006,"(C0040, 0.48690905811550494)","(C0178, 0.3978109049066894)","(C0058, 0.3141630233621915)"
C0007,"(C0079, 0.6174415040345886)","(C0118, 0.47827697423174215)","(C0020, 0.456615034666693)"
C0008,"(C0144, 0.326750548833923)","(C0169, 0.2788847602830608)","(C0091, 0.260560334551999)"
C0009,"(C0140, 0.5334414848517175)","(C0083, 0.5308424299367163)","(C0162, 0.49889269779678075)"
C0010,"(C0094, 0.5150640624213312)","(C0092, 0.40551651716491116)","(C0143, 0.36601219885018854)"


In [18]:
# NOTE(review): google.colab is presumably only importable inside a Google
# Colab runtime, so this cell is expected to fail elsewhere - confirm before
# running the notebook locally.
from google.colab import files

# Download the Lookalike.csv file written by the previous cell
files.download("Lookalike.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>