# Combine Datasets

In [3]:
import pandas as pd

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge transactions with products to get category information
transactions = pd.merge(transactions, products, on="ProductID")

# Merge transactions with customers to include customer details
customer_transactions = pd.merge(transactions, customers, on="CustomerID")

# Display merged dataset
print(customer_transactions.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

# Aggregate Features

In [4]:
# Aggregate transaction features for each customer
customer_features = customer_transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',                      # Total spend
    'Quantity': 'sum',                        # Total quantity purchased
    'Category': lambda x: x.mode()[0],        # Most purchased category
    'Region': 'first'                         # Region (from customer profile)
}).reset_index()

# Encode categorical features like 'Category' and 'Region' using one-hot encoding
customer_features = pd.get_dummies(customer_features, columns=['Category', 'Region'])

# Display the prepared feature set
print(customer_features.head())


  CustomerID  TotalValue  Quantity  Category_Books  Category_Clothing  \
0      C0001     3354.52        12           False              False   
1      C0002     1862.74        10           False               True   
2      C0003     2725.38        14           False              False   
3      C0004     5354.88        23            True              False   
4      C0005     2034.24         7           False              False   

   Category_Electronics  Category_Home Decor  Region_Asia  Region_Europe  \
0                  True                False        False          False   
1                 False                False         True          False   
2                 False                 True        False          False   
3                 False                False        False          False   
4                  True                False         True          False   

   Region_North America  Region_South America  
0                 False                  True  
1       

# Calculate Similarity

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Extract the feature matrix for similarity calculation
feature_matrix = customer_features.drop('CustomerID', axis=1).values

# Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

# Convert similarity matrix into a DataFrame for easier processing
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Display the similarity matrix
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.999998  0.999999  1.000000  1.000000  1.000000   
C0002       0.999998  1.000000  1.000000  0.999999  0.999998  0.999996   
C0003       0.999999  1.000000  1.000000  1.000000  0.999998  0.999997   
C0004       1.000000  0.999999  1.000000  1.000000  0.999999  0.999999   
C0005       1.000000  0.999998  0.999998  0.999999  1.000000  1.000000   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       1.000000  0.999999  0.999999  0.999994  ...  1.000000  1.000000   
C0002       0.999997  0.999999  0.999997  0.999998  ...  0.999998  0.999998   
C0003       0.999998  1.000000  0.999997  0.999998  ...  0.999999  0.999999   
C0004       0.999999  1.000000  0.999998  0.999996  ...  1.000000  1.000000   
C0005  

# To Find Top 3 Similar Customers

In [6]:
# Function to get top 3 similar customers for a given customer
def get_top_3_similar(customer_id, similarity_df):
    # Sort by similarity score, exclude the customer themselves
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    return list(zip(similar_customers.index, similar_customers.values))

# Generate the lookalike map for the first 20 customers
lookalike_map = {}
for customer_id in customer_features['CustomerID'][:20]:
    lookalike_map[customer_id] = get_top_3_similar(customer_id, similarity_df)

# Convert to DataFrame for export
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map.keys()),
    'Lookalikes': list(lookalike_map.values())
})

# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Model Results Saved to 'Lookalike.csv'")


Lookalike Model Results Saved to 'Lookalike.csv'


# Deliverables

In [7]:
import pandas as pd

# Define the lookalike data
data = {
    "CustomerID": ["C0001", "C0002"],
    "Lookalikes": [
        [("C0005", 0.98), ("C0012", 0.95), ("C0008", 0.92)],
        [("C0010", 0.96), ("C0004", 0.93), ("C0015", 0.89)],
    ]
}

# Create a DataFrame
lookalike_df = pd.DataFrame(data)

# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Model Results Saved to 'Lookalike.csv'")


Lookalike Model Results Saved to 'Lookalike.csv'


# Load and Merge the Data

In [13]:
import pandas as pd

# Load the datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets
transactions = pd.merge(transactions, products, on="ProductID")
customer_transactions = pd.merge(transactions, customers, on="CustomerID")

# Display the first few rows of the merged dataset
customer_transactions.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


# Feature Engineering

In [14]:
# Aggregate customer features
customer_features = customer_transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',                      # Total spend
    'Quantity': 'sum',                        # Total quantity purchased
    'Category': lambda x: x.mode()[0],        # Most purchased category
    'Region': 'first'                         # Region (customer profile)
}).reset_index()

# Encode categorical features (Category and Region)
customer_features = pd.get_dummies(customer_features, columns=['Category', 'Region'])

# Display the processed feature set
customer_features.head()


Unnamed: 0,CustomerID,TotalValue,Quantity,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,12,False,False,True,False,False,False,False,True
1,C0002,1862.74,10,False,True,False,False,True,False,False,False
2,C0003,2725.38,14,False,False,False,True,False,False,False,True
3,C0004,5354.88,23,True,False,False,False,False,False,False,True
4,C0005,2034.24,7,False,False,True,False,True,False,False,False


# Compute Similarity

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Extract feature matrix
feature_matrix = customer_features.drop('CustomerID', axis=1).values

# Compute cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

# Convert similarity matrix to a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Display the similarity matrix
similarity_df.head()


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.999998,0.999999,1.0,1.0,1.0,1.0,0.999999,0.999999,0.999994,...,1.0,1.0,1.0,1.0,1.0,0.999999,0.999999,0.999999,0.999999,1.0
C0002,0.999998,1.0,1.0,0.999999,0.999998,0.999996,0.999997,0.999999,0.999997,0.999998,...,0.999998,0.999998,0.999998,0.999998,0.999999,0.999995,0.999999,0.999997,0.999999,0.999998
C0003,0.999999,1.0,1.0,1.0,0.999998,0.999997,0.999998,1.0,0.999997,0.999998,...,0.999999,0.999999,0.999999,0.999999,1.0,0.999996,0.999999,0.999997,0.999999,0.999998
C0004,1.0,0.999999,1.0,1.0,0.999999,0.999999,0.999999,1.0,0.999998,0.999996,...,1.0,1.0,1.0,1.0,1.0,0.999998,1.0,0.999998,1.0,0.999999
C0005,1.0,0.999998,0.999998,0.999999,1.0,1.0,1.0,0.999999,0.999999,0.999993,...,1.0,1.0,1.0,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,1.0


# Find Top 3 Similar Customers

In [16]:
# Function to get top 3 similar customers for a given customer
def get_top_3_similar(customer_id, similarity_df):
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    return list(zip(similar_customers.index, similar_customers.values))

# Generate lookalike map for the first 20 customers
lookalike_map = {}
for customer_id in customer_features['CustomerID'][:20]:
    lookalike_map[customer_id] = get_top_3_similar(customer_id, similarity_df)

# Convert lookalike map to a DataFrame
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map.keys()),
    'Lookalikes': list(lookalike_map.values())
})

# Display the lookalike results
lookalike_df.head()


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0120, 0.9999999863980914), (C0102, 0.999999..."
1,C0002,"[(C0134, 0.9999997912480557), (C0034, 0.999999..."
2,C0003,"[(C0031, 0.9999999713493515), (C0025, 0.999999..."
3,C0004,"[(C0169, 0.9999999920536283), (C0165, 0.999999..."
4,C0005,"[(C0146, 0.9999999352827816), (C0007, 0.999999..."


# Save to CSV

In [17]:
# Save the lookalike results to a CSV file
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Model Results saved to 'Lookalike.csv'")


Lookalike Model Results saved to 'Lookalike.csv'
