# Task 2: Lookalike Model
The goal of Task 2 is to build a Lookalike Model that recommends similar customers based on their profiles and transaction history. We will use a cosine similarity measure to calculate the similarity between customers based on their features.

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [5]:
transactions = pd.read_csv(r"C:\Users\DaLav\Downloads\Transactions.csv")
products = pd.read_csv(r"C:\Users\DaLav\Downloads\Products.csv")
customers = pd.read_csv(r"C:\Users\DaLav\Downloads\Customers.csv")

## Prepare the Data

In [7]:
# Merge customer data with their transactions to create a rich feature set
# Merge customers with transactions
customer_transactions = pd.merge(transactions, customers, on="CustomerID")
print(customer_transactions)

    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0          T00001      C0199      P067  2024-08-25 12:38:23         1   
1          T00112      C0146      P067  2024-05-27 22:23:54         1   
2          T00166      C0127      P067  2024-04-25 07:38:55         1   
3          T00272      C0087      P067  2024-03-26 22:55:37         2   
4          T00363      C0070      P067  2024-03-21 15:10:10         3   
..            ...        ...       ...                  ...       ...   
995        T00496      C0118      P037  2024-10-24 08:30:27         1   
996        T00759      C0059      P037  2024-06-04 02:15:24         3   
997        T00922      C0018      P037  2024-04-05 13:05:32         4   
998        T00959      C0115      P037  2024-09-29 10:16:02         2   
999        T00992      C0024      P037  2024-04-21 10:52:24         1   

     TotalValue   Price          CustomerName         Region  SignupDate  
0        300.68  300.68        Andrea Jenkins   

In [8]:
# Aggregate data per customer: Total spending, total quantity, and number of unique products bought
customer_features = customer_transactions.groupby('CustomerID').agg(
    total_spending=('TotalValue', 'sum'),
    total_quantity=('Quantity', 'sum'),
    num_unique_products=('ProductID', 'nunique')
).reset_index()
print(customer_features)

    CustomerID  total_spending  total_quantity  num_unique_products
0        C0001         3354.52              12                    5
1        C0002         1862.74              10                    4
2        C0003         2725.38              14                    4
3        C0004         5354.88              23                    8
4        C0005         2034.24               7                    3
..         ...             ...             ...                  ...
194      C0196         4982.88              12                    3
195      C0197         1928.65               9                    3
196      C0198          931.83               3                    2
197      C0199         1979.28               9                    4
198      C0200         4758.60              16                    5

[199 rows x 4 columns]


In [11]:
# Merge product information for additional features (if necessary)
customer_features = pd.merge(customer_features, customers[['CustomerID', 'Region']], on='CustomerID', how='left')
print(customer_features)

    CustomerID  total_spending  total_quantity  num_unique_products  \
0        C0001         3354.52              12                    5   
1        C0002         1862.74              10                    4   
2        C0003         2725.38              14                    4   
3        C0004         5354.88              23                    8   
4        C0005         2034.24               7                    3   
..         ...             ...             ...                  ...   
194      C0196         4982.88              12                    3   
195      C0197         1928.65               9                    3   
196      C0198          931.83               3                    2   
197      C0199         1979.28               9                    4   
198      C0200         4758.60              16                    5   

          Region_x       Region_y  
0    South America  South America  
1             Asia           Asia  
2    South America  South America  
3  

In [12]:
# For simplicity, let's use total_spending, total_quantity, and num_unique_products as features
features = customer_features[['total_spending', 'total_quantity', 'num_unique_products']]
print(features)

     total_spending  total_quantity  num_unique_products
0           3354.52              12                    5
1           1862.74              10                    4
2           2725.38              14                    4
3           5354.88              23                    8
4           2034.24               7                    3
..              ...             ...                  ...
194         4982.88              12                    3
195         1928.65               9                    3
196          931.83               3                    2
197         1979.28               9                    4
198         4758.60              16                    5

[199 rows x 3 columns]


# Calculate Similarity
We will now calculate the cosine similarity between customers based on their transaction features (total_spending, total_quantity, and num_unique_products).

In [13]:
# Normalize features before calculating cosine similarity
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Calculate cosine similarity between customers
similarity_matrix = cosine_similarity(normalized_features)

# The similarity_matrix is symmetric, so the diagonal will always be 1 (self-similarity).
print("Cosine Similarity Matrix:\n", similarity_matrix)


Cosine Similarity Matrix:
 [[ 1.          0.5607095  -0.23287112 ...  0.57520619  0.64495019
  -0.82106564]
 [ 0.5607095   1.          0.66710809 ...  0.93024044  0.98742961
  -0.92593466]
 [-0.23287112  0.66710809  1.         ...  0.52506851  0.56284471
  -0.36357058]
 ...
 [ 0.57520619  0.93024044  0.52506851 ...  1.          0.96659366
  -0.84933987]
 [ 0.64495019  0.98742961  0.56284471 ...  0.96659366  1.
  -0.94332605]
 [-0.82106564 -0.92593466 -0.36357058 ... -0.84933987 -0.94332605
   1.        ]]


# Generate Recommendations (Top 3 Lookalikes)
We will now recommend the top 3 most similar customers for each of the first 20 customers based on their similarity scores.

In [14]:
# Function to get top N most similar customers
def get_top_n_similar_customers(similarity_matrix, customer_id, n=3):
    # Get the index of the customer
    customer_index = customer_id - 1  # Assuming CustomerID starts from 1 and is sequential
    
    # Get the similarity scores for the customer
    similarity_scores = similarity_matrix[customer_index]
    
    # Get indices of the top N most similar customers (excluding the customer itself)
    similar_customer_indices = similarity_scores.argsort()[-(n+1):-1]
    
    # Get the CustomerIDs and similarity scores of the top N similar customers
    similar_customers = customer_features.iloc[similar_customer_indices]
    similarity_scores = similarity_scores[similar_customer_indices]
    
    # Return as a list of tuples (CustomerID, SimilarityScore)
    return list(zip(similar_customers['CustomerID'], similarity_scores))

# Get top 3 lookalike customers for the first 20 customers
lookalikes = {}

for customer_id in range(1, 21):  # CustomerID: C0001 to C0020
    lookalikes[customer_id] = get_top_n_similar_customers(similarity_matrix, customer_id, n=3)

# Display lookalikes for the first 5 customers as an example
for customer_id, recommendations in list(lookalikes.items())[:5]:
    print(f"Customer {customer_id}:")
    for recommended_customer, score in recommendations:
        print(f"\tRecommended Customer {recommended_customer} with similarity score {score:.2f}")


Customer 1:
	Recommended Customer C0069 with similarity score 0.96
	Recommended Customer C0137 with similarity score 0.96
	Recommended Customer C0164 with similarity score 0.97
Customer 2:
	Recommended Customer C0094 with similarity score 0.99
	Recommended Customer C0031 with similarity score 1.00
	Recommended Customer C0029 with similarity score 1.00
Customer 3:
	Recommended Customer C0010 with similarity score 0.83
	Recommended Customer C0027 with similarity score 0.86
	Recommended Customer C0176 with similarity score 0.89
Customer 4:
	Recommended Customer C0195 with similarity score 0.99
	Recommended Customer C0175 with similarity score 0.99
	Recommended Customer C0075 with similarity score 1.00
Customer 5:
	Recommended Customer C0015 with similarity score 1.00
	Recommended Customer C0123 with similarity score 1.00
	Recommended Customer C0058 with similarity score 1.00


# Save the Recommendations to a CSV File
Finally, we will save the lookalike recommendations for the first 20 customers in a Lookalike.csv file.

In [15]:
# Convert lookalike recommendations to a DataFrame
lookalike_list = []
for customer_id, recommendations in lookalikes.items():
    for recommended_customer, score in recommendations:
        lookalike_list.append([customer_id, recommended_customer, score])

lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike recommendations saved to 'Lookalike.csv'")


Lookalike recommendations saved to 'Lookalike.csv'
