# Task 2: Lookalike Model

### Step 1: Load Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

### Step 2: Load and Merge the Datasets

In [2]:
# Load the three source tables
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach customer and product attributes to every transaction.
# validate='many_to_one' makes pandas raise a MergeError if a CustomerID or
# ProductID appears more than once in its lookup table; without the check such
# duplicates would silently multiply transaction rows and inflate every
# per-customer sum/count derived from merged_data.
# Both `transactions` and `products` carry a `Price` column, so the second
# merge yields `Price_x` (from transactions) and `Price_y` (from products).
merged_data = (
    transactions
    .merge(customers, on='CustomerID', validate='many_to_one')
    .merge(products, on='ProductID', validate='many_to_one')
)

# Preview the merged dataset
print(merged_data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

### Step 3: Prepare Data for the Lookalike Model
 

In [3]:
# Summarise each customer's purchase history into one row of features
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpent=('TotalValue', 'sum'),               # total spending
        AvgSpent=('TotalValue', 'mean'),                # average spend per transaction
        TransactionCount=('TransactionID', 'count'),    # number of transactions
        UniqueCategories=('Category', 'nunique'),       # distinct product categories bought
    )
    .reset_index()
)

# Standardise the numeric features so no single one dominates the similarity
# computation purely because of its scale
feature_cols = ['TotalSpent', 'AvgSpent', 'TransactionCount', 'UniqueCategories']
scaler = StandardScaler()
customer_features[feature_cols] = scaler.fit_transform(customer_features[feature_cols])

print(customer_features.head())


  CustomerID  TotalSpent  AvgSpent  TransactionCount  UniqueCategories
0      C0001   -0.061701 -0.070263         -0.011458          0.160540
1      C0002   -0.877744 -0.934933         -0.467494         -0.904377
2      C0003   -0.405857 -0.026271         -0.467494          0.160540
3      C0004    1.032547 -0.076769          1.356650          0.160540
4      C0005   -0.783929 -0.040028         -0.923530         -0.904377


### Step 4: Calculate Customer Similarity

In [4]:
# Cosine similarity between every pair of customers' feature vectors
customer_ids = customer_features['CustomerID']
feature_matrix = customer_features.drop(columns='CustomerID')  # numeric features only
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with customer IDs so rows/columns can be looked up by ID
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.065564  0.486931 -0.150708 -0.296364 -0.331187   
C0002      -0.065564  1.000000  0.433324 -0.575035  0.797674 -0.633090   
C0003       0.486931  0.433324  1.000000 -0.934622  0.625790  0.032135   
C0004      -0.150708 -0.575035 -0.934622  1.000000 -0.850404 -0.103356   
C0005      -0.296364  0.797674  0.625790 -0.850404  1.000000 -0.039935   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.536088  0.417965  0.143586 -0.013155  ... -0.548415  0.596616   
C0002       0.390222 -0.329247  0.974612  0.997953  ...  0.843138  0.745241   
C0003       0.465154 -0.558242  0.558997  0.432632  ... -0.040024  0.774123   
C0004      -0.731098  0.784207 -0.631651 -0.554825  ... -0.229380 -0.677074   
C0005  

### Step 5: Generate Recommendations

In [5]:
# Function to get the top-N most similar customers (default: top 3)
def get_top_similar(customers_df, customer_id, top_n=3):
    """Return the `top_n` customers most similar to `customer_id`.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Similarity matrix. The row for `customer_id` is read with `.loc`;
        its column labels identify the candidate customers.
    customer_id : hashable
        Row label of the customer to find lookalikes for.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of tuple
        `(candidate_id, similarity_score)` pairs ordered from most to least
        similar. Never contains `customer_id` itself.

    Raises
    ------
    KeyError
        If `customer_id` is not a row label of `customers_df`.
    """
    scores = customers_df.loc[customer_id]

    # Exclude the customer by label instead of assuming their self-similarity
    # sorts first: another customer can tie at (or, through floating-point
    # rounding, exceed) the self score, in which case positional skipping would
    # drop a real lookalike and return the customer as their own match.
    # errors='ignore' keeps this working when the matrix has no column for
    # `customer_id`.
    candidates = scores.drop(labels=customer_id, errors='ignore')

    # mergesort is stable, so tied scores keep their original column order and
    # the result is deterministic across runs.
    ranked = candidates.sort_values(ascending=False, kind='mergesort')

    top = ranked.head(max(top_n, 0))
    return list(zip(top.index, top.values))

# Look up the top lookalikes for each of the first 20 customers in customer_features
lookalike_map = {
    customer_id: get_top_similar(similarity_df, customer_id)
    for customer_id in customer_features['CustomerID'].head(20)
}

# Flatten the mapping into one row per (customer, lookalike) pair for Lookalike.csv
lookalike_data = [
    [cust_id, sim_cust_id, score]
    for cust_id, similar in lookalike_map.items()
    for sim_cust_id, score in similar
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
)
print(lookalike_df.head())

  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0086         0.996560
1      C0001             C0189         0.994776
2      C0001             C0055         0.993965
3      C0002             C0199         0.998247
4      C0002             C0010         0.997953


### Step 6: Save Lookalike Data


In [7]:
# Write the lookalike table to CSV; index=False leaves the DataFrame's row
# index out of the file so it contains only the three named columns.
lookalike_df.to_csv('Bushra_Kanoje_Lookalike.csv', index=False)
