**Step 1: Import Libraries and Load Data**


We load the datasets and check their structure to understand the features we will use for the lookalike model.

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Locations of the three input CSV files
customers_path = "Customers.csv"
products_path = "Products.csv"
transactions_path = "Transactions.csv"

# Read each file into its own DataFrame (customers, then products, then transactions)
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (customers_path, products_path, transactions_path)
)

# Preview the leading rows of every table, one table after another
print(customers.head(), products.head(), transactions.head(), sep="\n")


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

**Step 2: Preprocess and Merge Data**

We merge the datasets to combine customer profiles with transaction history and aggregate features like total spending, quantity purchased, and product preferences.


In [2]:
# Parse the date columns so they hold datetime values instead of raw strings
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Attach customer attributes and product details to every transaction row
customer_transactions = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# Collapse the transaction rows into a single profile row per customer
customer_profiles = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        Region=('Region', 'first'),        # categorical feature; first value seen for the customer
        TotalValue=('TotalValue', 'sum'),  # total spending
        Quantity=('Quantity', 'sum'),      # total items purchased
        ProductName=('ProductName', lambda names: ' '.join(names)),  # purchased product names as one string
    )
    .reset_index()
)

# Preview the aggregated profiles
print(customer_profiles.head())


  CustomerID         Region  TotalValue  Quantity  \
0      C0001  South America     3354.52        12   
1      C0002           Asia     1862.74        10   
2      C0003  South America     2725.38        14   
3      C0004  South America     5354.88        23   
4      C0005           Asia     2034.24         7   

                                         ProductName  
0  HomeSense Wall Art TechPro Headphones ActiveWe...  
1  BookWorld Cookware Set BookWorld Rug ComfortLi...  
2  ActiveWear T-Shirt ActiveWear Rug ActiveWear C...  
3  BookWorld Bluetooth Speaker TechPro Rug TechPr...  
4  TechPro Smartwatch ActiveWear Cookware Set Com...  


**Step 3: Encode Categorical and Textual Features**

We encode the categorical Region column and use TF-IDF to convert the textual ProductName feature into numerical vectors. This creates a comprehensive feature set for similarity computation.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One-hot encode 'Region' into one indicator column per region value
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], prefix='Region')

# Turn each customer's combined product-name string into a TF-IDF vector
tfidf = TfidfVectorizer()
product_vectors = tfidf.fit_transform(customer_profiles['ProductName'])

# Build the TF-IDF frame on the profiles' own index. pd.concat(axis=1) aligns rows by
# index label, so relying on an implicit 0..n-1 index would misalign the features (and
# introduce NaN-filled rows) whenever customer_profiles does not carry a default RangeIndex.
product_features = pd.DataFrame(
    product_vectors.toarray(),
    index=customer_profiles.index,
    columns=tfidf.get_feature_names_out(),
)

# Append the TF-IDF columns, then drop the raw text column they were derived from
customer_profiles = pd.concat([customer_profiles, product_features], axis=1).drop(columns=['ProductName'])

# Display processed customer profiles
print(customer_profiles.head())


  CustomerID  TotalValue  Quantity  Region_Asia  Region_Europe  \
0      C0001     3354.52        12        False          False   
1      C0002     1862.74        10         True          False   
2      C0003     2725.38        14        False          False   
3      C0004     5354.88        23        False          False   
4      C0005     2034.24         7         True          False   

   Region_North America  Region_South America  activewear       art  \
0                 False                  True    0.181413  0.370663   
1                 False                 False    0.000000  0.000000   
2                 False                  True    0.727207  0.000000   
3                 False                  True    0.253108  0.000000   
4                 False                 False    0.255203  0.000000   

   biography  ...  shoes  smartphone  smartwatch  soundwave   speaker  \
0        0.0  ...    0.0         0.0    0.267673   0.382563  0.000000   
1        0.0  ...    0.0      

**Step 4: Standardize Numerical Features**

We standardize numerical features like TotalValue and Quantity to ensure they are on the same scale, which is crucial for similarity computation.


In [4]:
# Import the necessary library for StandardScaler
from sklearn.preprocessing import StandardScaler

# Columns that get standardized
numerical_features = ['TotalValue', 'Quantity']

# Fit the scaler on those columns, then overwrite them with the scaled values
scaler = StandardScaler()
scaler.fit(customer_profiles[numerical_features])
customer_profiles[numerical_features] = scaler.transform(customer_profiles[numerical_features])

# Preview the standardized columns
print(customer_profiles[numerical_features].head())


   TotalValue  Quantity
0   -0.061701 -0.122033
1   -0.877744 -0.448000
2   -0.405857  0.203934
3    1.032547  1.670787
4   -0.783929 -0.936951


**Step 5: Compute Similarity Scores**

We compute the cosine similarity between all customer profiles, generating a similarity matrix where each entry represents the similarity score between two customers.

In [5]:
# Build the feature matrix from every column except the identifier.
# The one-hot region columns are boolean (see the displayed profiles) while the remaining
# columns are numeric; `.values` would interleave those into an object-dtype array, so
# request float64 explicitly to hand the similarity computation a true numeric matrix.
feature_matrix = customer_profiles.drop(columns=['CustomerID']).to_numpy(dtype=float)

# Pairwise cosine similarity between all customer profiles
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes of the similarity matrix with the customer IDs
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_profiles['CustomerID'],
    columns=customer_profiles['CustomerID'],
)

# Display similarity scores for the first 5 customers
print(similarity_df.iloc[:5, :5])




CustomerID     C0001     C0002     C0003     C0004     C0005
CustomerID                                                  
C0001       1.000000  0.065015  0.570447  0.315788  0.221276
C0002       0.065015  1.000000  0.273564 -0.276564  0.809574
C0003       0.570447  0.273564  1.000000  0.380172  0.262157
C0004       0.315788 -0.276564  0.380172  1.000000 -0.416910
C0005       0.221276  0.809574  0.262157 -0.416910  1.000000


**Step 6: Recommend Lookalike Customers**

This step identifies the top 3 similar customers for each of the first 20 customers and stores them with their similarity scores in a dictionary.

In [6]:
# Function to get the top-N lookalike customers (3 by default)
def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : label
        Column label in ``similarity_df`` identifying the customer of interest.
    similarity_df : pandas.DataFrame
        Similarity matrix whose columns (and index) are labelled by customer ID.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    tuple[list, list]
        ``(ids, scores)`` — the IDs of the most similar customers in descending
        score order and their matching similarity scores.
    """
    # Remove the customer's own entry by label instead of assuming it sorts first:
    # when another customer ties at the top score (or floating-point rounding puts the
    # self-similarity marginally below a neighbour's), skipping position 0 would return
    # the customer as their own lookalike and silently drop a genuine match.
    candidate_scores = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')
    # Stable sort keeps tied candidates in their original index order.
    top_matches = candidate_scores.sort_values(ascending=False, kind='mergesort').iloc[:top_n]
    return top_matches.index.tolist(), top_matches.values.tolist()

# Map each of the first 20 profile rows to its list of (lookalike_id, score) pairs.
# NOTE(review): the slice is positional — it corresponds to C0001–C0020 only if those
# customers occupy the first 20 rows of customer_profiles; confirm against the data.
lookalike_map = {
    customer_id: list(zip(*get_top_lookalikes(customer_id, similarity_df)))
    for customer_id in customer_profiles['CustomerID'].iloc[:20]
}

# Show the resulting map
print(lookalike_map)


{'C0001': [('C0129', 0.7576118846924658), ('C0191', 0.7334505744495917), ('C0190', 0.7145328106140103)], 'C0002': [('C0005', 0.8095743769357011), ('C0088', 0.7999247759859655), ('C0128', 0.7980000045533396)], 'C0003': [('C0181', 0.8051897119210434), ('C0133', 0.7602259179936883), ('C0076', 0.740114050600447)], 'C0004': [('C0087', 0.9015628272797028), ('C0165', 0.8975234616360919), ('C0102', 0.8848866680650116)], 'C0005': [('C0128', 0.9106119416124159), ('C0159', 0.8757594672235632), ('C0007', 0.8199665579164385)], 'C0006': [('C0187', 0.8816113941169941), ('C0171', 0.7627345113449657), ('C0191', 0.7396688197951593)], 'C0007': [('C0005', 0.8199665579164385), ('C0159', 0.812400698309829), ('C0146', 0.7668000095942176)], 'C0008': [('C0109', 0.84322921272305), ('C0068', 0.8317614369573453), ('C0122', 0.8015308221101916)], 'C0009': [('C0062', 0.8960695793373862), ('C0060', 0.8856282360396891), ('C0198', 0.85977143392587)], 'C0010': [('C0166', 0.7634679962440855), ('C0121', 0.7360153780537382

**Step 7: Save Lookalike Map to CSV**

The lookalike_map is converted into a DataFrame and saved as Aditya_Thakur_Lookalike.csv, meeting the task requirements.

In [7]:
# Flatten lookalike_map into one record per (customer, lookalike) pair
lookalike_list = [
    {'cust_id': cust_id, 'similar_cust_id': similar_cust, 'score': score}
    for cust_id, similar_list in lookalike_map.items()
    for similar_cust, score in similar_list
]

# Keep the output file name in one place so the confirmation message below always
# reports the file that was actually written (it previously named a different file).
output_path = 'Aditya_Thakur_Lookalike.csv'

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv(output_path, index=False)

print(f"Lookalike map saved to {output_path}")


Lookalike map saved to Lookalike.csv
