In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the three raw input tables from disk
transactions = pd.read_csv('C:/Users/CHIRAG JAGGA/Downloads/Transactions.csv')
products = pd.read_csv('C:/Users/CHIRAG JAGGA/Downloads/Products.csv')
customers = pd.read_csv('C:/Users/CHIRAG JAGGA/Downloads/Customers.csv')

# Parse the date columns so datetime arithmetic can be applied to them later
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Left-join product attributes, then customer attributes, onto every transaction.
# Left joins keep each transaction row even when no product/customer matches.
transactions_with_products = pd.merge(transactions, products, how='left', on='ProductID')
merged_data = pd.merge(transactions_with_products, customers, how='left', on='CustomerID')

# Encode the categorical 'Region' as integer codes and express 'SignupDate'
# as the number of days since the earliest signup present in the merged data.
merged_data['Region_Encoded'] = LabelEncoder().fit_transform(merged_data['Region'])
merged_data['SignupDays'] = (merged_data['SignupDate'] - merged_data['SignupDate'].min()).dt.days

# Aggregate data for each customer (one output row per CustomerID)
customer_features = merged_data.groupby('CustomerID').agg({
    # Combine all purchased product names into one text document. Missing names
    # (a ProductID with no match in the left merge yields NaN) are skipped so
    # that str.join does not raise a TypeError.
    'ProductName': lambda x: ' '.join(x.dropna().astype(str)),
    'TotalValue': 'sum',                  # Total spending
    'Quantity': 'sum',                    # Total quantity purchased
    'Region_Encoded': 'first',            # Region (integer code)
    'SignupDays': 'first'                 # Days since earliest signup
}).reset_index()

# Text-based feature (ProductName) -> TF-IDF Vectorization
tfidf = TfidfVectorizer()
product_tfidf = tfidf.fit_transform(customer_features['ProductName'])

# Numerical part of the feature vector: spending, quantity and signup recency.
numeric_part = customer_features[['TotalValue', 'Quantity', 'SignupDays']].to_numpy(dtype=float)

# Region is a nominal category: the integer codes carry no order or distance,
# so they are expanded to one-hot indicator columns instead of being fed to
# cosine similarity as a single ordinal number.
region_part = pd.get_dummies(
    customer_features['Region_Encoded'], prefix='Region', dtype=float
).to_numpy()

# Standardize every numerical column to zero mean / unit variance.
numerical_features = np.hstack((numeric_part, region_part))
feature_mean = numerical_features.mean(axis=0)
feature_std = numerical_features.std(axis=0)
# A constant column has std 0; dividing by 1 instead maps it to all zeros
# rather than producing NaN (which would break the similarity computation).
feature_std[feature_std == 0] = 1.0
numerical_features = (numerical_features - feature_mean) / feature_std

# Combine features: TF-IDF + standardized numerical features
combined_features = np.hstack((product_tfidf.toarray(), numerical_features))

# Pairwise cosine similarity between all customer feature vectors
similarity_matrix = cosine_similarity(combined_features)

# Build recommendations for the first 20 customers in customer_features
# (expected to be C0001 to C0020)
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}

for i, cust_id in enumerate(customer_ids[:20]):
    similarities = similarity_matrix[i]

    # Pair every other customer with its similarity score, skipping the customer itself
    scored_candidates = []
    for j, score in enumerate(similarities):
        if j == i:
            continue
        scored_candidates.append((customer_ids[j], score))

    # Highest similarity first; keep the three best matches
    scored_candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results[cust_id] = scored_candidates[:3]

# Create Lookalike.csv: one row per target customer. The "Lookalikes" cell holds
# the string form of that customer's list of (customer_id, score) tuples.
# The scores are converted to built-in floats first: under NumPy >= 2 the str()
# of a list containing NumPy scalars would otherwise embed "np.float64(...)"
# wrappers in the CSV text.
lookalike_csv = pd.DataFrame({
    "CustomerID": list(lookalike_results.keys()),
    "Lookalikes": [
        str([(other_id, float(score)) for other_id, score in matches])
        for matches in lookalike_results.values()
    ]
})

lookalike_csv.to_csv('C:/Users/CHIRAG JAGGA/Downloads/Lookalike.csv', index=False)

print("Lookalike Model results saved to Lookalike.csv!")


Lookalike Model results saved to Lookalike.csv!


In [3]:
# Second approach: per-customer spending aggregates plus one-hot encoded Region, scaled with StandardScaler
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the three raw input tables from disk
transactions = pd.read_csv('C:/Users/CHIRAG JAGGA/Downloads/Transactions.csv')
products = pd.read_csv('C:/Users/CHIRAG JAGGA/Downloads/Products.csv')
customers = pd.read_csv('C:/Users/CHIRAG JAGGA/Downloads/Customers.csv')

# Attach product details, then customer details, to each transaction.
# Left joins keep every transaction row even when no match is found.
transactions_products = transactions.merge(products, how="left", on="ProductID")
customer_data = transactions_products.merge(customers, how="left", on="CustomerID")

# Per-customer behavioural summary: total spend, number of transactions,
# average transaction value and number of distinct product categories bought.
customer_features = (
    customer_data
    .groupby("CustomerID")
    .agg(
        total_spent=("TotalValue", "sum"),
        num_transactions=("TransactionID", "count"),
        avg_transaction_value=("TotalValue", "mean"),
        unique_categories=("Category", "nunique"),
    )
    .reset_index()
)

# One-hot encode Region, indexed by CustomerID so it can be joined on the index
region_by_customer = customers.set_index("CustomerID")["Region"]
region_encoded = pd.get_dummies(region_by_customer, prefix="Region")
customer_features = (
    customer_features
    .set_index("CustomerID")
    .join(region_encoded, how="left")
    .reset_index()
)

# Standardize every feature column (everything except the CustomerID key)
scaler = StandardScaler()
feature_frame = customer_features.drop(columns=["CustomerID"])
normalized_features = scaler.fit_transform(feature_frame)

# Pairwise cosine similarity between all standardized customer feature vectors
similarity_matrix = cosine_similarity(normalized_features)

# Generate lookalike recommendations for the first 20 customers in customer_features
lookalikes = {}
all_customer_ids = customer_features["CustomerID"]
for i, customer_id in enumerate(all_customer_ids[:20]):
    # Similarity of this customer to every customer (including itself)
    similarity_scores = similarity_matrix[i]
    # Rank all customers from most to least similar. A stable sort keeps the
    # order of tied scores reproducible.
    ranked = np.argsort(-similarity_scores, kind="stable")
    # Drop the customer itself by index rather than assuming it sorts first:
    # another customer can tie with (or, through floating-point rounding, edge
    # past) the self-similarity, in which case slicing off position 0 would
    # discard a real match and report the customer as its own lookalike.
    similar_customers = ranked[ranked != i][:3]
    lookalikes[customer_id] = [
        {
            "customer_id": all_customer_ids.iloc[j],
            # Built-in float so str() of this dict does not contain
            # "np.float64(...)" under NumPy >= 2.
            "score": float(round(similarity_scores[j], 4)),
        }
        for j in similar_customers
    ]

# Create Lookalike.csv: one record per target customer, with the list of
# recommendations stored as its string representation.
lookalike_rows = []
for customer_id, recommendations in lookalikes.items():
    lookalike_rows.append({
        "CustomerID": customer_id,
        "Lookalikes": str(recommendations),
    })
lookalike_df = pd.DataFrame(lookalike_rows)

# Written to the current working directory, without the index column
lookalike_df.to_csv("Lookalike.csv", index=False)

# Confirm the save and show the first few recommendations
print("Lookalike recommendations saved to Lookalike.csv.")
lookalike_df.head()


Lookalike recommendations saved to Lookalike.csv.


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[{'customer_id': 'C0152', 'score': 0.9999}, {'..."
1,C0002,"[{'customer_id': 'C0142', 'score': 0.9875}, {'..."
2,C0003,"[{'customer_id': 'C0129', 'score': 0.9563}, {'..."
3,C0004,"[{'customer_id': 'C0108', 'score': 0.9828}, {'..."
4,C0005,"[{'customer_id': 'C0159', 'score': 0.9998}, {'..."
