In [3]:
import pandas as pd

# Read the three source tables from CSV files in the working directory.
# The files are read in the listed order and bound to the matching names.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Build a long-format lookalike table: for each of the first
# N_TARGET_CUSTOMERS customers, the TOP_K most similar other customers by
# cosine similarity over standardized spending features.
N_TARGET_CUSTOMERS = 20  # how many leading customers get recommendations
TOP_K = 3  # lookalikes kept per customer

# Merge transaction data with customer details. merge() defaults to an inner
# join, so only transactions whose CustomerID appears in customers_df remain.
customer_transactions = transactions_df.merge(customers_df, on="CustomerID")

# Aggregate data at customer level: one row per customer.
customer_features = (
    customer_transactions
    .groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),  # Total spending
        Quantity=("Quantity", "sum"),  # Total items bought
        TransactionCount=("TransactionID", "count"),  # Number of transactions
    )
    .reset_index()
)

# Normalize the data for similarity calculations. Select the features by name
# so the identifier column can never slip into the scaler by position.
feature_columns = ["TotalValue", "Quantity", "TransactionCount"]
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features[feature_columns])

# Compute the pairwise similarity matrix (rows/columns follow customer_features order)
similarity_matrix = cosine_similarity(customer_features_scaled)

# Label the matrix with customer IDs on both axes
customer_ids = customer_features["CustomerID"].values
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Extract the TOP_K most similar customers for each target customer.
# The customer's own entry is removed by label before ranking: skipping the
# first sorted row is not safe, because another customer can tie with (or,
# through floating-point round-off, edge past) the self-similarity and the
# default sort does not guarantee which of the tied rows comes first.
top_lookalikes = {}
for customer in customer_ids[:N_TARGET_CUSTOMERS]:
    similar_customers = (
        similarity_df[customer]
        .drop(labels=customer)
        .sort_values(ascending=False)
        .head(TOP_K)
    )
    top_lookalikes[customer] = list(zip(similar_customers.index, similar_customers.values))

# Convert to a long-format DataFrame: one row per (customer, lookalike) pair
lookalike_df = pd.DataFrame(
    [
        (customer, similar_id, score)
        for customer, matches in top_lookalikes.items()
        for similar_id, score in matches
    ],
    columns=["CustomerID", "SimilarCustomerID", "SimilarityScore"],
)

# Save the Lookalike Model output
lookalike_file_path = "Lookalike.csv"
lookalike_df.to_csv(lookalike_file_path, index=False)


In [7]:
# Show the leading 20 rows of the lookalike recommendations; the bare
# expression on the last line is what the notebook renders as the cell output.
lookalike_preview = lookalike_df.iloc[:20]
lookalike_preview


Unnamed: 0,CustomerID,SimilarCustomerID,SimilarityScore
0,C0001,C0164,0.997598
1,C0001,C0103,0.995394
2,C0001,C0069,0.986073
3,C0002,C0029,0.999754
4,C0002,C0031,0.998986
5,C0002,C0077,0.994313
6,C0003,C0176,0.90295
7,C0003,C0027,0.875121
8,C0003,C0010,0.832965
9,C0004,C0075,0.997789


In [9]:
# Generate Lookalike recommendations for the first 20 customers listed in
# customers_df and export them in a wide layout (one row per customer, one
# column per lookalike, each cell a (customer_id, similarity_score) tuple).
#
# NOTE: this rebinds lookalike_df and writes to the same "Lookalike.csv" path
# as the long-format export in the earlier cell, so that file is overwritten.
lookalikes = {}
for customer_id in customers_df['CustomerID'][:20]:  # First 20 customers
    # Customers absent from similarity_df are skipped, so fewer than 20 rows
    # may be produced.
    if customer_id in similarity_df.index:
        # Get the top 3 similar customers. The customer's own entry is dropped
        # by label rather than by skipping the first sorted row: a tie at the
        # top score does not guarantee that the self entry sorts first.
        top_similar = (
            similarity_df[customer_id]
            .drop(labels=customer_id)
            .sort_values(ascending=False)
            .head(3)
        )
        # Create a list of tuples (customer_id, similarity_score)
        lookalikes[customer_id] = list(zip(top_similar.index, top_similar.values))

# Save the results in the specified format: dict keys become the row index
lookalike_df = pd.DataFrame.from_dict(lookalikes, orient='index')

# Save to a CSV file, where each row is a customer with their lookalike list and similarity scores.
# header=False means no header row is written to the file.
lookalike_df.to_csv("Lookalike.csv", header=False, index_label="CustomerID")

print("Lookalike recommendations with similarity scores have been saved to Lookalike.csv")

Lookalike recommendations with similarity scores have been saved to Lookalike.csv


In [10]:
# Offer Lookalike.csv as a browser download when running in Google Colab.
# The google.colab package is only importable inside Colab, so the import is
# guarded to keep the notebook runnable end-to-end in other environments.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; Lookalike.csv is left in the working directory.")
else:
    # Download the CSV file
    files.download('Lookalike.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>