**Import libraries**

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


**Load the datasets**

In [2]:
from google.colab import files
uploaded = files.upload()
customers = pd.read_csv('Customers.csv')

uploaded = files.upload()
products = pd.read_csv('Products.csv')

uploaded = files.upload()
transactions = pd.read_csv('Transactions.csv')

Saving Customers.csv to Customers.csv


Saving Products.csv to Products.csv


Saving Transactions.csv to Transactions.csv


**Merge datasets**

In [5]:
transactions_merged = transactions.merge(products, on="ProductID", how="left")
full_data = transactions_merged.merge(customers, on="CustomerID", how="left")

**Convert TransactionDate to datetime**

In [6]:
full_data["TransactionDate"] = pd.to_datetime(full_data["TransactionDate"])

**Feature Engineering: Aggregate customer features**

In [7]:
customer_features = full_data.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_transactions=("TransactionID", "count"),
    avg_transaction_value=("TotalValue", "mean"),
    unique_products=("ProductID", "nunique"),
    unique_categories=("Category", "nunique")
).reset_index()


 **Encode Region as numerical features**

In [9]:
customer_features = customer_features.merge(customers[["CustomerID", "Region"]], on="CustomerID", how="left")
customer_features = pd.get_dummies(customer_features, columns=["Region"], drop_first=True)

**Select first 20 customers**

In [10]:
first_20_customers = customer_features[customer_features["CustomerID"].isin([f"C{str(i).zfill(4)}" for i in range(1, 21)])]

**Compute similarity using cosine similarity**

In [11]:
customer_features_numeric = customer_features.set_index("CustomerID")
similarity_matrix = cosine_similarity(customer_features_numeric, customer_features_numeric)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features_numeric.index, columns=customer_features_numeric.index)

**Find top 3 similar customers for each of the first 20 customers**

In [12]:
lookalike_results = {}

for cust_id in first_20_customers["CustomerID"]:
    similar_customers = similarity_df.loc[cust_id].drop(cust_id).nlargest(3)
    lookalike_results[cust_id] = list(zip(similar_customers.index, similar_customers.values))

**Convert results to DataFrame**

In [18]:
lookalike_df = pd.DataFrame([(k, v) for k, v in lookalike_results.items()], columns=["CustomerID", "Lookalike"])

In [19]:
# Save to CSV with the requested filename
lookalike_csv_path = "Goddati_Bhavyasri_Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)

# Print sample results
print(lookalike_df.head())

  CustomerID                                          Lookalike
0      C0001  [(C0152, 0.9999999997803745), (C0107, 0.999999...
1      C0002  [(C0199, 0.9999998629514659), (C0142, 0.999999...
2      C0003  [(C0178, 0.9999999365474926), (C0146, 0.999999...
3      C0004  [(C0021, 0.9999999669549791), (C0173, 0.999999...
4      C0005  [(C0159, 0.999999999188442), (C0186, 0.9999999...


**Download Csv file**

In [20]:
from google.colab import files
files.download('Goddati_Bhavyasri_Lookalike.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>