In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the three raw tables from the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach product and customer attributes to every transaction row
# (pandas' default inner join on the shared key column).
transaction_data = pd.merge(transactions, products, on="ProductID")
transaction_data = pd.merge(transaction_data, customers, on="CustomerID")


def _first_mode(values):
    """Return the first entry of the Series' mode (its most frequent value)."""
    return values.mode()[0]


# One row per customer: spend totals, purchase count, most frequent product
# category, and the first non-null region recorded for that customer.
customer_profile = (
    transaction_data
    .groupby("CustomerID")
    .agg(
        total_spent=("TotalValue", "sum"),
        avg_transaction_value=("TotalValue", "mean"),
        transaction_count=("TransactionID", "count"),
        favorite_category=("Category", _first_mode),
        region=("Region", "first"),
    )
    .reset_index()
)

# One-hot encode the two categorical profile columns. The same encoder object
# is refit for each column, so its feature names are read immediately after
# the corresponding fit, before the next fit overwrites them.
encoder = OneHotEncoder()

region_onehot = encoder.fit_transform(customer_profile[["region"]])
region_columns = encoder.get_feature_names_out(["region"])
encoded_region = pd.DataFrame(region_onehot.toarray(), columns=region_columns)

category_onehot = encoder.fit_transform(customer_profile[["favorite_category"]])
category_columns = encoder.get_feature_names_out(["favorite_category"])
encoded_category = pd.DataFrame(category_onehot.toarray(), columns=category_columns)

# Standardize the numeric spend/activity columns.
numeric_columns = ["total_spent", "avg_transaction_value", "transaction_count"]
numerical_features = customer_profile[numeric_columns]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)
scaled_frame = pd.DataFrame(scaled_features, columns=numerical_features.columns)

# Column-wise concatenation: scaled numerics, then region and category
# indicator columns, giving one feature row per customer_profile row.
customer_features = pd.concat([scaled_frame, encoded_region, encoded_category], axis=1)

# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(customer_features)

# How many customers (taken in customer_profile row order) receive a lookalike
# list, and how many lookalikes are kept for each of them.
NUM_TARGET_CUSTOMERS = 20
NUM_LOOKALIKES = 3

# Maps a customer ID to its list of (lookalike_customer_id, similarity_score)
# tuples, ordered from most to least similar.
top_lookalikes = {}
customer_ids = customer_profile["CustomerID"].tolist()

for i, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Pair every *other* customer with its similarity to customer i.
    # Scores are cast to plain Python floats: leaving them as NumPy scalars
    # makes the str() call below write "np.float64(0.96...)" into the CSV,
    # which depends on the NumPy version and is not a parseable literal.
    candidates = [
        (customer_ids[j], float(score))
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    # Stable descending sort, so tied scores keep the original customer order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_lookalikes[customer_id] = candidates[:NUM_LOOKALIKES]


# Each lookalike list is written as its string form in a single CSV cell.
lookalike_df = pd.DataFrame({
    "CustomerID": list(top_lookalikes),
    "Lookalikes": [str(matches) for matches in top_lookalikes.values()],
})

lookalike_df.to_csv("malla_jayavenkatakiran_Lookalike.csv", index=False)

print(lookalike_df.head())


  CustomerID                                         Lookalikes
0      C0001  [('C0190', np.float64(0.968215451295126)), ('C...
1      C0002  [('C0088', np.float64(0.9663574397998078)), ('...
2      C0003  [('C0052', np.float64(0.9847977904024423)), ('...
3      C0004  [('C0165', np.float64(0.9711437718179058)), ('...
4      C0005  [('C0186', np.float64(0.9787905419345101)), ('...
