In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")
print(customers.head())
print(products.head())
print(transactions.head())
df = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")
print(df.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3       

In [31]:
customer_features = df.groupby("CustomerID").agg({
    "TotalValue": "sum",  
    "TransactionID": "count",  
    "ProductID": "nunique",  
    "Category": lambda x: x.mode()[0],  
    "Region": "first" 
}).reset_index()

print(customer_features.head())

  CustomerID  TotalValue  TransactionID  ProductID     Category         Region
0      C0001     3354.52              5          5  Electronics  South America
1      C0002     1862.74              4          4     Clothing           Asia
2      C0003     2725.38              4          4   Home Decor  South America
3      C0004     5354.88              8          8        Books  South America
4      C0005     2034.24              3          3  Electronics           Asia


In [32]:
numerical_cols = customer_features.columns.difference(["CustomerID", "Region", "Category"])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[numerical_cols])
customer_features_scaled = pd.DataFrame(scaled_features, columns=numerical_cols)
customer_features_scaled["CustomerID"] = customer_features["CustomerID"]
customer_features_scaled["Category"] = customer_features["Category"]
customer_features_scaled["Region"] = customer_features["Region"]
print(customer_features_scaled.head())

   ProductID  TotalValue  TransactionID CustomerID     Category         Region
0   0.050047   -0.061701      -0.011458      C0001  Electronics  South America
1  -0.424204   -0.877744      -0.467494      C0002     Clothing           Asia
2  -0.424204   -0.405857      -0.467494      C0003   Home Decor  South America
3   1.472798    1.032547       1.356650      C0004        Books  South America
4  -0.898455   -0.783929      -0.923530      C0005  Electronics           Asia


In [33]:
import numpy as np
knn = NearestNeighbors(n_neighbors=4, metric='euclidean')
knn.fit(customer_features_scaled[numerical_cols])

def normalize_scores(distances):
    inverse_distances = 1 / distances
    normalized_scores = inverse_distances / np.sum(inverse_distances)  
    return normalized_scores

def get_knn_customers(customer_id):
    if customer_id not in customer_features_scaled["CustomerID"].values:
        print(f"CustomerID {customer_id} not found!")
        return []
    
    customer_idx = customer_features_scaled.index[customer_features_scaled["CustomerID"] == customer_id].tolist()[0]
    customer_vector = customer_features_scaled.loc[customer_idx, numerical_cols].values.reshape(1, -1)

    distances, indices = knn.kneighbors(customer_vector)
    significance_scores = normalize_scores(distances[0][1:])  

    similar_customers_info = customer_features.iloc[indices[0][1:], :]
    similar_customers_info["SignificanceScore"] = significance_scores
    
    return similar_customers_info


In [34]:
similar_customers_info = get_knn_customers("C0191")
print(similar_customers_info)

    CustomerID  TotalValue  TransactionID  ProductID     Category  \
71       C0072     2999.59              5          5  Electronics   
188      C0190     2983.02              5          5  Electronics   
105      C0106     2939.30              5          5     Clothing   

            Region  SignificanceScore  
71   North America           0.880302  
188  South America           0.095391  
105           Asia           0.024307  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers_info["SignificanceScore"] = significance_scores


In [35]:
import pandas as pd
lookalike_dict = {}
first_20_customers = [f"C{str(i).zfill(4)}" for i in range(1, 21)]
for customer_id in first_20_customers:
    similar_customers_info = get_knn_customers(customer_id)
    lookalike_list = list(zip(similar_customers_info["CustomerID"], similar_customers_info["SignificanceScore"]))
    lookalike_dict[customer_id] = lookalike_list
lookalike_df = pd.DataFrame(lookalike_dict.items(), columns=["CustomerID", "Lookalikes"])
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv file generated successfully! ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers_info["SignificanceScore"] = significance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers_info["SignificanceScore"] = significance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers_info["SignificanceScore"] = significance_scores
A value

Lookalike.csv file generated successfully! 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers_info["SignificanceScore"] = significance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers_info["SignificanceScore"] = significance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers_info["SignificanceScore"] = significance_scores
