In [1]:
import pandas as pd

customers = pd.read_csv("Customers.csv")  
products = pd.read_csv("Products.csv")  
transactions = pd.read_csv("Transactions.csv")  

customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])  
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])  

print("Customers Data:\n", customers.head(), "\n")  
print("Products Data:\n", products.head(), "\n")  
print("Transactions Data:\n", transactions.head())



Customers Data:
   CustomerID        CustomerName         Region SignupDate
0      C0001    Lawrence Carroll  South America 2022-07-10
1      C0002      Elizabeth Lutz           Asia 2022-02-13
2      C0003      Michael Rivera  South America 2024-03-07
3      C0004  Kathleen Rodriguez  South America 2022-10-09
4      C0005         Laura Weber           Asia 2022-08-15 

Products Data:
   ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31 

Transactions Data:
   TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127     

In [2]:
print("Missing values in Transactions:", transactions.isnull().sum().sum())  
print("Missing values in Customers:", customers.isnull().sum().sum())  
print("Missing values in Products:", products.isnull().sum().sum())

Missing values in Transactions: 0
Missing values in Customers: 0
Missing values in Products: 0


In [3]:
import pandas as pd

customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

customer_features = transactions.pivot_table(index="CustomerID", columns="ProductID", values="Quantity", fill_value=0)

print(customer_features.head())


ProductID   P001  P002  P003  P004  P005  P006  P007  P008  P009  P010  ...  \
CustomerID                                                              ...   
C0001        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
C0002        0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
C0003        0.0   4.0   0.0   0.0   0.0   3.0   0.0   0.0   0.0   0.0  ...   
C0004        0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0   0.0   0.0  ...   
C0005        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

ProductID   P091  P092  P093  P094  P095  P096  P097  P098  P099  P100  
CustomerID                                                              
C0001        0.0   0.0   0.0   0.0   0.0   2.0   0.0   0.0   0.0   0.0  
C0002        0.0   0.0   0.0   0.0   2.0   0.0   0.0   0.0   0.0   0.0  
C0003        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
C0004        0.0   0.0   0.0   0.0   0.0   0.0   3.0   0.0   0.0   0.0  
C0005   

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(customer_features)

similarity_df = pd.DataFrame(similarity_matrix, index=customer_features.index, columns=customer_features.index)

print(similarity_df.head())


CustomerID  C0001  C0002     C0003     C0004     C0005  C0006     C0007  \
CustomerID                                                                
C0001         1.0    0.0  0.000000  0.000000  0.000000    0.0  0.214834   
C0002         0.0    1.0  0.000000  0.000000  0.000000    0.0  0.000000   
C0003         0.0    0.0  1.000000  0.097980  0.308697    0.0  0.000000   
C0004         0.0    0.0  0.097980  1.000000  0.168034    0.0  0.000000   
C0005         0.0    0.0  0.308697  0.168034  1.000000    0.0  0.000000   

CustomerID     C0008  C0009     C0010  ...     C0191     C0192  C0193  \
CustomerID                             ...                              
C0001       0.000000    0.0  0.000000  ...  0.061721  0.000000    0.0   
C0002       0.262071    0.0  0.000000  ...  0.000000  0.000000    0.0   
C0003       0.313786    0.0  0.000000  ...  0.000000  0.000000    0.0   
C0004       0.048038    0.0  0.149854  ...  0.000000  0.000000    0.0   
C0005       0.000000    0.0  0.00000

In [5]:
def get_top_3_lookalikes(customer_id):
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    return list(zip(similar_customers.index, similar_customers.values))

lookalike_results = {cust_id: get_top_3_lookalikes(cust_id) for cust_id in customers['CustomerID'].head(20)}

for customer, lookalikes in lookalike_results.items():
    print(f"Customer {customer} is similar to:", lookalikes)


Customer C0001 is similar to: [('C0097', 0.5477225575051661), ('C0194', 0.469668218313862), ('C0199', 0.4381780460041329)]
Customer C0002 is similar to: [('C0091', 0.3801987652174059), ('C0030', 0.37282185960072), ('C0071', 0.329914439536929)]
Customer C0003 is similar to: [('C0134', 0.5199469468957452), ('C0181', 0.5175973113765044), ('C0144', 0.39999999999999997)]
Customer C0004 is similar to: [('C0070', 0.4988876515698588), ('C0132', 0.3843075691322091), ('C0063', 0.3360672201667223)]
Customer C0005 is similar to: [('C0096', 0.6482037235521645), ('C0055', 0.5144957554275265), ('C0064', 0.3328770246548891)]
Customer C0006 is similar to: [('C0058', 0.6488856845230502), ('C0040', 0.5803810000880093), ('C0196', 0.4643048000704075)]
Customer C0007 is similar to: [('C0020', 0.5883484054145521), ('C0079', 0.49613893835683387), ('C0026', 0.36313651960128146)]
Customer C0008 is similar to: [('C0144', 0.39223227027636803), ('C0088', 0.33968311024337877), ('C0165', 0.32144907266094414)]
Custom

In [6]:

lookalike_df = pd.DataFrame([
    {"CustomerID": cust, "Lookalike1": top[0][0], "Similarity1": top[0][1], 
     "Lookalike2": top[1][0], "Similarity2": top[1][1], 
     "Lookalike3": top[2][0], "Similarity3": top[2][1]}
    for cust, top in lookalike_results.items()
])

lookalike_df.to_csv("RuhulFatimaAbdi_Lookalike.csv", index=False)
print("Lookalike model results saved as RuhulFatimaAbdi_Lookalike.csv!")


Lookalike model results saved as RuhulFatimaAbdi_Lookalike.csv!
