In [20]:
# Colab-only helper: prompts for files in the browser; the recorded output shows them
# being saved into the runtime's working directory.
from google.colab import files
# NOTE(review): the recorded output shows each file saved under a " (1)" suffix
# (e.g. "Customers.csv" -> "Customers (1).csv"), so a same-named copy presumably
# already existed. Later reads of "Customers.csv" would then pick up that earlier
# copy rather than this upload — confirm which version is intended.
uploaded = files.upload()  # This will prompt to upload the files

Saving Transactions.csv to Transactions (1).csv
Saving Products.csv to Products (1).csv
Saving Customers.csv to Customers (1).csv


In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three source tables (customers, products, transactions) from CSV.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default (inner) join on the shared key column.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

# Feature engineering: collapse the merged transaction rows into one row per customer.
customer_features = (
    data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),  # total spend
        Quantity=("Quantity", "sum"),  # total quantity purchased
        # mode() returns the most frequent value(s); [0] keeps the first one returned
        Category=("Category", lambda purchases: purchases.mode()[0]),
        Region=("Region", "first"),  # first Region value in the customer's group
    )
    .reset_index()
)

# Numeric features: rescale total spend and total quantity with MinMaxScaler.
numerical_features = customer_features[["TotalValue", "Quantity"]].to_numpy()
scaler = MinMaxScaler()
scaled_numerical = scaler.fit_transform(numerical_features)

# Categorical features: one-hot encode Region and Category, converted to a dense array.
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(
    customer_features.loc[:, ["Region", "Category"]]
).toarray()

# Final feature table: scaled numeric columns first, then the one-hot columns,
# one row per customer and indexed by CustomerID.
final_features = pd.DataFrame(
    np.concatenate([scaled_numerical, encoded_features], axis=1),
    index=customer_features["CustomerID"],
)

# Calculate Cosine Similarity
# Pairwise cosine similarity between the rows of final_features; rows follow the
# order of customer_features, so entry [i, j] compares the customers at positions i and j.
similarity_matrix = cosine_similarity(final_features)

# Generate Recommendations
# Maps each of the first 20 customers to its 3 most similar *other* customers,
# as (CustomerID, similarity) pairs ordered from most to least similar.
lookalike_data = {}
customer_ids = customer_features["CustomerID"].values
for idx, customer_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Rank every customer by descending similarity. A stable sort keeps tied scores
    # in positional order, so the ranking is deterministic across runs.
    ranked_indices = np.argsort(-similarity_matrix[idx], kind="stable")
    # Drop the customer itself by position rather than assuming it is ranked first:
    # an identical feature vector ties with the self-similarity, and float rounding can
    # leave the self-score marginally below a neighbour's, in which case slicing [1:4]
    # would keep the customer as its own lookalike and discard a real neighbour.
    similar_indices = ranked_indices[ranked_indices != idx][:3]  # Top 3 similar
    similar_customers = [(customer_ids[i], similarity_matrix[idx][i]) for i in similar_indices]
    lookalike_data[customer_id] = similar_customers

# Build the lookalike table: one row per customer, with the neighbour list stored
# as its Python string representation, then write it to Lookalike.csv.
lookalike_df = pd.DataFrame(
    [(customer, str(matches)) for customer, matches in lookalike_data.items()],
    columns=["cust_id", "similar_customers"],
)
lookalike_df.to_csv("Lookalike.csv", index=False)

# Show the resulting table (up to 20 rows).
print(lookalike_df.head(20))


   cust_id                                  similar_customers
0    C0001  [('C0184', 0.9997715277163312), ('C0048', 0.99...
1    C0002  [('C0088', 0.9998459356527598), ('C0092', 0.99...
2    C0003  [('C0076', 0.9977575893729745), ('C0052', 0.99...
3    C0004  [('C0169', 0.9978038040040306), ('C0087', 0.99...
4    C0005  [('C0186', 0.9998420404059932), ('C0146', 0.99...
5    C0006  [('C0126', 0.9996118645056206), ('C0187', 0.99...
6    C0007  [('C0146', 0.9999998323693366), ('C0115', 0.99...
7    C0008  [('C0160', 0.9905422274011596), ('C0059', 0.98...
8    C0009  [('C0198', 0.9999972410802239), ('C0061', 0.99...
9    C0010  [('C0111', 0.9964431632741196), ('C0062', 0.99...
10   C0011  [('C0006', 0.9992885999027596), ('C0137', 0.99...
11   C0012  [('C0163', 0.9995437048897474), ('C0113', 0.99...
12   C0013  [('C0099', 0.9980808806093808), ('C0108', 0.99...
13   C0014  [('C0060', 0.9998756502898258), ('C0089', 0.98...
14   C0015  [('C0131', 0.9964314311554922), ('C0036', 0.99...
15   C00

In [23]:
# Rebuild the lookalike table with a human-readable neighbour column:
# "(id, score); (id, score); ..." with scores fixed to four decimals,
# then overwrite Lookalike.csv with it.
lookalike_df = pd.DataFrame(
    [
        (
            customer,
            "; ".join(f"({neighbour}, {similarity:.4f})" for neighbour, similarity in matches),
        )
        for customer, matches in lookalike_data.items()
    ],
    columns=["cust_id", "similar_customers"],
)
lookalike_df.to_csv("Lookalike.csv", index=False)

# Show the resulting table (up to 20 rows).
print(lookalike_df.head(20))


   cust_id                                  similar_customers
0    C0001  (C0184, 0.9998); (C0048, 0.9995); (C0190, 0.9988)
1    C0002  (C0088, 0.9998); (C0092, 0.9973); (C0106, 0.9958)
2    C0003  (C0076, 0.9978); (C0052, 0.9964); (C0031, 0.9955)
3    C0004  (C0169, 0.9978); (C0087, 0.9974); (C0155, 0.9939)
4    C0005  (C0186, 0.9998); (C0146, 0.9992); (C0007, 0.9992)
5    C0006  (C0126, 0.9996); (C0187, 0.9995); (C0011, 0.9993)
6    C0007  (C0146, 1.0000); (C0115, 0.9994); (C0005, 0.9992)
7    C0008  (C0160, 0.9905); (C0059, 0.9886); (C0079, 0.9875)
8    C0009  (C0198, 1.0000); (C0061, 0.9944); (C0062, 0.9925)
9    C0010  (C0111, 0.9964); (C0062, 0.9962); (C0103, 0.9928)
10   C0011  (C0006, 0.9993); (C0137, 0.9989); (C0126, 0.9987)
11   C0012  (C0163, 0.9995); (C0113, 0.9980); (C0195, 0.9972)
12   C0013  (C0099, 0.9981); (C0108, 0.9974); (C0107, 0.9848)
13   C0014  (C0060, 0.9999); (C0089, 0.9830); (C0172, 0.9766)
14   C0015  (C0131, 0.9964); (C0036, 0.9949); (C0094, 0.9937)
15   C00

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Simulated data for demonstration
# NOTE: this rebinds `customers`, `products` and `transactions`, replacing any frames
# previously loaded under those names in the same kernel.
# Seed NumPy's global RNG so every run draws the same synthetic values; without it
# the similarity results change on each execution and cannot be reproduced.
np.random.seed(42)

# 20 customers (C0001..C0020) with a random region and consecutive daily signup dates.
customers = pd.DataFrame({
    'CustomerID': [f'C{i:04}' for i in range(1, 21)],
    'Region': np.random.choice(['North America', 'Europe', 'Asia', 'South America'], 20),
    'SignupDate': pd.date_range(start='2023-01-01', periods=20)
})

# 20 products (P001..P020) with a random category and an integer price in [10, 500).
products = pd.DataFrame({
    'ProductID': [f'P{i:03}' for i in range(1, 21)],
    'Category': np.random.choice(['Books', 'Electronics', 'Home Decor', 'Clothing'], 20),
    'Price': np.random.randint(10, 500, 20)
})

# 100 transactions (T00001..T00100), each pairing a randomly chosen customer and product.
# Quantity is an integer in [1, 5); TotalValue is drawn independently in [20, 2000),
# i.e. it is not derived from Price * Quantity.
transactions = pd.DataFrame({
    'TransactionID': [f'T{i:05}' for i in range(1, 101)],
    'CustomerID': np.random.choice(customers['CustomerID'], 100),
    'ProductID': np.random.choice(products['ProductID'], 100),
    'TransactionDate': pd.date_range(start='2023-01-01', periods=100),
    'Quantity': np.random.randint(1, 5, 100),
    'TotalValue': np.random.randint(20, 2000, 100)
})

# Join each transaction with its customer and product attributes
# (pandas' default inner join on the shared key column).
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

# Feature engineering: one summary row per customer.
customer_features = (
    data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),  # total spend
        Quantity=("Quantity", "sum"),  # total quantity purchased
        # mode() returns the most frequent value(s); [0] keeps the first one returned
        Category=("Category", lambda purchases: purchases.mode()[0]),
        Region=("Region", "first"),  # first Region value in the customer's group
    )
    .reset_index()
)

# Numeric features: rescale total spend and total quantity with MinMaxScaler.
numerical_features = customer_features[["TotalValue", "Quantity"]].to_numpy()
scaler = MinMaxScaler()
scaled_numerical = scaler.fit_transform(numerical_features)

# Categorical features: one-hot encode Region and Category, converted to a dense array.
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(
    customer_features.loc[:, ["Region", "Category"]]
).toarray()

# Final feature table: scaled numeric columns first, then the one-hot columns,
# one row per customer and indexed by CustomerID.
final_features = pd.DataFrame(
    np.concatenate([scaled_numerical, encoded_features], axis=1),
    index=customer_features["CustomerID"],
)

# Calculate Cosine Similarity
# Pairwise cosine similarity between the rows of final_features; rows follow the
# order of customer_features, so entry [i, j] compares the customers at positions i and j.
similarity_matrix = cosine_similarity(final_features)

# Generate Lookalike Data
# Maps each of the first 20 customers to its 3 most similar *other* customers,
# as (CustomerID, similarity rounded to 4 decimals) pairs, most similar first.
lookalike_data = {}
customer_ids = customer_features["CustomerID"].values
for idx, customer_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Rank every customer by descending similarity. A stable sort keeps tied scores
    # in positional order, so the ranking is deterministic across runs.
    ranked_indices = np.argsort(-similarity_matrix[idx], kind="stable")
    # Drop the customer itself by position rather than assuming it is ranked first:
    # an identical feature vector ties with the self-similarity, and float rounding can
    # leave the self-score marginally below a neighbour's, in which case slicing [1:4]
    # would keep the customer as its own lookalike and discard a real neighbour.
    similar_indices = ranked_indices[ranked_indices != idx][:3]  # Top 3 similar customers
    similar_customers = [(customer_ids[i], round(similarity_matrix[idx][i], 4)) for i in similar_indices]
    lookalike_data[customer_id] = similar_customers

# Assemble the Lookalike.csv layout: one row per customer, with its neighbours kept
# as a list of (customer id, score) tuples.
lookalike_df = pd.DataFrame({
    "cust_id": [*lookalike_data],
    "similar_customers": [
        [(neighbour, similarity) for neighbour, similarity in matches]
        for matches in lookalike_data.values()
    ],
})

# Preview the table (up to 20 rows) as the cell's output.
lookalike_df.head(20)

Unnamed: 0,cust_id,similar_customers
0,C0001,"[(C0015, 0.984), (C0005, 0.9755), (C0004, 0.94..."
1,C0002,"[(C0014, 0.9986), (C0016, 0.9696), (C0019, 0.8..."
2,C0003,"[(C0006, 0.951), (C0010, 0.606), (C0019, 0.6054)]"
3,C0004,"[(C0005, 0.9941), (C0015, 0.9556), (C0001, 0.9..."
4,C0005,"[(C0004, 0.9941), (C0015, 0.9805), (C0001, 0.9..."
5,C0006,"[(C0003, 0.951), (C0019, 0.6875), (C0010, 0.67..."
6,C0007,"[(C0010, 0.8442), (C0009, 0.4983), (C0004, 0.4..."
7,C0008,"[(C0018, 0.9946), (C0013, 0.5859), (C0012, 0.5..."
8,C0009,"[(C0020, 0.9175), (C0017, 0.9), (C0013, 0.8666)]"
9,C0010,"[(C0007, 0.8442), (C0006, 0.6789), (C0012, 0.6..."


In [17]:
# Colab-only: passes `lookalike_df` to an InteractiveSheet; the recorded output is a
# Google Sheets URL, so this presumably mirrors the frame into a spreadsheet.
# NOTE(review): this cell's execution count (17) is lower than the cells above (20-24),
# so it was presumably run against an earlier `lookalike_df` — re-run it after the cell
# that builds the frame to keep the sheet in sync.
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=lookalike_df)

https://docs.google.com/spreadsheets/d/1gSUGBokbKPaHu7P-ArN96XHDll1Xk-Cj6OB1lIKuoXk#gid=0
