In [38]:
import pandas as pd

# Load data
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')



In [39]:
# Merge transaction data with customer data
transactions_with_customers = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')

# Merge transaction data with product data to get product information
transactions_with_products = pd.merge(transactions_with_customers, products_df, on='ProductID', how='left')

# Preview the merged data
transactions_with_products.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [40]:
# Feature engineering: customer-level features
customer_features = transactions_with_products.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    num_products=('ProductID', 'nunique'),
    product_categories=('Category', lambda x: ','.join(x.unique())),
    region=('Region', 'first'),  # Assume region is the same for each customer
    signup_date=('SignupDate', 'first')
).reset_index()

# Convert signup date to days since signup
customer_features['signup_date'] = pd.to_datetime(customer_features['signup_date'])
customer_features['days_since_signup'] = (pd.to_datetime('today') - customer_features['signup_date']).dt.days


In [37]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Prepare the data: scaling numerical features and one-hot encoding categorical features
numerical_features = ['total_spent', 'num_transactions', 'num_products', 'days_since_signup']
categorical_features = ['region', 'product_categories']

# Numerical transformation (scaling)
scaler = StandardScaler()

# Categorical transformation (one-hot encoding)
encoder = OneHotEncoder(sparse_output=False)  # Use sparse_output instead of sparse

# Full preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_features),
        ('cat', encoder, categorical_features)
    ]
)

# Apply the transformations
X = preprocessor.fit_transform(customer_features)

# Calculate the cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(X)

# Create a DataFrame for easier handling
cos_sim_df = pd.DataFrame(cos_sim, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])



In [34]:
# Function to get top 3 lookalike customers for each customer
def get_top_lookalikes(cust_id, top_n=3):
    # Get similarity scores for the given customer
    similarity_scores = cos_sim_df[cust_id].sort_values(ascending=False)
    
    # Exclude the customer itself (self-similarity)
    similarity_scores = similarity_scores.drop(cust_id)
    
    # Get top_n similar customers
    top_lookalikes = similarity_scores.head(top_n)
    
    # Return a list of customer IDs and their similarity scores
    return list(zip(top_lookalikes.index, top_lookalikes.values))

# Generate lookalikes for the first 20 customers
lookalikes = {}
for cust_id in customer_features['CustomerID'][:20]:
    lookalikes[cust_id] = get_top_lookalikes(cust_id)

# Preview the lookalike recommendations
lookalikes


{'C0001': [('C0152', np.float64(0.7197889775848073)),
  ('C0118', np.float64(0.7028306235557349)),
  ('C0153', np.float64(0.6481888458597074))],
 'C0002': [('C0159', np.float64(0.9345855044457588)),
  ('C0007', np.float64(0.7541799485978633)),
  ('C0005', np.float64(0.7515938514660171))],
 'C0003': [('C0031', np.float64(0.9797955227586055)),
  ('C0158', np.float64(0.7669778435126272)),
  ('C0085', np.float64(0.7088212449512772))],
 'C0004': [('C0104', np.float64(0.8605884225145185)),
  ('C0099', np.float64(0.848986595720709)),
  ('C0102', np.float64(0.8462251603307036))],
 'C0005': [('C0159', np.float64(0.8135090347799416)),
  ('C0007', np.float64(0.8023057550655924)),
  ('C0043', np.float64(0.768007695629993))],
 'C0006': [('C0187', np.float64(0.8743100127032233)),
  ('C0138', np.float64(0.6075570460934051)),
  ('C0085', np.float64(0.5774543673714764))],
 'C0007': [('C0140', np.float64(0.842410653536942)),
  ('C0159', np.float64(0.8047436420589594)),
  ('C0005', np.float64(0.802305755

In [36]:
# Prepare the data for saving
lookalike_data = []
for cust_id, recommendations in lookalikes.items():
    for rec in recommendations:
        lookalike_data.append([cust_id, rec[0], rec[1]])

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

# Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)