In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline


In [28]:
# Load the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)



In [29]:
# Attach product attributes to every transaction row.
# Both frames carry a `Price` column, so pandas suffixes them: `Price_x` comes
# from `transactions` (left) and `Price_y` from `products` (right).
# `validate='many_to_one'` makes the merge fail loudly if `products` contains a
# duplicated ProductID, which would otherwise silently duplicate transaction
# rows and inflate the per-customer totals computed from this frame.
transactions = pd.merge(
    transactions,
    products,
    on='ProductID',
    validate='many_to_one',
)
transactions.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y'],
      dtype='object')

In [30]:
# Collapse the transaction rows into one row per customer.
# Named aggregation produces the final column names directly, so no rename
# step is needed afterwards.
customer_transactions = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),    # total spend
        Quantity=('Quantity', 'sum'),        # total units bought
        AvgPrice=('Price_x', 'mean'),        # mean of the transaction-side price
        # Most frequent category; takes the first entry returned by mode().
        Category=('Category', lambda values: values.mode()[0]),
    )
    .reset_index()
)

In [31]:

# Join customer profiles with their aggregated purchase features.
# Default (inner) join: only customers present in both frames are kept.
customer_data = customers.merge(customer_transactions, on='CustomerID')

In [32]:
# Feature engineering: parse the signup date and derive account age in whole days,
# measured against the moment this cell is executed.
customer_data = customer_data.assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate']),
    DaysSinceSignup=lambda frame: (pd.Timestamp.now() - frame['SignupDate']).dt.days,
)

In [33]:


# Remove columns that are not used as model features from here on.
customer_data = customer_data.drop(['CustomerName', 'SignupDate'], axis='columns')

In [34]:
# Columns fed to the preprocessing step, grouped by how they are encoded.
numeric_features = ['TotalValue', 'Quantity', 'AvgPrice', 'DaysSinceSignup']
categorical_features = ['Region', 'Category']

# Standardize the numeric columns and one-hot encode the categorical ones.
# Columns that are not listed are dropped by ColumnTransformer's default
# `remainder='drop'`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Build the feature matrix before reporting its shape, so this cell does not
# reference a variable that is only assigned further down the notebook
# (which raises NameError on a fresh kernel).
customer_features = preprocessor.fit_transform(customer_data)

print(f"Transformed features shape: {customer_features.shape}")

Transformed features shape: (199, 12)


In [35]:

# Fit the preprocessing step on the customer table and encode it in one pass.
customer_features = preprocessor.fit_transform(X=customer_data)

In [36]:

# Pairwise cosine similarity between all customers; entry [i, j] compares
# row i of the feature matrix with row j.
similarity_matrix = cosine_similarity(X=customer_features)

In [37]:


def get_top_similar_customers(customer_id, similarity_matrix, customer_data, top_n=3):
    """Return the `top_n` customers most similar to `customer_id`.

    Parameters
    ----------
    customer_id
        Value to look up in ``customer_data['CustomerID']``.
    similarity_matrix
        Square, row-indexable matrix of similarity scores. Row/column ``k`` is
        taken to correspond to the ``k``-th row (by position) of
        ``customer_data``.
    customer_data
        DataFrame with a ``CustomerID`` column, in the same row order as
        ``similarity_matrix``.
    top_n
        Maximum number of lookalikes to return (default 3). Values below 1
        yield an empty list.

    Returns
    -------
    list of (CustomerID, score) tuples
        Sorted by descending score; ties keep row order. The queried customer
        is never included.

    Raises
    ------
    IndexError
        If `customer_id` does not occur in ``customer_data['CustomerID']``.
    """
    customer_ids = customer_data['CustomerID'].to_numpy()

    # Resolve the customer's *position*, not its index label: the similarity
    # matrix is positional, and the frame's index need not be 0..n-1.
    positions = np.flatnonzero((customer_data['CustomerID'] == customer_id).to_numpy())
    if positions.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_data")
    customer_pos = int(positions[0])

    scores = np.asarray(similarity_matrix[customer_pos])

    # Stable descending sort, so tied scores stay in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the customer explicitly instead of assuming it sorts first: another
    # customer with identical features also scores 1.0 and may precede it.
    ranked = ranked[ranked != customer_pos][:max(top_n, 0)]

    return [(customer_ids[pos], scores[pos]) for pos in ranked]

In [38]:
# Generate lookalike recommendations for the first 20 customers:
# maps each CustomerID to its list of (similar CustomerID, score) pairs.
lookalike_map = {
    cid: get_top_similar_customers(cid, similarity_matrix, customer_data)
    for cid in customer_data['CustomerID'].head(20)
}

In [39]:

# Save the results to a CSV file.
# Flatten {customer: [(similar_id, score), ...]} into one row per pair. Building
# the rows directly avoids the stack()/apply(pd.Series) round trip and copes
# with customers that have fewer matches than the others.
lookalike_df = pd.DataFrame(
    [
        (source_id, similar_id, score)
        for source_id, matches in lookalike_map.items()
        for similar_id, score in matches
    ],
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
).set_index('CustomerID')

# The index is named, so the first CSV column gets a `CustomerID` header
# instead of an empty one.
lookalike_df.to_csv('Lookalike.csv')

print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
