In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [4]:
# Load the three source tables (customers, products, transactions) from the
# working directory, in that order.
cs, ps, ts = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [5]:
# Parse the date columns into datetime values, one (frame, column) pair at a time.
for frame, date_column in ((cs, 'SignupDate'), (ts, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])


In [7]:
# Merge the dataframes
# Each transaction row is enriched with its customer (joined on 'CustomerID')
# and its product (joined on 'ProductID'). Left joins keep every transaction,
# so a transaction whose ID has no match ends up with missing values in the
# joined columns.
merged_data = pd.merge(ts, cs, on='CustomerID', how='left')
merged_data = pd.merge(merged_data, ps, on='ProductID', how='left')


def _join_text(values):
    """Join the non-missing entries of a group into one space-separated string.

    Missing values are dropped first because ``' '.join`` raises TypeError on
    NaN, which the left join above produces for unmatched products.
    """
    return ' '.join(values.dropna().astype(str))


# Aggregate transaction data per customer
customer_transactions = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'Quantity': 'sum',  # Total quantity purchased
    'Category': _join_text,  # Combine product categories
    'ProductName': _join_text,  # Combine product names
}).reset_index()

In [12]:
# Merge customer data with aggregated transaction data
# The left join keeps customers that have no transactions; their aggregate
# columns come back missing. Fill each aggregate with a neutral value of its
# own type (0 for the numeric totals, '' for the text columns) rather than
# writing the integer 0 into every column of the frame.
customer_features = cs.merge(customer_transactions, on='CustomerID', how='left').fillna({
    'TotalValue': 0,
    'Quantity': 0,
    'Category': '',
    'ProductName': '',
})


In [13]:
# Standardize the two numeric spending features so each has zero mean and
# unit variance before they are combined with the text features.
numeric_columns = ['TotalValue', 'Quantity']
scaler = StandardScaler()
numerical_features = scaler.fit_transform(customer_features.loc[:, numeric_columns])

In [15]:
# TfidfVectorizer expects string documents, so cast the 'Category' column to
# str (any non-string entry becomes its text form) and store it back.
category_text = customer_features['Category'].astype(str)
customer_features['Category'] = category_text

# Encode each customer's combined category text as a TF-IDF vector.
vectorizer = TfidfVectorizer()
category_features = vectorizer.fit_transform(category_text)

In [16]:
# Build one dense feature matrix per customer: the scaled numeric columns
# followed by the TF-IDF category columns (densified from sparse form).
category_dense = category_features.toarray()
final_features = np.concatenate([numerical_features, category_dense], axis=1)

In [17]:
# Pairwise cosine similarity between every pair of customer feature rows;
# entry [i, j] compares row i of final_features with row j.
similarity_matrix = cosine_similarity(final_features, final_features)

In [18]:
# Create Lookalike Recommendations
# For every customer, keep the 3 other customers with the highest cosine
# similarity as (customer_id, score) tuples.
customer_ids = customer_features['CustomerID'].tolist()  # plain Python values, positional order
lookalike_map = {}
for idx, customer_id in enumerate(customer_ids):
    row_scores = similarity_matrix[idx]
    # Stable descending sort: tied scores keep their original row order.
    ranked_positions = np.argsort(-row_scores, kind='stable')
    # Exclude the customer's own row explicitly. Slicing off the first ranked
    # entry is not safe: when another customer ties at the top score, the
    # customer's own row is not guaranteed to be ranked first.
    neighbour_positions = [int(pos) for pos in ranked_positions if pos != idx][:3]
    # float(...) stores native floats so the scores serialize as plain numbers.
    lookalike_map[customer_id] = [
        (customer_ids[pos], float(row_scores[pos])) for pos in neighbour_positions
    ]

In [20]:
# Build the output records for the first 20 customers listed in `cs`; a
# customer missing from lookalike_map gets an empty recommendation list.
lookalike_data = [
    {
        'cust_id': first_customer_id,
        'lookalikes': lookalike_map.get(first_customer_id, []),
    }
    for first_customer_id in cs['CustomerID'].head(20)
]

In [22]:
# Tabulate the records and write them to CSV without the row index.
output_path = 'Dhanesh_Gore_Lookalike.csv'
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv(output_path, index=False)