In [17]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the three raw tables used by the notebook.
df1 = pd.read_csv('customers.csv')
df2 = pd.read_csv('products.csv')
df3 = pd.read_csv('transactions.csv')

# Names of the per-customer transaction features built below.
feature_cols = ['total_spent', 'transaction_count', 'avg_transaction_value']

# Aggregate transactions per customer: sum, count and mean of TotalValue.
df3_features = df3.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TotalValue', 'count'),
    avg_transaction_value=('TotalValue', 'mean')
).reset_index()

# Left join keeps every customer; customers with no rows in df3_features
# end up with NaN in all three feature columns.
customer_data = pd.merge(df1, df3_features, on='CustomerID', how='left')

# Show how many customers are missing each feature.
print(customer_data[feature_cols].isnull().sum())

# Drop customers without transaction features. The original followed this
# with a mean-imputing fillna on the same columns, which could never fill
# anything because the NaN rows were already gone; that no-op is removed.
customer_data = customer_data.dropna(subset=feature_cols)



total_spent              1
transaction_count        1
avg_transaction_value    1
dtype: int64


In [16]:
# Step 1: Merge customer profile data with the per-customer transaction features.
# The left join keeps customers that have no transactions; their features are NaN.
customer_data = pd.merge(df1, df3_features, on='CustomerID', how='left')

feature_cols = ['total_spent', 'transaction_count', 'avg_transaction_value']
top_n = 3  # number of lookalikes reported per customer

# Step 2: Standardize the features so they are on the same scale.
scaler = StandardScaler()
customer_data[feature_cols] = scaler.fit_transform(customer_data[feature_cols])

# Report customers whose features are still missing after scaling, then drop
# them. The index is reset so that row labels match the row positions of the
# similarity matrix computed below. (The original's fillna after dropna was a
# no-op and has been removed.)
print(customer_data[feature_cols].isnull().sum())
customer_data = customer_data.dropna(subset=feature_cols).reset_index(drop=True)

# Step 3: Calculate similarity between customers using cosine similarity.
# Only the three transaction features are used (no demographic columns).
similarity_matrix = cosine_similarity(customer_data[feature_cols])

# Step 4: Get the top 3 similar customers for each of the first 20 customers.
# Rows are looked up by CustomerID rather than assumed to sit at position
# (customer number - 1): dropping rows above, or an unsorted customers.csv,
# would otherwise attribute scores to the wrong customer.
# NOTE(review): assumes CustomerID is unique in customers.csv; verify.
customer_ids = customer_data['CustomerID'].to_numpy()
row_of = {cid: row for row, cid in enumerate(customer_ids)}

lookalikes = []

for customer_id in range(1, 21):
    target_id = f'C{customer_id:04d}'
    row = row_of.get(target_id)
    if row is None:
        # Customer is absent or was dropped for missing features.
        continue

    # Similarity of this customer to every retained customer, best first.
    similar_scores = similarity_matrix[row]
    sorted_indices = similar_scores.argsort()[::-1]

    # Keep the best top_n matches, excluding the customer itself. Counting
    # per customer replaces the original cumulative-length test, which broke
    # out immediately for the first customer and left it with no lookalikes.
    top_matches = [idx for idx in sorted_indices if customer_ids[idx] != target_id][:top_n]

    for idx in top_matches:
        lookalikes.append({
            'CustomerID': target_id,
            'Lookalike_CustomerID': customer_ids[idx],
            'Similarity_Score': similar_scores[idx]
        })

# Convert the lookalikes list into a DataFrame (explicit columns keep the
# CSV header intact even if no lookalikes were found).
lookalike_df = pd.DataFrame(
    lookalikes,
    columns=['CustomerID', 'Lookalike_CustomerID', 'Similarity_Score']
)

# Step 5: Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model created successfully!")



total_spent              1
transaction_count        1
avg_transaction_value    1
dtype: int64
Lookalike model created successfully!


In [21]:
# Pairwise cosine similarity between customers, computed from the three
# transaction feature columns of customer_data.
similarity_matrix = cosine_similarity(
    customer_data.loc[:, ['total_spent', 'transaction_count', 'avg_transaction_value']]
)

# Print the resulting square matrix (one row and one column per customer).
print(similarity_matrix)

[[1.         0.99886795 0.99886814 ... 0.96476382 0.99886802 0.99999991]
 [0.99886795 1.         0.99999978 ... 0.97618714 0.99999999 0.99886759]
 [0.99886814 0.99999978 1.         ... 0.97618701 0.99999986 0.99886806]
 ...
 [0.96476382 0.97618714 0.97618701 ... 1.         0.97618715 0.96476351]
 [0.99886802 0.99999999 0.99999986 ... 0.97618715 1.         0.99886771]
 [0.99999991 0.99886759 0.99886806 ... 0.96476351 0.99886771 1.        ]]
