In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Step 1: Load customer and transaction data
customers = pd.read_csv('Customers.csv')  # Assuming customers data is in a CSV file
transactions = pd.read_csv('Transactions.csv')  # Assuming transactions data is in a CSV file
products = pd.read_csv('Products.csv')  # Assuming product data is in a CSV file

# Step 2: Merge transaction data with product data based on 'ProductID'
merged_data = pd.merge(transactions, products, on='ProductID', how='inner')

# Step 3: Create customer profile
customer_profile = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    purchase_count=('TransactionID', 'count')
).reset_index()

# Merge customer profiles with the original customer data
customers = pd.merge(customers, customer_profile, on='CustomerID', how='left')

# Step 4: Handle missing values in the profile data
# Option 1: Fill missing values with 0 (you can also use other strategies)
customers['total_spent'].fillna(0, inplace=True)
customers['purchase_count'].fillna(0, inplace=True)

# Option 2: Alternatively, drop rows with missing values
# customers = customers.dropna(subset=['total_spent', 'purchase_count'])

# Step 5: Feature scaling
scaler = StandardScaler()
customers_scaled = customers[['total_spent', 'purchase_count']]
customers_scaled = scaler.fit_transform(customers_scaled)

# Step 6: Calculate cosine similarity between customers based on their profiles
similarity_matrix = cosine_similarity(customers_scaled)

# Step 7: Create a function to get the top 3 lookalike customers for each customer
def get_top_lookalikes(customer_id, similarity_matrix, top_n=3):
    customer_idx = customers[customers['CustomerID'] == customer_id].index[0]
    similarity_scores = similarity_matrix[customer_idx]
    similar_customer_indices = similarity_scores.argsort()[-top_n-1:-1][::-1]
    similar_customers = customers.iloc[similar_customer_indices][['CustomerID']]
    similar_customers['SimilarityScore'] = similarity_scores[similar_customer_indices]
    return similar_customers

# Step 8: Generate the lookalike map for the first 20 customers
lookalike_map = {}
for customer_id in customers['CustomerID'][:20]:
    top_lookalikes = get_top_lookalikes(customer_id, similarity_matrix)
    lookalike_map[customer_id] = top_lookalikes.values.tolist()

# Step 9: Save the lookalike map to a CSV file
lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index')
lookalike_df.columns = ['Lookalike1', 'Similarity1', 'Lookalike2', 'Similarity2', 'Lookalike3', 'Similarity3']
lookalike_df.to_csv('Lookalike.csv')

print("Lookalike model completed and saved as 'Lookalike.csv'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['total_spent'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['purchase_count'].fillna(0, inplace=True)


<class 'ValueError'>: Length mismatch: Expected axis has 3 elements, new values have 6 elements