# Lookalike model development 

## Step 1: Loading the datasets
We import the necessary libraries and load the customer, product, and transaction datasets from their CSV files.

In [1]:
import pandas as pd

In [3]:
# Read the three input tables, in order: customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [4]:
# Show the first five rows of every table as a quick sanity check.
for table in (customers, products, transactions):
    print(table.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

## Step 2: Merging the datasets
We merge the transaction data with the customer data on `CustomerID`, so that every transaction row also carries the details of the customer who made it.

In [5]:
# Inner join: keep only transactions whose CustomerID exists in `customers`.
merged_data = pd.merge(transactions, customers, how='inner', on='CustomerID')

## Step 3: Creating features for each customer
We summarize the transaction data into one row per customer, with features that represent customer behavior: total quantity purchased, total spending, and the customer's region.

In [6]:
# One row per customer, built with named aggregations.
per_customer = merged_data.groupby('CustomerID')
features = per_customer.agg(
    Quantity=('Quantity', 'sum'),        # total units bought
    TotalValue=('TotalValue', 'sum'),    # total amount spent
    Region=('Region', 'first'),          # first region seen for the customer
).reset_index()

# Quick look at the resulting feature table
print(features.head())


  CustomerID  Quantity  TotalValue         Region
0      C0001        12     3354.52  South America
1      C0002        10     1862.74           Asia
2      C0003        14     2725.38  South America
3      C0004        23     5354.88  South America
4      C0005         7     2034.24           Asia


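## Step 4: Calculate Similarity Scores
To find similar customers, we compute the pairwise cosine similarity between customers using the numeric features created above (`Quantity` and `TotalValue`), then pick the three highest-scoring other customers for each customer.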

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Normalizing the features for similarity calculation.
# Each column is standardized to zero mean and unit variance. Without this,
# TotalValue (thousands) swamps Quantity (tens), every customer vector points
# in almost the same direction, and all cosine similarities collapse to ~1.0.
feature_values = features[['Quantity', 'TotalValue']].astype(float)
feature_std = feature_values.std(ddof=0).replace(0, 1.0)  # constant column: avoid 0/0 -> NaN
features_normalized = ((feature_values - feature_values.mean()) / feature_std).values

# Square matrix: entry [i, j] is the cosine similarity between the customers
# in rows i and j of `features`.
similarity_matrix = cosine_similarity(features_normalized)

# Function to get the top lookalikes (3 by default) for a given customer ID
def get_lookalikes(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Reads the notebook-level ``features`` table and ``similarity_matrix``;
    row ``i`` of the matrix is taken to describe row ``i`` of ``features``.

    Parameters
    ----------
    customer_id
        A value present in ``features['CustomerID']``.
    top_n : int, default 3
        Number of lookalikes to return.

    Returns
    -------
    tuple
        ``(ids, scores)`` where ``ids`` is a one-column DataFrame
        (``CustomerID``) of the lookalikes ordered from most to least similar,
        and ``scores`` is the array of their similarity scores in that order.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``features``.
    """
    # Positional row of the customer; indexing [0] raises IndexError if absent.
    positions = (features['CustomerID'] == customer_id).to_numpy().nonzero()[0]
    index = positions[0]

    scores = similarity_matrix[index]
    ranked = scores.argsort()[::-1]  # most similar first

    # Drop the customer itself explicitly. Slicing off the last argsort entry
    # is unsafe: ties or floating-point rounding can rank another customer at
    # or above the self-similarity, which would return self as a lookalike.
    ranked = ranked[ranked != index][:top_n]

    return features.iloc[ranked][['CustomerID']], scores[ranked]

# Map each of the first 20 customers to its [(lookalike_id, score), ...] list
lookalikes = {}
for cust in features['CustomerID'].iloc[:20]:
    ids_frame, sims = get_lookalikes(cust)
    lookalikes[cust] = [(match_id, sim) for match_id, sim in zip(ids_frame['CustomerID'], sims)]


## Step 5: Save Lookalike Results
Finally, we save the results to `Lookalike.csv`, with one row per (customer, lookalike) pair and its similarity score.

In [12]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one record
# per (customer, lookalike) pair.
lookalike_records = [
    {
        'CustomerID': cust_id,
        'Lookalike_ID': match[0],
        'Score': match[1],
    }
    for cust_id, matches in lookalikes.items()
    for match in matches
]

# Tabulate the records and write them out without the index column.
lookalike_df = pd.DataFrame(lookalike_records)
lookalike_df.to_csv('Lookalike.csv', index=False)
