<a href="https://colab.research.google.com/github/ChallaAashritha/zeotapassignment/blob/main/FirstName_LastName_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [8]:
# Load the three raw tables from the working directory, in the order
# customers -> products -> transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [9]:
# Attach customer and product attributes to every transaction row.
# Both joins are left joins, so each transaction is kept even when its
# CustomerID / ProductID has no match in the lookup table.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

In [10]:
# Per-customer aggregates derived from the merged transaction table.
by_customer = merged_data.groupby('CustomerID')

# Total monetary value of all of a customer's transactions.
customer_spending = (
    by_customer['TotalValue']
    .sum()
    .rename('TotalSpending')
    .reset_index()
)

# Number of (non-null) transactions per customer.
customer_frequency = (
    by_customer['TransactionID']
    .count()
    .rename('TransactionFrequency')
    .reset_index()
)

# Space-separated list of every category the customer bought from (one entry
# per transaction, so repeated purchases repeat the category).
# Category comes from a left merge on ProductID, so it is NaN for any
# transaction whose product is missing from the products table; drop those
# before joining, otherwise str.join raises TypeError on the float NaN.
customer_preferences = (
    by_customer['Category']
    .apply(lambda categories: ' '.join(categories.dropna().astype(str)))
    .reset_index()
)

In [11]:
# Start from the full customer list and left-join each aggregate so that
# customers without any transactions are still present; their missing
# aggregates are filled with neutral values (zero spend / zero frequency /
# empty category text).
customer_features = (
    customers
    .merge(customer_spending, how='left', on='CustomerID')
    .merge(customer_frequency, how='left', on='CustomerID')
    .merge(customer_preferences, how='left', on='CustomerID')
    .fillna({'TotalSpending': 0, 'TransactionFrequency': 0, 'Category': ''})
)
# Standardise the two behavioural columns (zero mean, unit variance) so that
# neither dominates the similarity computation purely because of its scale.
behaviour_columns = ['TotalSpending', 'TransactionFrequency']
scaler = StandardScaler()
scaled_behaviour = scaler.fit_transform(customer_features[behaviour_columns])
customer_features[behaviour_columns] = scaled_behaviour
# One-hot encode Region.
# * drop_first=False: dropping a level is a regression trick to avoid
#   collinearity; for a similarity measure it would give the dropped region an
#   all-zero encoding, so two customers from that region would share no region
#   signal while customers from any other region would. Keeping every level
#   treats all regions symmetrically.
# * dtype=float: recent pandas versions emit boolean dummies, which would turn
#   `numeric_features.values` into a mixed object array; forcing float keeps
#   the feature matrix purely numeric.
customer_features = pd.get_dummies(
    customer_features, columns=['Region'], drop_first=False, dtype=float
)

# TF-IDF representation of each customer's purchased-category text.
# NOTE(review): the default tokenizer splits on word boundaries, so a
# multi-word category name would become several tokens - confirm this is the
# intended weighting.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(customer_features['Category'])

# Everything that is not an identifier, free text or date is a model feature.
numeric_features = customer_features.drop(
    ['CustomerID', 'CustomerName', 'SignupDate', 'Category'], axis=1
)

# Stack numeric/one-hot features next to the dense TF-IDF features.
# ignore_index=True relabels the columns 0..n-1; without it both halves keep
# their own 0-based labels and the result has duplicate column names.
final_features = pd.concat(
    [pd.DataFrame(numeric_features.values), pd.DataFrame(tfidf_matrix.toarray())],
    axis=1,
    ignore_index=True,
)
# Pairwise cosine similarity between every pair of customers; row/column i
# corresponds to customer_ids[i].
similarity_matrix = cosine_similarity(final_features)
customer_ids = customer_features['CustomerID'].tolist()

NUM_TARGET_CUSTOMERS = 20  # lookalikes are produced for the first N customers
TOP_K = 3                  # number of lookalikes kept per customer

# Maps each target CustomerID to its TOP_K most similar *other* customers as
# (CustomerID, similarity score rounded to 3 decimals) tuples, best first.
lookalike_map = {}
for i, cust_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Exclude the customer themselves explicitly by index. Relying on the
    # self-match being the first element after sorting breaks when another
    # customer ties at the top score (identical features / float noise) or
    # when the customer's own vector is all zeros (self-similarity 0).
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    # sorted() is stable, so ties keep their original customer order.
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:TOP_K]
    # float(...) converts the NumPy scalar to a plain Python float so the
    # value is rendered as a bare number (e.g. 0.987) when the list is
    # written out as text, rather than as a NumPy scalar repr.
    lookalike_map[cust_id] = [
        (customer_ids[idx], round(float(score), 3)) for idx, score in top_matches
    ]
# Flatten the lookalike mapping into one row per target customer and write it
# to disk. The 'lookalikes' column holds the list of (CustomerID, score)
# tuples, which to_csv stores as its text representation.
lookalike_rows = [
    {'cust_id': target_id, 'lookalikes': matches}
    for target_id, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)