In [1]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Collapse the transaction-level table into one feature row per customer.
def _most_frequent(values):
    """Return the first entry of ``values.mode()`` (the most common value)."""
    return values.mode()[0]


aggregations = {
    'Region': 'first',             # first region seen for the customer
    'Category': _most_frequent,    # most purchased product category
    'TotalValue': 'sum',           # total spending
    'Quantity': 'sum',             # total quantity purchased
}
per_customer = merged_data.groupby('CustomerID').agg(aggregations)
customer_features = per_customer.reset_index()

# One-hot encode the categorical columns; drop_first=True omits one level per
# column so the dummy columns are not perfectly collinear.
categorical_columns = ['Region', 'Category']
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
    drop_first=True,
)

# Standardize the numeric columns in place.
numeric_columns = ['TotalValue', 'Quantity']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numeric_columns])
customer_features[numeric_columns] = scaled_values

# Add scaled-up copies of the spending features so they carry more weight in
# the similarity and clustering computations below.
feature_weights = {'TotalValue': 2, 'Quantity': 1.5}
for column, weight in feature_weights.items():
    customer_features[f'Weighted_{column}'] = customer_features[column] * weight

# Feature matrix: every column except the leading CustomerID.
# Note that the unweighted TotalValue/Quantity columns remain in the matrix
# next to their weighted copies.
weighted_features = customer_features.iloc[:, 1:]

# Pairwise cosine similarity between all customers.
similarity_matrix = cosine_similarity(weighted_features)

# Group customers with KMeans (5 clusters is an assumption, not a tuned value);
# the fixed random_state keeps the assignment reproducible.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_assignments = kmeans.fit_predict(weighted_features)
customer_features['Cluster'] = cluster_assignments

# Generate lookalike recommendations, ranked by cosine similarity within each
# customer's cluster.
TOP_N = 3  # the CSV export below expects exactly three lookalikes per customer

all_customer_ids = customer_features['CustomerID'].to_numpy()
all_clusters = customer_features['Cluster'].to_numpy()

recommendations = {}
# First 20 customers; slicing simply yields fewer if the table is smaller.
for i, customer_id in enumerate(all_customer_ids[:20]):
    # Row positions of the other members of this customer's cluster. The
    # customer itself is removed explicitly instead of being assumed to rank
    # first, which is not guaranteed when another customer ties at 1.0.
    peers = (all_clusters == all_clusters[i]).nonzero()[0]
    peers = peers[peers != i]

    # Reuse the precomputed pairwise matrix rather than recomputing the
    # cluster's similarities on every iteration.
    peer_scores = similarity_matrix[i, peers]

    # Stable sort on the negated scores: descending similarity, with ties
    # kept in row order so the result is deterministic.
    top = (-peer_scores).argsort(kind='stable')[:TOP_N]
    similar_customers = all_customer_ids[peers[top]]
    similar_scores = peer_scores[top]

    # Store (customer_id, score) pairs, padded with None so that small
    # clusters still produce exactly TOP_N entries per customer.
    lookalikes = list(zip(similar_customers, similar_scores))
    lookalikes += [None] * (TOP_N - len(lookalikes))
    recommendations[customer_id] = lookalikes

# Save the lookalike table to CSV: one row per customer, three lookalike columns,
# each cell holding a (customer_id, similarity_score) pair.
lookalike_columns = ['Lookalike1', 'Lookalike2', 'Lookalike3']
recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index', columns=lookalike_columns)

# NOTE(review): absolute, machine-specific output path; the target directory
# must already exist for the write to succeed.
output_path = r'D:\programming\programming\python\age&genderdetection\dataintern zepto\task2revised.csv'
recommendations_df.to_csv(output_path, index_label='CustomerID')

# Report only what was actually done: no ranking metric (such as MRR) is
# computed in this cell, so none is claimed here.
print(f"Lookalike recommendations for {len(recommendations_df)} customers saved to {output_path}")


NameError: name 'merged_data' is not defined