In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three input tables, in order, from CSV files resolved against the
# current working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Task 1: Exploratory Data Analysis (EDA)
def perform_eda(customers_df=None, products_df=None, transactions_df=None):
    """Print a quick exploratory overview of the three datasets.

    For each table the ``DataFrame.info`` report and the ``DataFrame.describe``
    statistics are printed. These are followed by the five most frequent
    values of the customers' ``Region`` column and the five product
    categories with the largest summed ``TotalValue``.

    Args:
        customers_df: Customer table; must contain a ``Region`` column.
            Defaults to the module-level ``customers`` frame.
        products_df: Product table; must contain ``ProductID`` and
            ``Category`` columns. Defaults to the module-level ``products``
            frame.
        transactions_df: Transaction table; must contain ``ProductID`` and
            ``TotalValue`` columns. Defaults to the module-level
            ``transactions`` frame.

    Returns:
        None. Everything is written to stdout.
    """
    if customers_df is None:
        customers_df = customers
    if products_df is None:
        products_df = products
    if transactions_df is None:
        transactions_df = transactions

    overviews = (
        ("Customers Dataset Overview:", customers_df),
        ("\nProducts Dataset Overview:", products_df),
        ("\nTransactions Dataset Overview:", transactions_df),
    )
    for heading, frame in overviews:
        print(heading)
        # info() writes its report to stdout itself and returns None, so
        # wrapping it in print() would add a stray "None" line.
        frame.info()
        print(frame.describe())

    # Example Insights
    print("\nTop 5 Regions by Customer Count:")
    print(customers_df['Region'].value_counts().head())

    print("\nTop 5 Product Categories by Sales:")
    # Only the category lookup is needed from the product table; bringing in
    # the other product columns could collide with transaction column names
    # and cause them to be suffixed by the merge.
    sales_per_category = (
        transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID')
        .groupby('Category')['TotalValue']
        .sum()
        .sort_values(ascending=False)
    )
    print(sales_per_category.head())

# Task 2: Lookalike Model
def build_lookalike_model(customers_df=None, transactions_df=None, top_n=3):
    """Find the most similar customers for every customer with transactions.

    Each customer is described by their summed ``Quantity`` and
    ``TotalValue`` plus a one-hot encoding of ``Region`` (first category
    dropped). The features are standardized and compared with cosine
    similarity.

    Args:
        customers_df: Customer table; must contain ``CustomerID`` and
            ``Region`` columns. Defaults to the module-level ``customers``
            frame.
        transactions_df: Transaction table; must contain ``CustomerID``,
            ``Quantity`` and ``TotalValue`` columns. Defaults to the
            module-level ``transactions`` frame.
        top_n: Maximum number of lookalikes to return per customer.

    Returns:
        dict mapping each CustomerID found in the transactions to a list of
        ``(other_customer_id, similarity)`` tuples, ordered from most to
        least similar. The list holds at most ``top_n`` entries and never
        contains the customer itself.
    """
    if customers_df is None:
        customers_df = customers
    if transactions_df is None:
        transactions_df = transactions

    # Create feature vectors for customers (CustomerID kept as a column)
    spend = (
        transactions_df.groupby('CustomerID')[['Quantity', 'TotalValue']]
        .sum()
        .reset_index()
    )
    # One region per customer: a repeated CustomerID would duplicate rows in
    # the merge and make the similarity labels ambiguous.
    regions = customers_df[['CustomerID', 'Region']].drop_duplicates(subset='CustomerID')
    customer_features = spend.merge(regions, on='CustomerID', how='left')

    # Encode regions as one-hot
    customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

    # Normalize data
    customer_ids = customer_features['CustomerID']
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(customer_features.drop(columns='CustomerID'))

    # Compute similarity
    similarity_matrix = cosine_similarity(normalized_features)
    similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

    # Find the top similar customers for each customer
    lookalike_results = {}
    for customer_id, scores in similarity_df.iterrows():
        # Remove the customer by label rather than by assuming it sorts first:
        # a tie at 1.0 with another customer, or an all-zero feature vector
        # (self-similarity 0), would otherwise leave the customer in its own
        # list and push out a real neighbour.
        nearest = scores.drop(labels=customer_id).nlargest(top_n)
        lookalike_results[customer_id] = list(zip(nearest.index, nearest.values))

    return lookalike_results

# Task 3: Customer Segmentation
def perform_clustering(customers_df=None, transactions_df=None, n_clusters=4):
    """Cluster customers with KMeans, report the DB index and plot the result.

    Each customer is described by their summed ``Quantity`` and
    ``TotalValue`` plus a one-hot encoding of ``Region`` (first category
    dropped). The standardized features are clustered, the Davies-Bouldin
    index of the clustering is printed, and a scatter plot of the first two
    principal components coloured by cluster is shown.

    Args:
        customers_df: Customer table; must contain ``CustomerID`` and
            ``Region`` columns. Defaults to the module-level ``customers``
            frame.
        transactions_df: Transaction table; must contain ``CustomerID``,
            ``Quantity`` and ``TotalValue`` columns. Defaults to the
            module-level ``transactions`` frame.
        n_clusters: Number of KMeans clusters to fit.

    Returns:
        None. The index is printed and the figure is displayed.
    """
    from sklearn.metrics import davies_bouldin_score

    if customers_df is None:
        customers_df = customers
    if transactions_df is None:
        transactions_df = transactions

    # Prepare data (CustomerID kept as a column)
    spend = (
        transactions_df.groupby('CustomerID')[['Quantity', 'TotalValue']]
        .sum()
        .reset_index()
    )
    # One region per customer so a repeated CustomerID is not clustered twice
    regions = customers_df[['CustomerID', 'Region']].drop_duplicates(subset='CustomerID')
    customer_data = spend.merge(regions, on='CustomerID', how='left')
    customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(customer_data.drop(columns='CustomerID'))

    # Apply KMeans clustering. n_init is set explicitly because its default
    # differs between scikit-learn releases; pinning it keeps the clustering
    # the same across versions and avoids the deprecation warning.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(scaled_data)
    customer_data['Cluster'] = clusters

    # Davies-Bouldin Index
    db_index = davies_bouldin_score(scaled_data, clusters)
    print(f"Davies-Bouldin Index: {db_index}")

    # Visualize clusters on the first two principal components
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)
    customer_data['PCA1'] = pca_data[:, 0]
    customer_data['PCA2'] = pca_data[:, 1]

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=customer_data, x='PCA1', y='PCA2', hue='Cluster', palette='viridis')
    plt.title("Customer Segments")
    plt.show()

# Run the three tasks in order: EDA report, lookalike model, segmentation
perform_eda()
lookalike_results = build_lookalike_model()
perform_clustering()

# Persist the lookalike mapping as one row per customer; the row label is
# written under the CustomerID header and each TopN column holds the N-th
# entry of that customer's result list
lookalike_df = pd.DataFrame.from_dict(
    lookalike_results,
    orient='index',
    columns=['Top1', 'Top2', 'Top3'],
)
lookalike_df.to_csv("Lookalike.csv", index_label='CustomerID')
