<a href="https://colab.research.google.com/github/AdithyaLakshmi23/assignmentzeotap/blob/main/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TASK1**

In [None]:
# Importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets (the three CSV files are read from the current working directory)
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Displaying data info and the first few rows for each dataset.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line
# after every report.
datasets = (
    ("Customers", customers),
    ("Products", products),
    ("Transactions", transactions),
)
for label, frame in datasets:
    print(f"{label} Dataset Info:")
    frame.info()
    print(frame.head(), "\n")

# Missing values check: per-column count of null entries in each dataset
for label, frame in datasets:
    print(f"Missing values in {label}:", frame.isnull().sum())

# Descriptive statistics
print("Transactions Descriptive Statistics:")
print(transactions.describe())

# EDA on Customers
# Convert the SignupDate column in place with pd.to_datetime.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# Tally how many customers fall into each region.
customer_region_count = customers['Region'].value_counts()
print("Customer count by region:\n", customer_region_count)

# Visualization: Customer distribution by region
plt.figure(figsize=(8, 5))
region_axes = sns.barplot(
    x=customer_region_count.index,
    y=customer_region_count.values,
)
region_axes.set_title("Customer Distribution by Region")
region_axes.set_xlabel("Region")
region_axes.set_ylabel("Number of Customers")
plt.show()

# EDA on Products
# Tally how many products belong to each category.
product_category_count = products['Category'].value_counts()
print("Product count by category:\n", product_category_count)

# Visualization: Product distribution by category
plt.figure(figsize=(8, 5))
category_axes = sns.barplot(
    x=product_category_count.index,
    y=product_category_count.values,
)
category_axes.set(
    title="Product Distribution by Category",
    xlabel="Category",
    ylabel="Number of Products",
)
plt.show()

# EDA on Transactions
# Convert TransactionDate once, then derive a monthly period column from it.
transaction_dates = pd.to_datetime(transactions['TransactionDate'])
transactions['TransactionDate'] = transaction_dates
transactions['YearMonth'] = transaction_dates.dt.to_period('M')

# Total sales over time: sum of TotalValue for each YearMonth
sales_over_time = transactions.groupby('YearMonth')['TotalValue'].sum()

plt.figure(figsize=(10, 6))
sales_axes = sales_over_time.plot(kind='line', marker='o')
sales_axes.set_title("Total Sales Over Time")
sales_axes.set_xlabel("Year-Month")
sales_axes.set_ylabel("Total Sales (USD)")
sales_axes.grid()
plt.show()

# Top 5 products by total sales
# Sum TotalValue per product, then keep the five largest totals.
sales_by_product = transactions.groupby('ProductID')['TotalValue'].sum()
top_products = sales_by_product.nlargest(5)

# Look up the names of just those products.
is_top_product = products['ProductID'].isin(top_products.index)
top_product_names = products.loc[is_top_product, ['ProductID', 'ProductName']]

print("Top 5 Products by Total Sales:")
print(top_products.reset_index().merge(top_product_names, on='ProductID'))

# Transactions by region
# Attach customer attributes to every transaction, then total sales per region.
transactions_customers = pd.merge(transactions, customers, on='CustomerID')
region_sales = transactions_customers.groupby('Region')['TotalValue'].sum()

plt.figure(figsize=(8, 5))
region_sales_axes = sns.barplot(x=region_sales.index, y=region_sales.values)
region_sales_axes.set_title("Total Sales by Region")
region_sales_axes.set_xlabel("Region")
region_sales_axes.set_ylabel("Total Sales (USD)")
plt.show()


**TASK2**

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Data Preparation
# Enrich each transaction first with its customer's columns, then with its
# product's columns (default inner joins on the shared ID columns).
data = pd.merge(transactions, customers, on="CustomerID")
data = pd.merge(data, products, on="ProductID")

# Feature Engineering
# Customer-level spend aggregates, indexed by CustomerID. Only customers that
# appear in the merged transaction data get a row.
customer_features = data.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    avg_spent=("TotalValue", "mean"),
    num_transactions=("TransactionID", "count"),
)

# One-hot encode each customer's region.
# NOTE(review): assumes CustomerID is unique in Customers.csv; duplicate IDs
# would duplicate rows in the join below.
customer_region = pd.get_dummies(
    customers.set_index("CustomerID")["Region"], prefix="Region", dtype=float
)

# Category features: number of purchases per product category for each customer.
# The dummies are built from transaction-level rows, so they must be summed per
# customer before joining. Joining them unaggregated (one row per transaction,
# non-unique CustomerID index) repeats every customer once per transaction and
# leaves the similarity matrix with duplicated labels.
product_categories = (
    pd.get_dummies(data["Category"], prefix="Category", dtype=float)
    .groupby(data["CustomerID"])
    .sum()
)

# Final feature table: exactly one row per customer, indexed by CustomerID.
customer_features = customer_features.join(customer_region).join(product_categories)

# Standardize features
scaler = StandardScaler()
scaler.fit(customer_features)
scaled_features = scaler.transform(customer_features)

# Compute Cosine Similarity
# Pairwise similarity between all rows, labelled on both axes with the
# feature table's index.
similarity_matrix = cosine_similarity(scaled_features)
feature_index = customer_features.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=feature_index,
    columns=feature_index,
)

# Generate Recommendations
# For each of the first 20 customers listed in Customers.csv, keep the three
# most similar other customers as [cust_id, score] pairs.
recommendations = {}
for customer in customers["CustomerID"][:20]:
    if customer not in similarity_df.columns:
        # A customer without a feature row (e.g. no transactions) has no
        # similarity scores; record an empty list instead of raising KeyError.
        recommendations[customer] = []
        continue

    similar_customers = (
        similarity_df[customer]
        # Remove the customer's own entry by label. Relying on it sorting first
        # is unsafe when another customer ties with the self-similarity score.
        .drop(labels=customer)
        .nlargest(3)
        .reset_index()
    )
    similar_customers.columns = ["cust_id", "score"]
    recommendations[customer] = similar_customers.values.tolist()

# Save recommendations to Lookalike.csv (one row per customer; the list of
# [cust_id, score] pairs is written as its string representation)
recommendations_df = pd.DataFrame(
    [{"cust_id": k, "similar_customers": v} for k, v in recommendations.items()]
)
recommendations_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been generated with top 3 recommendations for C0001–C0020.")


**TASK3**

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score, silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets
# Summarise spending per customer, keeping CustomerID as a regular column.
transactions_agg = transactions.groupby("CustomerID", as_index=False).agg(
    total_spent=("TotalValue", "sum"),
    avg_spent=("TotalValue", "mean"),
    num_transactions=("TransactionID", "count"),
)
# Left join so customers without any transactions are still kept.
data = pd.merge(customers, transactions_agg, how="left", on="CustomerID")

# Fill missing values (if any customers have no transactions)
data = data.fillna(0)

# Encode region as one-hot (first level dropped)
data = pd.get_dummies(data, columns=["Region"], drop_first=True)

# Drop non-numeric columns for clustering
data_numeric = data.drop(["CustomerID", "CustomerName", "SignupDate"], axis=1)

# Standardize features
scaler = StandardScaler()
data_scaled = scaler.fit(data_numeric).transform(data_numeric)

# Clustering using K-Means
# Try every cluster count from 2 to 10 and record both quality metrics.
cluster_range = range(2, 11)
db_scores, silhouette_scores = [], []

for n_clusters in cluster_range:
    candidate = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = candidate.fit_predict(data_scaled)

    # Calculate Davies-Bouldin Index and Silhouette Score
    db_scores.append(davies_bouldin_score(data_scaled, labels))
    silhouette_scores.append(silhouette_score(data_scaled, labels))

# Optimal number of clusters: the k whose Davies-Bouldin index is smallest
best_position = np.argmin(db_scores)
optimal_k = cluster_range[best_position]
print(f"Optimal Number of Clusters (based on DB Index): {optimal_k}")

# Fit K-Means with optimal clusters and store each customer's label
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(data_scaled)
data["Cluster"] = kmeans.labels_

# Visualize Clusters (2D scatter plot using first 2 features)
# Only the first two standardized columns are drawn; points are coloured by
# their assigned cluster.
plt.figure(figsize=(8, 6))
segment_axes = sns.scatterplot(
    x=data_scaled[:, 0],
    y=data_scaled[:, 1],
    hue=data["Cluster"],
    palette="viridis",
    s=50,
)
segment_axes.set_title("Customer Segments")
segment_axes.set_xlabel("Feature 1")
segment_axes.set_ylabel("Feature 2")
segment_axes.legend(title="Cluster")
plt.show()

# Evaluate Clusters
# Report both metrics for the clustering that was actually selected.
# max(silhouette_scores) may belong to a different k than optimal_k, which
# would mix numbers from two different models in one report.
selected_position = cluster_range.index(optimal_k)
print(f"Davies-Bouldin Index: {db_scores[selected_position]:.4f}")
print(f"Silhouette Score: {silhouette_scores[selected_position]:.4f}")

# Save results: one row per customer with its assigned cluster label
data[["CustomerID", "Cluster"]].to_csv("Customer_Segments.csv", index=False)
print("Customer_Segments.csv has been saved.")
