<a href="https://colab.research.google.com/github/AnshumanAI/As/blob/main/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the transaction data
transaction_data = pd.read_csv("transaction_data.csv")
purchase_behaviour_data = pd.read_csv("purchase_behaviour.csv")

# Merge transaction data with purchase behavior data on loyalty card number
merged_data = transaction_data.merge(purchase_behaviour_data, on="LYLTY_CARD_NBR")

# ---- Step 1: Find Top 3 Most Profitable Products ----
top_products = (
    merged_data.groupby(["PROD_NBR", "PROD_NAME"])["TOT_SALES"]
    .sum()
    .reset_index()
    .sort_values(by="TOT_SALES", ascending=False)
)
top_3_products = top_products.head(3)
print("\nTop 3 Most Profitable Products:\n", top_3_products)

# ---- Step 2: Cluster Customers Using K-Means ----
customer_data = (
    merged_data.groupby(["LYLTY_CARD_NBR", "LIFESTAGE", "PREMIUM_CUSTOMER"])
    .agg({"TOT_SALES": "sum", "PROD_QTY": "sum"})
    .reset_index()
)

# Convert categorical columns into numerical format
customer_data_encoded = pd.get_dummies(customer_data, columns=["LIFESTAGE", "PREMIUM_CUSTOMER"], drop_first=True)

# Standardize the data
scaler = StandardScaler()
customer_data_scaled = scaler.fit_transform(customer_data_encoded.drop(columns=["LYLTY_CARD_NBR"]))

# Apply K-Means clustering to segment customers
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
customer_data_encoded["Cluster"] = kmeans.fit_predict(customer_data_scaled)

# Display the clustered customer segments
print("\nCustomer Segments from Clustering:\n", customer_data_encoded.groupby("Cluster").mean())

# ---- Step 3: Predict Customer Loyalty Using Machine Learning ----
X = customer_data_encoded.drop(columns=["LYLTY_CARD_NBR", "Cluster"])
y = customer_data_encoded["Cluster"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Generate classification report
classification_rep = classification_report(y_test, y_pred)
print("\nClassification Report:\n", classification_rep)

# ---- Step 4: Analyze Feature Importance ----
feature_importance = pd.DataFrame({"Feature": X.columns, "Importance": rf_model.feature_importances_})
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

print("\nFeature Importance for Customer Segments:\n", feature_importance)


Top 3 Most Profitable Products:
     PROD_NBR                                 PROD_NAME  TOT_SALES
3          4          Dorito Corn Chp     Supreme 380g    40352.0
13        14    Smiths Crnkle Chip  Orgnl Big Bag 380g    36367.6
15        16  Smiths Crinkle Chips Salt & Vinegar 330g    34804.2

Customer Segments from Clustering:
          LYLTY_CARD_NBR  TOT_SALES   PROD_QTY  LIFESTAGE_NEW FAMILIES  \
Cluster                                                                 
0         136752.860635  14.632036   3.742454                0.063577   
1         136916.905849  24.134610   6.188166                0.000000   
2         134573.811750  51.257442  13.616850                0.006650   

         LIFESTAGE_OLDER FAMILIES  LIFESTAGE_OLDER SINGLES/COUPLES  \
Cluster                                                              
0                        0.111865                         0.218705   
1                        0.000000                         0.000000   
2                  