In [32]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [33]:
# Read the raw CSV into `df` and print a quick structural overview of it.
df = pd.read_csv('sales_and_customer_insights.csv')
print(f"Dataset shape: {df.shape}")

# Each entry pairs a heading with the summary printed underneath it.
overview = [
    ("First 5 rows:", df.head()),
    ("\nData types:", df.dtypes),
    ("\nMissing values:", df.isna().sum()),
]
for heading, summary in overview:
    print(heading)
    print(summary)

Dataset shape: (10000, 15)
First 5 rows:
   Customer_ID   Product_ID Transaction_ID  Purchase_Frequency  \
0  CUST_9HOS83  PROD_IK97D1   TRANS_II1DZG                  17   
1  CUST_AJU17N  PROD_UNN7KP   TRANS_9HJF7I                  10   
2  CUST_11XNYF  PROD_0XEW2W   TRANS_OT96OM                   3   
3  CUST_IGH8G3  PROD_3IIAJN   TRANS_45V00G                  12   
4  CUST_OK6PUM  PROD_VMIWD2   TRANS_ZAK760                  18   

   Average_Order_Value Most_Frequent_Category  Time_Between_Purchases  \
0               172.57            Electronics                      45   
1                64.89               Clothing                       6   
2               120.38                 Sports                      23   
3                70.34               Clothing                       5   
4                42.39            Electronics                      10   

          Region  Churn_Probability  Lifetime_Value Launch_Date  \
0  South America               0.98          952.81  202

In [34]:
# Remove the three ID columns. errors='ignore' keeps this cell re-runnable
# once the columns have already been dropped from `df`.
df = df.drop(columns=['Customer_ID', 'Product_ID', 'Transaction_ID'], errors='ignore')

# Rows without a Retention_Strategy label cannot be used for supervised
# training. Drop them before imputing so the imputers below never invent
# target values. reset_index gives a fresh, contiguous frame to assign into.
df = df.dropna(subset=['Retention_Strategy']).reset_index(drop=True)

# Object (string) columns: fill gaps with the most common value per column.
object_cols = df.select_dtypes(include=['object']).columns
imputer = SimpleImputer(strategy='most_frequent')
if len(object_cols) > 0:  # the imputer rejects an empty column selection
    df[object_cols] = imputer.fit_transform(df[object_cols])

# Numeric columns: fill gaps with the column mean.
numeric_cols = df.select_dtypes(include=[np.number]).columns
num_imputer = SimpleImputer(strategy='mean')
if len(numeric_cols) > 0:
    df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

In [35]:
# Integer-encode every object-typed feature column. Each column gets its own
# encoder, stored in `feature_encoders`, so the category <-> integer mapping
# of every feature stays available (e.g. feature_encoders[col].classes_)
# instead of being overwritten by the next column's fit.
feature_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if col == 'Retention_Strategy':
        continue  # the target is encoded separately below
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# `le` is the encoder for the target; le.inverse_transform(...) maps predicted
# class ids back to the original Retention_Strategy values.
le = LabelEncoder()
df['Retention_Strategy'] = le.fit_transform(df['Retention_Strategy'])

# Ratio features. The +1 in the denominator avoids division by zero when
# Purchase_Frequency is 0.
df['Purchase_Value_Ratio'] = df['Average_Order_Value'] / (df['Purchase_Frequency'] + 1)
df['Lifetime_Value_per_Frequency'] = df['Lifetime_Value'] / (df['Purchase_Frequency'] + 1)

# Standardize every numeric column except the target. At this point that
# includes the integer-encoded categorical features.
# NOTE(review): the scaler is fit on the full frame, before the train/test
# split done later in this notebook, so test rows influence the scaling
# statistics. Fitting on the training split only would remove that leakage.
scaler = StandardScaler()
numerical_cols = df.select_dtypes(include=[np.number]).columns.drop('Retention_Strategy')
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [36]:
# Separate the label from the features, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the split repeatable.
target_col = 'Retention_Strategy'
y = df[target_col]
X = df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [37]:
# Logistic regression with max_iter set to 1000, fitted on the training split.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Accuracy on the held-out split.
lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

Logistic Regression Accuracy: 0.3420


In [38]:
# Random forest: 200 trees, each limited to depth 20, with a fixed seed.
rf_params = {'n_estimators': 200, 'max_depth': 20, 'random_state': 42}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Accuracy on the held-out split.
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

Random Forest Accuracy: 0.3395


In [39]:
# RBF-kernel support vector classifier with C=10, fitted on the training split.
svm = SVC(kernel='rbf', C=10, random_state=42).fit(X_train, y_train)

# Accuracy on the held-out split.
svm_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_pred)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

SVM Accuracy: 0.3375


In [40]:
# Gradient-boosted trees: 200 boosting rounds, tree depth capped at 6.
xgb_params = {'n_estimators': 200, 'max_depth': 6, 'random_state': 42}
xgb = XGBClassifier(**xgb_params).fit(X_train, y_train)

# Accuracy on the held-out split.
xgb_pred = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")

XGBoost Accuracy: 0.3305


In [41]:
# Majority-vote ensemble over the four models configured in the cells above.
# VotingClassifier is imported in the notebook's top import cell.
# Per the scikit-learn docs, fit() trains clones of the supplied estimators,
# so the already-fitted lr / rf / svm / xgb objects are left untouched.
voters = [('lr', lr), ('rf', rf), ('svm', svm), ('xgb', xgb)]

# voting='hard' predicts the class most voters agree on. With four voters a
# tie is possible; scikit-learn resolves ties in ascending class-label order.
ensemble = VotingClassifier(estimators=voters, voting='hard')
ensemble.fit(X_train, y_train)

# Accuracy on the held-out split.
ensemble_pred = ensemble.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")

Ensemble Accuracy: 0.3400


In [42]:
# Test accuracy of every model, keyed by display name (insertion order is kept).
results = {
    'Logistic Regression': lr_accuracy,
    'Random Forest': rf_accuracy,
    'SVM': svm_accuracy,
    'XGBoost': xgb_accuracy,
    'Ensemble': ensemble_accuracy,
}
models = list(results)
accuracies = list(results.values())

# np.argmax returns the first maximum, so a tie goes to the earliest model listed.
best_model_idx = np.argmax(accuracies)
best_model, best_accuracy = models[best_model_idx], accuracies[best_model_idx]

print("Model Accuracies:")
for model, acc in results.items():
    print(f"{model}: {acc:.4f}")

print(f"\nBest Model: {best_model} with Accuracy: {best_accuracy:.4f}")

Model Accuracies:
Logistic Regression: 0.3420
Random Forest: 0.3395
SVM: 0.3375
XGBoost: 0.3305
Ensemble: 0.3400

Best Model: Logistic Regression with Accuracy: 0.3420


In [43]:
# Final summary: assemble the report lines, then emit them in a single print.
report_lines = [
    "=== Retention Strategy Prediction ===",
    "Dataset: sales_and_customer_insights.csv (full dataset)",
    f"Best Model: {best_model} with Accuracy: {best_accuracy:.4f}",
    "Predicting Retention_Strategy with multiple models.",
]
print("\n".join(report_lines))

=== Retention Strategy Prediction ===
Dataset: sales_and_customer_insights.csv (full dataset)
Best Model: Logistic Regression with Accuracy: 0.3420
Predicting Retention_Strategy with multiple models.
