In [5]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [6]:
## Load the dataset
# Read the raw transaction sheet. The path uses a forward slash so it resolves
# on every operating system (the previous backslash form only worked on
# Windows) and matches the other relative paths used in this notebook.
data = pd.read_excel("data/tdatanew.xlsx")
data

Unnamed: 0,transaction_id,customer_id,product_id,product_name,category,purchase_date,quantity,price_per_unit,total_amount,payment_method,store_location,Mobile
0,T000290,C0259,P004,Cricket Cap,Cricket,2024-07-23,1,300,300,Debit Card,Kolkata,9400950651
1,T000223,C0226,P019,Shuttlecock,Badminton,2025-03-11,1,150,150,Debit Card,Delhi,9400950651
2,T000213,C0083,P019,Shuttlecock,Badminton,2024-09-06,1,150,150,Debit Card,Delhi,9400950651
3,T000458,C0263,P014,Yoga Mat,Yoga,2025-05-18,1,700,700,Credit Card,Bangalore,9400950651
4,T000308,C0132,P027,Tennis Balls,Tennis,2025-03-09,1,250,250,Debit Card,Kolkata,9400950651
...,...,...,...,...,...,...,...,...,...,...,...,...
2495,T000328,C0023,P015,Yoga Bottle,Yoga,2024-01-04,1,350,350,Debit Card,Mumbai,9400950651
2496,T000220,C0137,P001,Cricket Ball,Cricket,2024-08-30,1,200,200,Debit Card,Delhi,9400950651
2497,T000227,C0100,P020,Badminton Net,Badminton,2024-09-16,3,1200,3600,Cash,Pune,9400950651
2498,T000259,C0234,P005,Football,Football,2024-11-06,1,1200,1200,UPI,Kolkata,9400950651


In [7]:
# Per-customer aggregates, written as
#   output column -> (source column, aggregation)
customer_aggregations = {
    'Monetary': ('total_amount', 'sum'),                  # total spend
    'total_quantity': ('quantity', 'sum'),                # total units bought
    'Frequency': ('transaction_id', 'count'),             # number of transaction rows
    'num_unique_products': ('product_id', 'nunique'),     # distinct products bought
    'last_purchase_date': ('purchase_date', 'max'),       # most recent purchase
    'avg_price_per_unit': ('price_per_unit', 'mean'),     # mean unit price across rows
    'store_visit_frequency': ('purchase_date', 'nunique'),  # distinct purchase dates
}

# One row per customer_id, with customer_id restored as an ordinary column.
group = data.groupby('customer_id').agg(**customer_aggregations).reset_index()
group

Unnamed: 0,customer_id,Monetary,total_quantity,Frequency,num_unique_products,last_purchase_date,avg_price_per_unit,store_visit_frequency
0,C0000,26550,23,15,12,2025-01-07,1023.333333,3
1,C0001,5750,6,5,5,2024-05-06,790.000000,1
2,C0003,9250,7,5,5,2025-03-17,1130.000000,1
3,C0004,8500,6,5,5,2024-04-29,1460.000000,1
4,C0006,8250,8,5,5,2024-01-12,1150.000000,1
...,...,...,...,...,...,...,...,...
282,C0395,12000,8,5,5,2025-05-29,1340.000000,1
283,C0396,7350,6,5,5,2024-08-10,1230.000000,1
284,C0397,11500,11,10,10,2025-04-07,1030.000000,2
285,C0398,7100,9,5,5,2024-04-15,780.000000,1


In [8]:
# Earliest purchase date per customer, exposed as `membership_start_date`.
membership_start = (
    data.groupby('customer_id')['purchase_date']
    .min()
    .rename('membership_start_date')
    .reset_index()
)

# Attach the start date to the per-customer aggregates.
# NOTE: this rebinds `data` from the raw transaction table to the
# per-customer feature table; the raw transactions are no longer reachable
# through `data` after this cell runs.
data = group.merge(membership_start, how='left', on='customer_id')
data

Unnamed: 0,customer_id,Monetary,total_quantity,Frequency,num_unique_products,last_purchase_date,avg_price_per_unit,store_visit_frequency,membership_start_date
0,C0000,26550,23,15,12,2025-01-07,1023.333333,3,2024-05-26
1,C0001,5750,6,5,5,2024-05-06,790.000000,1,2024-05-06
2,C0003,9250,7,5,5,2025-03-17,1130.000000,1,2025-03-17
3,C0004,8500,6,5,5,2024-04-29,1460.000000,1,2024-04-29
4,C0006,8250,8,5,5,2024-01-12,1150.000000,1,2024-01-12
...,...,...,...,...,...,...,...,...,...
282,C0395,12000,8,5,5,2025-05-29,1340.000000,1,2025-05-29
283,C0396,7350,6,5,5,2024-08-10,1230.000000,1,2024-08-10
284,C0397,11500,11,10,10,2025-04-07,1030.000000,2,2024-04-29
285,C0398,7100,9,5,5,2024-04-15,780.000000,1,2024-04-15


In [9]:

# Coerce both date columns to datetime64 up front. They are converted to plain
# `date` objects at the end of this cell, so without this coercion the `.dt`
# accessor would fail if the cell were executed a second time.
last_purchase = pd.to_datetime(data['last_purchase_date'])
membership_start_dt = pd.to_datetime(data['membership_start_date'])

# Most recent purchase across all customers; anchors Active_days.
reference_date = last_purchase.max()
# Midnight of the current day. Taken from pandas rather than the `datetime`
# name, because a later cell rebinds `datetime` to the module
# (`import datetime`), which would break `datetime.today()` on a re-run.
today = pd.Timestamp.today().normalize()

# Days between the customer's first purchase and the dataset's last purchase.
data['Active_days'] = (reference_date - membership_start_dt).dt.days.astype(int)

# Active days per distinct purchase date; falls back to Active_days itself
# when a customer has no recorded purchase dates.
visits = data['store_visit_frequency']
data['Avg_purchase_gap_days'] = (data['Active_days'] / visits).where(
    visits > 0, data['Active_days']
)

# Spend per unit purchased.
data['average_purchase_value'] = data['Monetary'] / data['total_quantity']

# NOTE(review): Recency is measured from the day the notebook is run, whereas
# Active_days is measured from `reference_date`. Recency therefore changes
# every day the notebook is re-executed - confirm this is intended.
data['Recency'] = (today - last_purchase).dt.days

# Store the dates as plain `date` objects for display.
data['last_purchase_date'] = last_purchase.dt.date
data['membership_start_date'] = membership_start_dt.dt.date

data


Unnamed: 0,customer_id,Monetary,total_quantity,Frequency,num_unique_products,last_purchase_date,avg_price_per_unit,store_visit_frequency,membership_start_date,Active_days,Avg_purchase_gap_days,average_purchase_value,Recency
0,C0000,26550,23,15,12,2025-01-07,1023.333333,3,2024-05-26,370,123.333333,1154.347826,154
1,C0001,5750,6,5,5,2024-05-06,790.000000,1,2024-05-06,390,390.000000,958.333333,400
2,C0003,9250,7,5,5,2025-03-17,1130.000000,1,2025-03-17,75,75.000000,1321.428571,85
3,C0004,8500,6,5,5,2024-04-29,1460.000000,1,2024-04-29,397,397.000000,1416.666667,407
4,C0006,8250,8,5,5,2024-01-12,1150.000000,1,2024-01-12,505,505.000000,1031.250000,515
...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,C0395,12000,8,5,5,2025-05-29,1340.000000,1,2025-05-29,2,2.000000,1500.000000,12
283,C0396,7350,6,5,5,2024-08-10,1230.000000,1,2024-08-10,294,294.000000,1225.000000,304
284,C0397,11500,11,10,10,2025-04-07,1030.000000,2,2024-04-29,397,198.500000,1045.454545,64
285,C0398,7100,9,5,5,2024-04-15,780.000000,1,2024-04-15,411,411.000000,788.888889,421


In [10]:
# Create a more comprehensive churn label
# A customer is flagged as churned (1) only when all three conditions hold:
#   * Recency exceeds their average gap between purchase dates,
#   * total spend is below the median spend across customers,
#   * transaction count is below the median count across customers.
# The medians are computed once and the rule is evaluated column-wise, instead
# of recomputing both medians for every row inside `apply`.
monetary_median = data['Monetary'].median()
frequency_median = data['Frequency'].median()

data['churn'] = (
    (data['Recency'] > data['Avg_purchase_gap_days'])
    & (data['Monetary'] < monetary_median)
    & (data['Frequency'] < frequency_median)
).astype('int64')

print("Churn Distribution:")
print(data['churn'].value_counts(normalize=True))

# NOTE(review): the recorded output of this cell shows every customer labelled
# 0. With a single class the downstream model has nothing to learn and will
# report perfect accuracy trivially. Possibly no customer has a Frequency
# strictly below the median - confirm and revisit the rule.
if data['churn'].nunique() < 2:
    print("WARNING: churn label contains a single class; "
          "a classifier trained on it cannot learn to separate churners.")


Churn Distribution:
churn
0    1.0
Name: proportion, dtype: float64


In [11]:
# Model inputs: the four per-customer features fed to the network.
feature_columns = ['Monetary', 'Frequency', 'Avg_purchase_gap_days', 'Recency']
x = data[feature_columns]

# Target: the churn label column.
y = data['churn']

x

Unnamed: 0,Monetary,Frequency,Avg_purchase_gap_days,Recency
0,26550,15,123.333333,154
1,5750,5,390.000000,400
2,9250,5,75.000000,85
3,8500,5,397.000000,407
4,8250,5,505.000000,515
...,...,...,...,...
282,12000,5,2.000000,12
283,7350,5,294.000000,304
284,11500,10,198.500000,64
285,7100,5,411.000000,421


In [12]:

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_train

Unnamed: 0,Monetary,Frequency,Avg_purchase_gap_days,Recency
140,13650,15,131.000000,158
25,3800,5,232.000000,242
132,4550,5,387.000000,397
219,4700,5,44.000000,54
66,6850,5,199.000000,209
...,...,...,...,...
188,4750,5,507.000000,517
71,17150,15,158.333333,317
106,13000,10,155.500000,265
270,5950,5,215.000000,225


In [13]:
# Standardize the features. The scaler is fitted on the training split only
# and those same statistics are applied to the test split. Calling
# `fit_transform` on the test split (as before) refits the scaler on test
# data, which scales the two splits inconsistently and leaves the scaler
# object holding test-set statistics.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train


array([[ 4.45990091e-01,  1.29250434e+00, -6.16133749e-01,
        -4.60325649e-01],
       [-1.02681212e+00, -7.70098406e-01,  1.98789193e-01,
         1.30784110e-01],
       [-9.14669817e-01, -7.70098406e-01,  1.44941351e+00,
         1.22152236e+00],
       [-8.92241357e-01, -7.70098406e-01, -1.31809708e+00,
        -1.19217583e+00],
       [-5.70766764e-01, -7.70098406e-01, -6.74727586e-02,
        -1.01437581e-01],
       [ 4.08609324e-01, -7.70098406e-01, -1.34230271e+00,
        -1.21328689e+00],
       [-4.66100617e-01, -7.70098406e-01,  2.29661063e+00,
         1.96040955e+00],
       [ 7.96585771e-02,  2.61202968e-01, -9.57126626e-02,
         1.44858152e-01],
       [-5.78242917e-01, -7.70098406e-01,  2.71406089e-01,
         1.94117298e-01],
       [-3.68910623e-01,  2.61202968e-01, -1.22127455e+00,
        -7.48843508e-01],
       [-6.53004450e-01, -7.70098406e-01,  1.74583561e-01,
         1.09673047e-01],
       [-7.65146751e-01, -7.70098406e-01,  4.00502792e-01,
      

In [14]:
# Serialize the scaler object to disk with pickle.
# NOTE(review): assumes the `Models` directory already exists - confirm.
with open('Models/churn_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [15]:
# Display the per-customer feature table again, now including the churn column.
data

Unnamed: 0,customer_id,Monetary,total_quantity,Frequency,num_unique_products,last_purchase_date,avg_price_per_unit,store_visit_frequency,membership_start_date,Active_days,Avg_purchase_gap_days,average_purchase_value,Recency,churn
0,C0000,26550,23,15,12,2025-01-07,1023.333333,3,2024-05-26,370,123.333333,1154.347826,154,0
1,C0001,5750,6,5,5,2024-05-06,790.000000,1,2024-05-06,390,390.000000,958.333333,400,0
2,C0003,9250,7,5,5,2025-03-17,1130.000000,1,2025-03-17,75,75.000000,1321.428571,85,0
3,C0004,8500,6,5,5,2024-04-29,1460.000000,1,2024-04-29,397,397.000000,1416.666667,407,0
4,C0006,8250,8,5,5,2024-01-12,1150.000000,1,2024-01-12,505,505.000000,1031.250000,515,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,C0395,12000,8,5,5,2025-05-29,1340.000000,1,2025-05-29,2,2.000000,1500.000000,12,0
283,C0396,7350,6,5,5,2024-08-10,1230.000000,1,2024-08-10,294,294.000000,1225.000000,304,0
284,C0397,11500,11,10,10,2025-04-07,1030.000000,2,2024-04-29,397,198.500000,1045.454545,64,0
285,C0398,7100,9,5,5,2024-04-15,780.000000,1,2024-04-15,411,411.000000,788.888889,421,0


### ANN Implementation

In [16]:
import tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [17]:
# Shape tuple passed to the network's Input layer: one entry per feature column.
(x_train.shape[1],)

(4,)

In [18]:
# Fully connected binary classifier:
#   input (one value per feature) -> Dense 64 relu -> Dense 32 relu -> Dense 1 sigmoid
n_features = x_train.shape[1]

layer_stack = [Input(shape=(n_features,))]                                   # Input layer
layer_stack += [Dense(units, activation='relu') for units in (64, 32)]       # Hidden layers 1 and 2
layer_stack.append(Dense(1, activation='sigmoid'))                           # Output layer

churn_model = Sequential(layer_stack)

In [19]:
# Print the layer-by-layer architecture and parameter counts.
churn_model.summary()

In [20]:
# Adam optimizer with an explicit learning rate of 0.01.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

# Binary cross-entropy loss object, displayed below for inspection.
# Note that the compile cell passes the loss by its string name rather than
# this object.
loss = tf.keras.losses.BinaryCrossentropy()

loss

<LossFunctionWrapper(<function binary_crossentropy at 0x0000028DEF8500D0>, kwargs={'from_logits': False, 'label_smoothing': 0.0, 'axis': -1})>

In [21]:

# Configure training: Adam optimizer defined above, binary cross-entropy loss,
# and accuracy reported as the tracked metric.
churn_model.compile(
    optimizer=opt,
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [22]:
# Each run logs into its own timestamped sub-directory of logs/fit/.
# `datetime` here is the module imported in the ANN import cell, not the
# `datetime` class imported at the top of the notebook.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp

# TensorBoard logging callback; histogram_freq=1 records histograms every epoch.
tensorflow_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [23]:
## Set up Early Stopping
# Stop once validation loss has not improved for 15 consecutive epochs and
# roll the model back to the weights from its best epoch.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)


In [24]:
### Train the model
# Train for at most 100 epochs. The held-out split is used as validation
# data, so it also drives the early-stopping decision.
training_callbacks = [tensorflow_callback, early_stopping_callback]

history = churn_model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=100,
    callbacks=training_callbacks,
)

   

Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 75ms/step - accuracy: 0.7606 - loss: 0.4903 - val_accuracy: 1.0000 - val_loss: 0.0604
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 1.0000 - loss: 0.0365 - val_accuracy: 1.0000 - val_loss: 0.0029
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0014 - val_accuracy: 1.0000 - val_loss: 3.9257e-04
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 1.0000 - loss: 2.0614e-04 - val_accuracy: 1.0000 - val_loss: 1.2474e-04
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 1.0000 - loss: 6.7476e-05 - val_accuracy: 1.0000 - val_loss: 6.7400e-05
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 1.0000 - loss: 3.6905e-05 - val_accuracy: 1.0000 - val_loss: 4.8885e-05
Epoch 7/100


In [25]:
# Per-epoch metric lists recorded during training.
epoch_metrics = history.history
train_accuracy, val_accuracy = epoch_metrics['accuracy'], epoch_metrics['val_accuracy']

# Report the values from the last epoch that ran.
print("train_accuracy :", train_accuracy[-1])
print("val_accuracy :", val_accuracy[-1])

train_accuracy : 1.0
val_accuracy : 1.0


In [26]:
# Persist the trained network to disk.
# NOTE(review): the `.h5` suffix presumably selects Keras' HDF5 format - confirm
# it matches what the code that loads this file expects.
churn_model.save('Models/churn_model.h5')



In [27]:
## Load the TensorBoard notebook extension (provides the %tensorboard magic)
%load_ext tensorboard

In [28]:
# Open TensorBoard inline, pointed at the parent folder that holds the
# timestamped per-run log directories.
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 8744), started 3 days, 13:47:37 ago. (Use '!kill 8744' to kill it.)

In [206]:
### Load the pickle file
