## Load the cleaned data

In [3]:
import pandas as pd
# Location of the cleaned dataset that the rest of the notebook works on.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot
# load its input on another machine without editing it — consider a
# configurable data directory.
data_path = "/Users/agalyaayyadurai/PycharmProjects/Customer-Value-Insights/data/processed/cleaned_data.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

# Plain-text preview of the first five rows.
print(data.head(n=5))

     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  AcceptedCmp3  \
0  04-09-2012       58       635  ...                  7             0   
1  08-03-2014       38        11  ...                  5             0   
2  21-08-2013       26       426  ...                  4             0   
3  10-02-2014       26        11  ...                  6             0   
4  19-01-2014       94       173  ...                  5             0   

   AcceptedCmp4  AcceptedCmp5  AcceptedCmp1  AcceptedCmp2  Complain  \
0             0

## Create new Features

In [4]:
# Spend columns that are added together to form Total_Spend.
spend_features = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]

# Add all derived columns in one step; they are appended in the order listed.
data = data.assign(
    # Years between Year_Birth and the hard-coded reference year 2024.
    Age=2024 - data['Year_Birth'],
    # Sum of the Kidhome and Teenhome counts.
    Total_Children=data['Kidhome'] + data['Teenhome'],
    # Row-wise sum over every column named in spend_features.
    Total_Spend=data[spend_features].sum(axis=1),
    # Recency split into four equal-width intervals, labelled in ascending order.
    Recency_Group=pd.cut(
        data['Recency'],
        bins=4,
        labels=['Very Recent', 'Recent', 'Less Recent', 'Not Recent'],
    ),
)

print(data.head())

     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  AcceptedCmp1  AcceptedCmp2  Complain  \
0  04-09-2012       58       635  ...             0             0         0   
1  08-03-2014       38        11  ...             0             0         0   
2  21-08-2013       26       426  ...             0             0         0   
3  10-02-2014       26        11  ...             0             0         0   
4  19-01-2014       94       173  ...             0             0         0   

   Z_CostContact  Z_Revenue  Response  Age  Total_Childr

## One-Hot encode categorical variables

In [6]:
# Replace each categorical column with one indicator column per category.
# NOTE(review): this cell is not re-runnable on its own — once `data` has been
# encoded the source columns are gone and a second run raises KeyError.
categorical_columns = ['Education', 'Marital_Status']
data = pd.get_dummies(data=data, columns=categorical_columns)

# Preview the encoded frame.
print(data.head())

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
     ID  Year_Birth   Income  Kidhome  Teenhome Dt_Customer  Recency  \
0  5524        1957  58138.0        0         0  04-09-2012       58   
1  2174        1954  46344.0        1         1  08-03-2014       38   
2  4141        1965  71613.0        0         0  21-08-2013       26   
3  6182        1984  26646.0        1         0  10-02-2014       26   
4  5324        1981  58293.0        1         0  19-01-2014       94   

   MntWines  MntFruits  MntMeatProducts  ...  Education_Master  Education_PhD  \
0       635         88              546  ...             False          False   
1        11          1                6  ...             False          False   
2       426         49              127  ...             False          False   
3        11          4               20  ...             False          False   
4       

## Define features and target variable

In [7]:
# Model inputs: the hand-picked numeric columns followed by every one-hot
# indicator column produced for Education and Marital_Status.
numeric_columns = [
    'Age', 'Total_Spend', 'Total_Children', 'Income', 'Recency',
    'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
]
education_columns = [name for name in data.columns if name.startswith('Education_')]
marital_columns = [name for name in data.columns if name.startswith('Marital_Status_')]
features = data[numeric_columns + education_columns + marital_columns]

# Column the downstream model is meant to predict.
target = data['Response']

# Preview both objects, features first.
print(features.head(), target.head(), sep='\n')

   Age  Total_Spend  Total_Children   Income  Recency  NumWebPurchases  \
0   67         1617               0  58138.0       58                8   
1   70           27               2  46344.0       38                1   
2   59          776               0  71613.0       26                8   
3   40           53               1  26646.0       26                2   
4   43          422               1  58293.0       94                5   

   NumCatalogPurchases  NumStorePurchases  NumWebVisitsMonth  \
0                   10                  4                  7   
1                    1                  2                  5   
2                    2                 10                  4   
3                    0                  4                  6   
4                    3                  6                  5   

   Education_2n Cycle  ...  Education_Master  Education_PhD  \
0               False  ...             False          False   
1               False  ...             False

## Standardize Features

In [8]:
from sklearn.preprocessing import StandardScaler

# Rescale every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the complete dataset; if a train/test
# split is made later, test-set statistics will already have influenced the
# scaling — confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)

# Show the first five standardized rows (`features` is now the scaler's array
# output rather than the original DataFrame).
print(features[:5])

[[ 0.98534473  1.67941681 -1.2645053   0.23690766  0.30703926  1.40930394
   2.51089024 -0.55078479  0.69390374 -0.31568395 -0.15717078  0.99376941
  -0.44481565 -0.5263848  -0.02989406 -0.03662078 -0.33990859 -0.79240582
   1.91485422 -0.59109863 -0.18867619 -0.02989406]
 [ 1.23573295 -0.96127545  1.39636071 -0.23250954 -0.38366418 -1.11040937
  -0.56871962 -1.1661254  -0.13046347 -0.31568395 -0.15717078  0.99376941
  -0.44481565 -0.5263848  -0.02989406 -0.03662078 -0.33990859 -0.79240582
   1.91485422 -0.59109863 -0.18867619 -0.02989406]
 [ 0.3176428   0.28267329 -1.2645053   0.77323095 -0.79808624  1.40930394
  -0.22654075  1.29523705 -0.54264708 -0.31568395 -0.15717078  0.99376941
  -0.44481565 -0.5263848  -0.02989406 -0.03662078 -0.33990859 -0.79240582
  -0.52223297  1.69176504 -0.18867619 -0.02989406]
 [-1.26814929 -0.91809432  0.06592771 -1.01651667 -0.79808624 -0.75045033
  -0.91089849 -0.55078479  0.28172013 -0.31568395 -0.15717078  0.99376941
  -0.44481565 -0.5263848  -0.0298

## Handle Class Imbalance

In [17]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE using a fixed seed so the synthetic rows are reproducible.
# NOTE(review): resampling is applied to the whole dataset before any visible
# train/test split; synthetic rows could end up in evaluation data — confirm
# how the saved arrays are split downstream.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=features, y=target)

# Report the resampled shapes, features first and target second.
print(X_resampled.shape, y_resampled.shape, sep='\n')

(3812, 22)
(3812,)


## Save the processed data

In [18]:
import numpy as np

# Persist the resampled feature matrix and target as .npy files.
# NOTE(review): both destinations are absolute, machine-specific paths.
np.save(
    file="/Users/agalyaayyadurai/PycharmProjects/Customer-Value-Insights/data/processed/X_resampled.npy",
    arr=X_resampled,
)
np.save(
    file="/Users/agalyaayyadurai/PycharmProjects/Customer-Value-Insights/data/processed/y_resampled.npy",
    arr=y_resampled,
)

## Save the engineered data

In [5]:
# Write the current `data` frame to CSV without the index column.
# NOTE(review): this cell's recorded execution count (5) is lower than that of
# the one-hot encoding cell (6) even though it sits at the end of the notebook.
# The CSV from the recorded session was therefore likely written before
# encoding, while a top-to-bottom run would save the encoded frame — confirm
# which version is intended.
engineered_data_path = "/Users/agalyaayyadurai/PycharmProjects/Customer-Value-Insights/data/processed/engineered_data.csv"
data.to_csv(path_or_buf=engineered_data_path, index=False)