In [1]:
import pandas as pd
import numpy as np

In [13]:
df = pd.read_csv('cust_cleaned.csv')

In [7]:
# Display all columns
pd.set_option('display.max_columns', None)

In [4]:
# Basic investigation
print("--- First 5 rows ---")
print(df.head())

print("\n--- Dataset Info ---")
df.info()

print("\n--- Summary Statistics ---")
print(df.describe())

print("\n--- Missing Values ---")
print(df.isnull().sum())

print("\n--- Duplicated Rows ---")
print(df.duplicated().sum())

--- First 5 rows ---
   Unnamed: 0    ID  Year_Birth Education Marital_Status      Income  Kidhome  \
0           0  5524        1957        S1         Lajang  58138000.0        0   
1           1  2174        1954        S1         Lajang  46344000.0        1   
2           2  4141        1965        S1    Bertunangan  71613000.0        0   
3           3  6182        1984        S1    Bertunangan  26646000.0        1   
4           4  5324        1981        S3        Menikah  58293000.0        1   

   Teenhome Dt_Customer  Recency  MntCoke  MntFruits  MntMeatProducts  \
0         0  04-09-2012       58   635000      88000           546000   
1         1  08-03-2014       38    11000       1000             6000   
2         0  21-08-2013       26   426000      49000           127000   
3         0  10-02-2014       26    11000       4000            20000   
4         0  19-01-2014       94   173000      43000           118000   

   MntFishProducts  MntSweetProducts  MntGoldProds  N

In [8]:
# Print distinct values
print("\nDistinct Values for Each Column:")
for col in df.columns:
    unique_values = df[col].unique()
    print(f"{col}: {unique_values[:20]}{'...' if len(unique_values) > 20 else ''} (Total: {len(unique_values)})")


Distinct Values for Each Column:
Unnamed: 0: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]... (Total: 2240)
ID: [5524 2174 4141 6182 5324 7446  965 6177 4855 5899 1994  387 2125 8180
 2569 2114 9736 4939 6565 2278]... (Total: 2240)
Year_Birth: [1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 1983 1976 1959 1952
 1987 1946 1980 1949 1982 1979]... (Total: 59)
Education: ['S1' 'S3' 'S2' 'SMA' 'D3'] (Total: 5)
Marital_Status: ['Lajang' 'Bertunangan' 'Menikah' 'Cerai' 'Janda' 'Duda'] (Total: 6)
Income: [58138000. 46344000. 71613000. 26646000. 58293000. 62513000. 55635000.
 33454000. 30351000.  5648000.       nan  7500000. 63033000. 59354000.
 17323000. 82800000. 41850000. 37760000. 76995000. 33812000.]... (Total: 1975)
Kidhome: [0 1 2] (Total: 3)
Teenhome: [0 1 2] (Total: 3)
Dt_Customer: ['04-09-2012' '08-03-2014' '21-08-2013' '10-02-2014' '19-01-2014'
 '09-09-2013' '13-11-2012' '08-05-2013' '06-06-2013' '13-03-2014'
 '15-11-2013' '10-10-2012' '24-11-2012' '24-12-2012' '3

## Since there are no duplicates found, lets proceed to other pre-processing steps

### Handling Nulls

In [14]:
df['Income'].fillna(df['Income'].median(), inplace=True)
df.dropna(subset=['Conversion_Rate'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Income'].fillna(df['Income'].median(), inplace=True)


### Dropping Unnecessary Columns

In [15]:
df.drop(['Unnamed: 0', 'ID', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue', 'Age_Bin'], axis=1, inplace=True)

### Handling Education Feature by Mapping into numeric

In [16]:
education_mapping = {
    'SMA': 1,
    'D3': 2,
    'S1': 3,
    'S2': 4,
    'S3': 5
}

df['Education'] = df['Education'].map(education_mapping)

### One-hot Encode Marital Status Feature

In [17]:
df = pd.get_dummies(df, columns=['Marital_Status'], prefix='Marital', drop_first=True)

In [18]:
# Some backup
df.to_csv('preproc_nonstandardize.csv', index=False)

### Standardize the Data

In [19]:
from sklearn.preprocessing import StandardScaler

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [20]:
# Some backup
df.to_csv('preproc_standardized.csv', index=False)

In [21]:
print("\n--- Missing Values ---")
print(df.isnull().sum())

print("\n--- Duplicated Rows ---")
print(df.duplicated().sum())


--- Missing Values ---
Year_Birth             0
Education              0
Income                 0
Kidhome                0
Teenhome               0
Recency                0
MntCoke                0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Response               0
Customer_Age           0
Total_Children         0
Total_Spending         0
Total_Transactions     0
Conversion_Rate        0
Marital_Cerai          0
Marital_Duda           0
Marital_Janda          0
Marital_Lajang         0
Marital_Menikah        0
dtype: int64

--- Duplicated Rows ---
183


In [22]:
duplicated_rows = df[df.duplicated()]
print(duplicated_rows)

      Year_Birth  Education    Income   Kidhome  Teenhome   Recency   MntCoke  \
83     -0.485771   0.540696 -0.543180 -0.828645 -0.933088  0.239836 -0.570796   
179    -1.487266  -1.454687  1.076007 -0.828645 -0.933088 -0.174900 -0.288847   
281     0.599182  -0.456995 -0.025513 -0.828645  0.905863  1.207551 -0.021738   
282    -1.904556  -0.456995 -0.578100 -0.828645 -0.933088 -1.004370 -0.653896   
363     0.766098   1.538387 -0.579846  1.027686 -0.933088 -0.624196 -0.876487   
...          ...        ...       ...       ...       ...       ...       ...   
2218    0.766098  -0.456995 -1.186559  1.027686 -0.933088 -0.313145 -0.888359   
2221    1.099930   0.540696  0.965563 -0.828645 -0.933088 -1.280860  1.209933   
2225   -0.068481  -0.456995  0.266231  1.027686  0.905863  0.205274  0.188982   
2234    0.432266  -0.456995 -0.713678  1.027686 -0.933088  1.103867 -0.894295   
2236   -1.904556   1.538387  0.487932  2.884017  0.905863  0.239836  0.301761   

      MntFruits  MntMeatPro