# Customer Segmentation Based on Personality

## 1. Imports and Data Loading



In [None]:
# Imports
import pandas as pd

In [None]:
# Read the customer records from the CSV file into a DataFrame
DATA_PATH = 'customer_segmentation.csv'
df = pd.read_csv(DATA_PATH)

---

## 2. Data Exploration

In [None]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [None]:
# Peek at five randomly chosen rows (no random_state is passed, so the
# selection differs from run to run)
df.sample(n=5)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
261,1184,1968,PhD,Divorced,69674.0,0,2,22-05-2013,46,554,...,5,0,0,0,0,0,0,3,11,0
1971,4338,1983,Graduation,Single,39062.0,1,0,25-03-2014,28,23,...,4,1,0,0,0,0,0,3,11,0
2077,954,1960,Basic,Widow,22123.0,0,0,27-03-2014,77,3,...,5,0,0,0,0,0,0,3,11,0
438,5455,1971,PhD,Married,32011.0,1,0,22-08-2013,89,99,...,7,0,0,0,0,0,0,3,11,0
1097,10245,1986,2n Cycle,Single,80910.0,0,0,31-10-2012,71,160,...,1,0,0,0,0,0,0,3,11,0


In [None]:
# List the column labels of the freshly loaded DataFrame
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')

In [None]:
# Print a structural summary: index range plus the non-null count and dtype
# of every column (a quick way to spot columns with missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [None]:
# Count the missing entries in each column
df.isna().sum(axis=0)

Unnamed: 0,0
ID,0
Year_Birth,0
Education,0
Marital_Status,0
Income,24
Kidhome,0
Teenhome,0
Dt_Customer,0
Recency,0
MntWines,0


## 3. Data Processing

### Processing

In [None]:
def get_joining_year(row):
  """Return the joining year of one customer record as an int.

  Reads ``row['Dt_Customer']``, splits it on '-' and converts the third
  piece to an integer (e.g. '04-09-2012' -> 2012).
  """
  date_parts = row['Dt_Customer'].split('-')
  return int(date_parts[2])

# Derive each customer's joining year by applying get_joining_year row by row
df['customer_joining_year'] = df.apply(get_joining_year, axis='columns')

In [None]:
# Standardize marital status.
# Assign through a single df.loc[rows, column] call: the chained form
# df['Marital_Status'].loc[mask] = value writes to an intermediate Series,
# which triggers chained-assignment warnings and leaves df unchanged when
# pandas Copy-on-Write is active.
df.loc[df['Marital_Status'] == 'Together', 'Marital_Status'] = 'Married'
df.loc[df['Marital_Status'].isin(['Absurd', 'YOLO', 'Alone']), 'Marital_Status'] = 'Single'

In [None]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Drop Z_CostContact and Z_Revenue (they show the same values, 3 and 11, in
# every previewed row -- presumably constant; TODO confirm)
df = df.drop(['Z_CostContact', 'Z_Revenue'], axis=1)

In [None]:
# Standardize education levels.
# Use df.loc[rows, column] instead of the chained df['Education'].loc[...] form
# so the assignment reaches df itself (chained assignment does not update df
# when pandas Copy-on-Write is active).
# NOTE(review): 'Basic' is folded into 'Graduation' -- confirm this merge is intended.
df.loc[df['Education'] == '2n Cycle', 'Education'] = 'Master'
df.loc[df['Education'] == 'Basic', 'Education'] = 'Graduation'

---

## 4. Feature Engineering

In [None]:
# Build aggregate features from groups of related columns
REFERENCE_YEAR = 2025  # fixed year that the joining year is subtracted from
spend_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
campaign_columns = [
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'AcceptedCmp1',
    'AcceptedCmp2',
]

df['number_of_progeny'] = df['Kidhome'] + df['Teenhome']
df['years_since_joining_company'] = REFERENCE_YEAR - df['customer_joining_year']
# skipna=False keeps the same missing-value propagation as adding the columns with '+'
df['MntProds'] = df[spend_columns].sum(axis=1, skipna=False)
df['AcceptedCmp'] = df[campaign_columns].sum(axis=1, skipna=False)

---

## 5. Data Encoding

In [None]:
def encode_education(row):
  """Map a record's education label to an ordinal code.

  'PhD' -> 3, 'Master' -> 2, any other value -> 1.
  """
  ordinal_codes = {'PhD': 3, 'Master': 2}
  return ordinal_codes.get(row['Education'], 1)

# Replace the Education labels with their ordinal codes, one row at a time.
# Re-running this cell on the already-encoded column would yield 1 everywhere,
# because encode_education returns 1 for anything other than 'PhD' / 'Master'.
df['Education'] = df.apply(encode_education, axis='columns')

In [None]:
# Expand Marital_Status into one indicator column per category
df = pd.get_dummies(data=df, columns=['Marital_Status'])

In [None]:
# Remove the identifier and the source columns that were already folded into
# the combined features
columns_to_drop = [
    'ID',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'customer_joining_year',
    'Kidhome',
    'Teenhome',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Inspect the columns that remain after encoding and dropping the source fields
df.columns

Index(['Year_Birth', 'Education', 'Income', 'Recency', 'NumDealsPurchases',
       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
       'NumWebVisitsMonth', 'Complain', 'Response', 'number_of_progeny',
       'years_since_joining_company', 'MntProds', 'AcceptedCmp',
       'Marital_Status_Divorced', 'Marital_Status_Married',
       'Marital_Status_Single', 'Marital_Status_Widow'],
      dtype='object')