In [2]:
import numpy as np
import pandas as pd

In [3]:
# Seed NumPy's global random generator so the np.random draws used to build
# the synthetic dataset produce the same values on every run.

np.random.seed(42)

In [4]:
# Number of synthetic customers (rows) to generate.
num_customers = 500


In [5]:
# Synthetic customer attributes. The four np.random draws below consume the
# global NumPy random state one after another, so their order is part of what
# makes the generated values reproducible.
customerIds = np.arange(1, num_customers + 1)  # sequential IDs 1..num_customers
genders = np.random.choice(['Male', 'Female'], size=num_customers)
ages = np.random.randint(18, 70, size=num_customers)  # upper bound exclusive: 18..69
annual_incomes = np.random.randint(20000, 150000, size=num_customers)  # 20000..149999
spending_scores = np.random.randint(1, 101, size=num_customers)  # 1..100

# Column name -> (values, dtype); insertion order fixes the column order.
column_spec = {
    'CustomerID': (customerIds, 'int64'),
    'Gender': (genders, 'object'),
    'Age': (ages, 'int64'),
    'AnnualIncome': (annual_incomes, 'int64'),
    'SpendingScore': (spending_scores, 'int64'),
}

# Assemble the frame, then pin every column to its declared dtype.
customer_data = pd.DataFrame(
    {column: values for column, (values, _) in column_spec.items()}
).astype(
    {column: dtype for column, (_, dtype) in column_spec.items()}
)

# Persist without the index so the CSV holds only the five data columns.
customer_data.to_csv('synthetic_customer_data.csv', index=False)

print("Synthetic customer data generated and saved to 'synthetic_customer_data.csv'")

Synthetic customer data generated and saved to 'synthetic_customer_data.csv'


In [6]:
# Reload the generated CSV from disk; the rest of the notebook works on this
# frame rather than on the in-memory customer_data.
data = pd.read_csv('synthetic_customer_data.csv')

In [7]:
# Preview the first five rows to sanity-check the columns and values.
data.head()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore
0,1,Male,38,81886,57
1,2,Female,49,95745,26
2,3,Male,40,103251,41
3,4,Male,50,114456,35
4,5,Male,20,73413,63


In [9]:
# (rows, columns) of the loaded frame.
data.shape

(500, 5)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the non-numeric Gender column is not included.
data.describe()

Unnamed: 0,CustomerID,Age,AnnualIncome,SpendingScore
count,500.0,500.0,500.0,500.0
mean,250.5,43.8,85611.88,50.742
std,144.481833,14.835504,39845.53216,29.071735
min,1.0,18.0,20055.0,1.0
25%,125.75,31.0,48723.75,25.75
50%,250.5,44.0,88011.5,51.0
75%,375.25,56.0,121607.75,75.25
max,500.0,69.0,149922.0,100.0


In [11]:
# Count missing values per column.
data.isnull().sum()

CustomerID       0
Gender           0
Age              0
AnnualIncome     0
SpendingScore    0
dtype: int64

In [12]:
# Drop columns that are not needed for the segmentation: CustomerID is just a
# sequential row identifier.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone is a
# no-op rather than a KeyError.
data = data.drop(columns=['CustomerID'], errors='ignore')