[Reference](https://medium.com/@nomannayeem/from-messy-to-magic-a-beginner-to-expert-guide-on-data-cleaning-and-preprocessing-with-python-044ed8a3eb1f)

# Generating the Dataset

In [1]:
import pandas as pd
import numpy as np

# Fix the RNG seed so the synthetic purchase values are reproducible.
np.random.seed(42)

# Step 1: build the 95 well-formed baseline records.
customer_ids = ['CUST' + str(n) for n in range(1, 96)]
purchase_values = np.random.uniform(low=20, high=500, size=95).tolist()
# Cycle through the four regions and keep exactly the first 95 entries.
regions = (['North', 'South', 'East', 'West'] * 24)[:95]
purchase_dates = pd.date_range('2023-01-01', periods=95, freq='D').tolist()

# Step 2: append 7 irregular records: one extreme purchase value (10000),
# missing values in every column except customer_id, inconsistent region
# casing ('north', 'EAST'), and repeated customer IDs (CUST5, CUST10).
customer_ids.extend(
    ['CUST96', 'CUST97', 'CUST98', 'CUST99', 'CUST100', 'CUST5', 'CUST10']
)
purchase_values.extend([10000, None, None, None, 250.0, 45.0, 30.0])
regions.extend(['north', 'EAST', None, 'West', 'South', 'North', 'West'])
purchase_dates.extend([
    None,
    None,
    None,
    pd.Timestamp('2023-04-10'),
    pd.Timestamp('2023-04-11'),
    pd.Timestamp('2023-01-05'),
    pd.Timestamp('2023-01-10'),
])

# Sanity check: all four columns must have the same length before they are
# combined into a DataFrame.
print(
    "Lengths:",
    [len(column) for column in (customer_ids, purchase_values, regions, purchase_dates)],
)

# Step 3: assemble the DataFrame (dict insertion order fixes column order).
data = dict(
    customer_id=customer_ids,
    purchase_value=purchase_values,
    region=regions,
    purchase_date=purchase_dates,
)
messy_data = pd.DataFrame(data)

# Persist the raw dataset for the cleaning steps and show a preview.
messy_data.to_csv("messy_retail_data.csv", index=False)
print("Synthetic dataset created:")
print(messy_data.head(10))

Lengths: [102, 102, 102, 102]
Synthetic dataset created:
  customer_id  purchase_value region purchase_date
0       CUST1      199.779257  North    2023-01-01
1       CUST2      476.342867  South    2023-01-02
2       CUST3      371.357092   East    2023-01-03
3       CUST4      307.356072   West    2023-01-04
4       CUST5       94.888947  North    2023-01-05
5       CUST6       94.877370  South    2023-01-06
6       CUST7       47.880134   East    2023-01-07
7       CUST8      435.764550   West    2023-01-08
8       CUST9      308.535206  North    2023-01-09
9      CUST10      359.874837  South    2023-01-10


# Step 1: Load the Data

In [2]:
# Read the raw dataset written by the generation step back from disk.
# No date parsing is requested, so purchase_date is loaded as plain text.
df = pd.read_csv("messy_retail_data.csv")
print("Original Data:", df.head(), sep="\n")

Original Data:
  customer_id  purchase_value region purchase_date
0       CUST1      199.779257  North    2023-01-01
1       CUST2      476.342867  South    2023-01-02
2       CUST3      371.357092   East    2023-01-03
3       CUST4      307.356072   West    2023-01-04
4       CUST5       94.888947  North    2023-01-05


# Step 2: Handle Missing Values

In [3]:
# Replacement value for each column that can contain missing entries:
#   purchase_value -> median of the observed purchase values
#   region         -> the placeholder label 'Unknown'
#   purchase_date  -> the fixed default date '2023-01-01'
fill_values = {
    'purchase_value': df['purchase_value'].median(),
    'region': 'Unknown',
    'purchase_date': '2023-01-01',
}

# Apply the replacements one column at a time.
for column, replacement in fill_values.items():
    df[column] = df[column].fillna(replacement)

print("After Handling Missing Values:", df.head(), sep="\n")

After Handling Missing Values:
  customer_id  purchase_value region purchase_date
0       CUST1      199.779257  North    2023-01-01
1       CUST2      476.342867  South    2023-01-02
2       CUST3      371.357092   East    2023-01-03
3       CUST4      307.356072   West    2023-01-04
4       CUST5       94.888947  North    2023-01-05


# Step 3: Remove Duplicates

In [4]:
# Remove rows that are exact copies of an earlier row, keeping the first
# occurrence. All columns are compared, so rows that merely share a
# customer_id but differ in any other column are retained.
# NOTE(review): if one row per customer is intended, this would need
# subset='customer_id' - confirm the desired behaviour.
df = df.drop_duplicates(keep='first')

print("After Removing Duplicates:", df.head(), sep="\n")

After Removing Duplicates:
  customer_id  purchase_value region purchase_date
0       CUST1      199.779257  North    2023-01-01
1       CUST2      476.342867  South    2023-01-02
2       CUST3      371.357092   East    2023-01-03
3       CUST4      307.356072   West    2023-01-04
4       CUST5       94.888947  North    2023-01-05


# Step 4: Standardize Text Data

In [5]:
# Lower-case every region label so variants such as 'North' and 'north'
# collapse into a single category.
region_lowercased = df['region'].str.lower()
df['region'] = region_lowercased

print("After Standardizing Text Data:", df['region'].unique(), sep="\n")

After Standardizing Text Data:
['north' 'south' 'east' 'west' 'unknown']


# Step 5: Handle Outliers

In [6]:
# Cap extreme purchases: any value above the 95th percentile is replaced by
# the 95th-percentile value; values at or below it are left unchanged.
purchases = df['purchase_value']
upper_limit = purchases.quantile(q=0.95)
df['purchase_value'] = purchases.clip(upper=upper_limit)

print("After Handling Outliers:", df['purchase_value'].describe(), sep="\n")

After Handling Outliers:
count    102.000000
mean     247.154974
std      142.851603
min       22.650616
25%      114.399712
50%      246.663164
75%      371.070560
max      476.298977
Name: purchase_value, dtype: float64


# Step 6: Scale Numerical Features

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale purchase_value into a new column using the scaler's default
# settings. The double brackets select a one-column DataFrame, because the
# scaler expects 2-D input.
scaler = MinMaxScaler()
purchase_frame = df[['purchase_value']]
scaled_purchases = scaler.fit_transform(purchase_frame)
df['purchase_value_scaled'] = scaled_purchases

print("After Scaling:", df[['purchase_value', 'purchase_value_scaled']].head(), sep="\n")

After Scaling:
   purchase_value  purchase_value_scaled
0      199.779257               0.390454
1      476.298977               1.000000
2      371.357092               0.768671
3      307.356072               0.627591
4       94.888947               0.159239


# Step 7: Save the Cleaned Data

In [8]:
# Write the cleaned frame to disk without the index column, then confirm.
df.to_csv(path_or_buf="cleaned_retail_data.csv", index=False)
print("Cleaned dataset saved to 'cleaned_retail_data.csv'")

Cleaned dataset saved to 'cleaned_retail_data.csv'
