In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv("synthetic_ecommerce_sales_2025.csv")
df.head()

Unnamed: 0,order_id,customer_id,product_category,product_price,quantity,order_date,region,payment_method,delivery_days,is_returned,customer_rating,discount_percent,revenue
0,1,bdd640fb-0667-4ad1-9c80-317fa3b1799d,Beauty,190.4,5,2023-02-21,Europe,BankTransfer,8,0,3.8,0,952.0
1,2,23b8c1e9-3924-46de-beb1-3b9046685257,Fashion,82.22,3,2023-10-13,North America,CreditCard,5,0,3.8,0,246.66
2,3,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,Beauty,15.19,2,2023-06-28,Oceania,Cash,6,1,2.0,10,27.34
3,4,972a8469-1641-4f82-8b9d-2434e465e150,Electronics,310.65,2,2023-07-11,Europe,PayPal,9,0,2.9,5,590.23
4,5,17fc695a-07a0-4a6e-8822-e8f36c031199,Fashion,74.05,4,2023-02-24,Africa,PayPal,3,1,3.1,20,236.96


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   order_id          100000 non-null  int64  
 1   customer_id       100000 non-null  object 
 2   product_category  100000 non-null  object 
 3   product_price     100000 non-null  float64
 4   quantity          100000 non-null  int64  
 5   order_date        100000 non-null  object 
 6   region            100000 non-null  object 
 7   payment_method    100000 non-null  object 
 8   delivery_days     100000 non-null  int64  
 9   is_returned       100000 non-null  int64  
 10  customer_rating   100000 non-null  float64
 11  discount_percent  100000 non-null  int64  
 12  revenue           100000 non-null  float64
dtypes: float64(3), int64(5), object(5)
memory usage: 9.9+ MB


In [4]:
df.isnull().sum()

order_id            0
customer_id         0
product_category    0
product_price       0
quantity            0
order_date          0
region              0
payment_method      0
delivery_days       0
is_returned         0
customer_rating     0
discount_percent    0
revenue             0
dtype: int64

In [7]:
df['order_date'] = pd.to_datetime(df['order_date'])
df['order_month'] = df['order_date'].dt.month

In [11]:
# Return Rate by Product Category
category_returns = (
    df.groupby('product_category')['is_returned']
    .mean()
    .reset_index()
    .rename(columns={'is_returned': 'return_rate'})
)
category_returns

Unnamed: 0,product_category,return_rate
0,Automotive,0.052742
1,Beauty,0.047773
2,Electronics,0.05127
3,Fashion,0.121658
4,Home,0.051051
5,Sports,0.050509
6,Toys,0.048989


In [12]:
# Return Rate by Region
region_returns = (
    df.groupby('region')['is_returned']
    .mean()
    .reset_index()
    .rename(columns={'is_returned': 'return_rate'})
)
region_returns

Unnamed: 0,region,return_rate
0,Africa,0.057508
1,Asia,0.06061
2,Europe,0.060679
3,North America,0.061735
4,Oceania,0.062835
5,South America,0.060145


In [13]:
# Return Rate vs Delivery Days
delivery_returns = (
    df.groupby('delivery_days')['is_returned']
    .mean()
    .reset_index()
)
delivery_returns

Unnamed: 0,delivery_days,is_returned
0,1,0.061896
1,2,0.060066
2,3,0.062078
3,4,0.061787
4,5,0.063935
5,6,0.061673
6,7,0.060207
7,8,0.055128
8,9,0.058492


In [14]:
# Prepare Data for Logistic Regression
X = df.drop(columns=[
    'is_returned',
    'order_id',
    'customer_id',
    'order_date',
    'revenue'
])

y = df['is_returned']

categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [15]:
# Build ML Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [16]:
# Train & Evaluate Model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

y_pred_prob = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)

roc_auc

0.5882779552173129

In [17]:
# Generate Return Risk Score
df['return_risk_score'] = model.predict_proba(X)[:, 1]
df[['return_risk_score']].head()

Unnamed: 0,return_risk_score
0,0.047384
1,0.129425
2,0.04871
3,0.050638
4,0.115115


In [18]:
# Identify High-Risk Products (Top 10%)
threshold = df['return_risk_score'].quantile(0.90)

high_risk_products = (
    df[df['return_risk_score'] >= threshold]
    .groupby('product_category')
    .agg(
        avg_risk_score=('return_risk_score', 'mean'),
        return_rate=('is_returned', 'mean'),
        total_orders=('order_id', 'count')
    )
    .reset_index()
    .sort_values('avg_risk_score', ascending=False)
)

high_risk_products

Unnamed: 0,product_category,avg_risk_score,return_rate,total_orders
0,Fashion,0.123488,0.1227,10000


In [19]:
# Export CSV (Deliverable)
high_risk_products.to_csv("high_risk_products.csv", index=False)