In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

df = pd.read_csv('cleaned_dataset.csv')

In [4]:
##Quick peek
print("rows, cols:", df.shape)
display(df.head(3))

rows, cols: (10000, 20)


Unnamed: 0,Order_ID,Product_ID,User_ID,Order_Date,Return_Date,Product_Category,Product_Price,Order_Quantity,Return_Reason,Return_Status,Days_to_Return,User_Age,User_Gender,User_Location,Payment_Method,Shipping_Method,Discount_Applied,Supplier_Name,Marketing_Channel,Region
0,ORD00000000,PROD00000000,USER00000000,05/08/2023,26/08/2024,Clothing,411.59,3,Changedmind,Returned,387.0,58,Male,City54,DebitCard,Next-Day,45.27,GlobalSupplyCo,EmailMarketing,North
1,ORD00000001,PROD00000001,USER00000001,09/10/2023,09/11/2023,Books,288.88,3,Wrongitem,Returned,31.0,68,Female,City85,CreditCard,Express,47.79,MegaDistributorLLC,SocialMedia,Central
2,ORD00000002,PROD00000002,USER00000002,06/05/2023,,Toys,390.03,5,,NotReturned,,22,Female,City30,DebitCard,Next-Day,26.64,PrimeSourceLtd,EmailMarketing,South


In [5]:
# Make sure date columns are datetime (dayfirst True is safe if dates look like DD/MM/YYYY)
df['Order_Date']  = pd.to_datetime(df.get('Order_Date'),  dayfirst=True, errors='coerce')
df['Return_Date'] = pd.to_datetime(df.get('Return_Date'), dayfirst=True, errors='coerce')

In [6]:
# Create the target column `is_returned`
# Use Return_Date
df['is_returned'] = np.where(
    df['Return_Date'].notna(), 1,
    np.where(df.get('Return_Status').astype(str).str.lower() == 'returned', 1, 0)
)

# Show class balance
print("Returned / Not returned counts:")
print(df['is_returned'].value_counts())
print("\nReturned rate (fraction):")
print(df['is_returned'].value_counts(normalize=True))


Returned / Not returned counts:
is_returned
1    5052
0    4948
Name: count, dtype: int64

Returned rate (fraction):
is_returned
1    0.5052
0    0.4948
Name: proportion, dtype: float64


In [9]:
# Select features (predictors)
# Recommended features
recommended = [
    'Product_Category', 'Product_Price', 'Order_Quantity', 'Discount_Applied',
    'User_Age', 'User_Gender', 'Region', 'Supplier_Name', 'Marketing_Channel',
    'Shipping_Method', 'Payment_Method', 'Order_Channel'  # add/remove as needed
]

# Exclude identifiers and leakage columns if present
exclude = ['Order_ID','Product_ID','User_ID','Return_Date','Return_Reason','Days_to_Return','is_returned']

# Build final feature list as intersection with df columns, preserving recommended order first
features = [c for c in recommended if c in df.columns and c not in exclude]
# then add any other numeric columns that might be useful
numeric_candidates = [c for c in df.select_dtypes(include=['number']).columns if c not in features + exclude + ['is_returned']]
# choose to include numeric candidates automatically
features += numeric_candidates

print("Using features ({}):\n".format(len(features)), features)

# Create X and y
X = df[features].copy()
y = df['is_returned'].copy()

# Quick check for nulls in features
print("\nNulls per feature:")
print(X.isna().sum())

# show top rows of X and y
display(X.head(3))
print(y.head(3))


Using features (11):
 ['Product_Category', 'Product_Price', 'Order_Quantity', 'Discount_Applied', 'User_Age', 'User_Gender', 'Region', 'Supplier_Name', 'Marketing_Channel', 'Shipping_Method', 'Payment_Method']

Nulls per feature:
Product_Category     0
Product_Price        0
Order_Quantity       0
Discount_Applied     0
User_Age             0
User_Gender          0
Region               0
Supplier_Name        0
Marketing_Channel    0
Shipping_Method      0
Payment_Method       0
dtype: int64


Unnamed: 0,Product_Category,Product_Price,Order_Quantity,Discount_Applied,User_Age,User_Gender,Region,Supplier_Name,Marketing_Channel,Shipping_Method,Payment_Method
0,Clothing,411.59,3,45.27,58,Male,North,GlobalSupplyCo,EmailMarketing,Next-Day,DebitCard
1,Books,288.88,3,47.79,68,Female,Central,MegaDistributorLLC,SocialMedia,Express,CreditCard
2,Toys,390.03,5,26.64,22,Female,South,PrimeSourceLtd,EmailMarketing,Next-Day,DebitCard


0    1
1    1
2    0
Name: is_returned, dtype: int64


In [10]:
categorical_cols = X.select_dtypes(include=['object']).columns

# One-hot encode all categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Original features:", X.shape[1])
print("After encoding:", X_encoded.shape[1])

Original features: 11
After encoding: 31


In [11]:
# Fill numeric columns with median
numeric_cols = X_encoded.select_dtypes(include=['number']).columns
X_encoded[numeric_cols] = X_encoded[numeric_cols].fillna(X_encoded[numeric_cols].median())

# Fill any remaining missing values with mode
X_encoded = X_encoded.fillna(X_encoded.mode().iloc[0])

print("Null values after filling:", X_encoded.isna().sum().sum())

Null values after filling: 0


In [20]:
# Split Data into Train/Test Sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (8000, 31) Test shape: (2000, 31)


In [21]:
# Build Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [17]:
# Make Predictions
# Probabilities (0-1)
y_prob = model.predict_proba(X_test)[:, 1]

# Binary predictions (0 or 1)
y_pred = model.predict(X_test)

In [22]:
# Evaluate Performance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


# Feature Importance
# Logistic regression coefficients
feature_importance = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', key=abs, ascending=False)

print("\nTop 10 important features:")
display(feature_importance.head(10))

Accuracy: 0.4995

Confusion Matrix:
 [[433 557]
 [444 566]]

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.44      0.46       990
           1       0.50      0.56      0.53      1010

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.50      0.50      0.50      2000


Top 10 important features:


Unnamed: 0,Feature,Coefficient
18,Supplier_Name_ReliableSuppliesInc,-0.196485
30,Payment_Method_PayPal,-0.105204
15,Supplier_Name_MegaDistributorLLC,-0.088766
4,Product_Category_Clothing,0.087783
16,Supplier_Name_PrimeSourceLtd,-0.082743
9,Region_East,0.078605
14,Supplier_Name_GlobalSupplyCo,-0.076052
17,Supplier_Name_QuickStockPartners,-0.067369
5,Product_Category_Electronics,0.059555
12,Region_West,-0.058027


In [23]:
# Generate Risk Scores
full_prob = model.predict_proba(X_encoded)[:, 1]

df['return_probability'] = full_prob

# Create risk categories
df['risk_category'] = pd.cut(
    df['return_probability'],
    bins=[-0.01, 0.4, 0.7, 1.0],
    labels=['Low', 'Medium', 'High']
)

print("\nReturn risk distribution:")
print(df['risk_category'].value_counts())

# Export High-Risk Products
high_risk_df = df[df['risk_category'] == 'High'].copy()

# Include relevant product/order info
export_cols = ['Order_ID','Product_ID','Product_Category','Product_Price',
               'Order_Quantity','Return_Date','return_probability','risk_category']
existing_cols = [c for c in export_cols if c in df.columns]

high_risk_df[existing_cols].to_csv('high_risk_products.csv', index=False)

print("\nHigh-risk products exported: 'high_risk_products.csv'")


Return risk distribution:
risk_category
Medium    9994
Low          6
High         0
Name: count, dtype: int64

High-risk products exported: 'high_risk_products.csv'
