In [2]:
# Step 1: Import pandas and make the orders CSV available to the runtime
import os

import pandas as pd

try:
    # google.colab only exists inside a Colab runtime; keep the cell usable elsewhere.
    from google.colab import files
except ImportError:
    files = None

DATA_FILE = "ecommerce_orders_with_return_status.csv"

# Only open the upload widget when the file is not already on disk, so that
# re-running the notebook does not block waiting for a manual upload.
if os.path.exists(DATA_FILE):
    uploaded = {}  # nothing new was uploaded in this run
elif files is not None:
    uploaded = files.upload()
else:
    raise FileNotFoundError(
        f"{DATA_FILE} was not found; place it next to this notebook or run inside Google Colab."
    )

# Step 2: Read the uploaded CSV file
df = pd.read_csv(DATA_FILE)

# Display first few rows
df.head()


Saving ecommerce_orders_with_return_status.csv to ecommerce_orders_with_return_status.csv


Unnamed: 0,Order ID,Product ID,Category,Region,Supplier,Marketing Channel,Price,Order Date,Customer ID,Returned
0,O1000,P200,Books,North,Supplier C,Referral,193.52,01-01-2024,CUST100,0
1,O1001,P201,Electronics,East,Supplier C,Search Ads,475.85,02-01-2024,CUST101,0
2,O1002,P202,Clothing,East,Supplier C,Referral,368.68,03-01-2024,CUST102,0
3,O1003,P203,Clothing,East,Supplier A,Search Ads,303.34,04-01-2024,CUST103,0
4,O1004,P204,Books,South,Supplier B,Email,86.45,05-01-2024,CUST104,1


In [7]:
# Explore the dataset (read-only checks; nothing is modified in this cell)

# Show first 5 rows
print(" First 5 rows:")
print(df.head())

# Show the column names
print("\n Column Names:")
print(df.columns)

# Data types and non-null count
print("\n Data Info:")
# info() writes its report to stdout itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()

# Summary stats for all columns
print("\n Summary Stats:")
print(df.describe(include='all'))

# Check for missing values
print("\n Missing Values:")
print(df.isnull().sum())

# Check for duplicate records
print("\n Duplicate Rows:")
print(df.duplicated().sum())

# Optional: Unique values in key columns.
# The dataset's columns are named 'Returned' and 'Category' (see the column
# listing printed above), so those are the names checked here.
if 'Returned' in df.columns:
    print("\n Unique values in 'Returned':", df['Returned'].unique())
if 'Category' in df.columns:
    print(" Unique Product Categories:", df['Category'].unique())


 First 5 rows:
  Order ID Product ID     Category Region    Supplier Marketing Channel  \
0    O1000       P200        Books  North  Supplier C          Referral   
1    O1001       P201  Electronics   East  Supplier C        Search Ads   
2    O1002       P202     Clothing   East  Supplier C          Referral   
3    O1003       P203     Clothing   East  Supplier A        Search Ads   
4    O1004       P204        Books  South  Supplier B             Email   

    Price  Order Date Customer ID  Returned  
0  193.52  01-01-2024     CUST100         0  
1  475.85  02-01-2024     CUST101         0  
2  368.68  03-01-2024     CUST102         0  
3  303.34  04-01-2024     CUST103         0  
4   86.45  05-01-2024     CUST104         1  

 Column Names:
Index(['Order ID', 'Product ID', 'Category', 'Region', 'Supplier',
       'Marketing Channel', 'Price', 'Order Date', 'Customer ID', 'Returned'],
      dtype='object')

 Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries

In [9]:
# Analyze Return Rates
# Return Rate by Category: mean of the "Returned" column per category, scaled to a percentage
category_return = (
    df.groupby("Category")["Returned"]
    .mean()
    .mul(100)
    .rename("Return Rate (%)")
    .reset_index()
)
print("Return Rate by Category:\n", category_return)


Return Rate by Category:
       Category  Return Rate (%)
0       Beauty        35.000000
1        Books        26.315789
2     Clothing        21.428571
3  Electronics        47.368421
4   Home Decor        21.428571


In [11]:
# Return Rate by Region: mean of the "Returned" column per region, scaled to a percentage
region_return = (
    df.groupby("Region")["Returned"]
    .mean()
    .mul(100)
    .rename("Return Rate (%)")
    .reset_index()
)
print("\n Return Rate by Region:\n", region_return)


 Return Rate by Region:
   Region  Return Rate (%)
0   East        28.571429
1  North        34.482759
2  South        29.166667
3   West        26.923077


In [13]:
# Return Rate by Marketing Channel: mean of the "Returned" column per channel, scaled to a percentage
channel_return = (
    df.groupby("Marketing Channel")["Returned"]
    .mean()
    .mul(100)
    .rename("Return Rate (%)")
    .reset_index()
)
print("\n Return Rate by Marketing Channel:\n", channel_return)


 Return Rate by Marketing Channel:
   Marketing Channel  Return Rate (%)
0             Email        21.052632
1          Referral        43.750000
2        Search Ads        13.636364
3      Social Media        33.333333


In [15]:
# Return Rate by Supplier: mean of the "Returned" column per supplier, scaled to a percentage
supplier_return = (
    df.groupby("Supplier")["Returned"]
    .mean()
    .mul(100)
    .rename("Return Rate (%)")
    .reset_index()
)
print("\n Return Rate by Supplier:\n", supplier_return)


 Return Rate by Supplier:
      Supplier  Return Rate (%)
0  Supplier A        38.888889
1  Supplier B        26.666667
2  Supplier C        23.529412


In [17]:
# Prepare the data for a logistic regression model that predicts returns.
# The text-valued columns listed below are one-hot encoded; drop_first=True
# omits the first level of each column so it acts as the reference level.
categorical_columns = ["Category", "Region", "Supplier", "Marketing Channel"]

df_encoded = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

# Preview the encoded frame
df_encoded.head()


Unnamed: 0,Order ID,Product ID,Price,Order Date,Customer ID,Returned,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Region_North,Region_South,Region_West,Supplier_Supplier B,Supplier_Supplier C,Marketing Channel_Referral,Marketing Channel_Search Ads,Marketing Channel_Social Media
0,O1000,P200,193.52,01-01-2024,CUST100,0,True,False,False,False,True,False,False,False,True,True,False,False
1,O1001,P201,475.85,02-01-2024,CUST101,0,False,False,True,False,False,False,False,False,True,False,True,False
2,O1002,P202,368.68,03-01-2024,CUST102,0,False,True,False,False,False,False,False,False,True,True,False,False
3,O1003,P203,303.34,04-01-2024,CUST103,0,False,True,False,False,False,False,False,False,False,False,True,False
4,O1004,P204,86.45,05-01-2024,CUST104,1,True,False,False,False,False,True,False,True,False,False,False,False


In [23]:
from sklearn.model_selection import train_test_split

# Columns that are not model inputs: the ID and date fields, plus the target itself
non_feature_columns = ["Order ID", "Product ID", "Order Date", "Customer ID", "Returned"]

# Feature matrix (X) and target vector (y)
X = df_encoded.drop(columns=non_feature_columns)
y = df_encoded["Returned"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Build the classifier and fit it on the training split
# (fit() returns the fitted estimator, so construction and training can be chained)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted class labels for the held-out rows
y_pred = model.predict(X_test)

# Evaluate on the test split and print each metric under its heading
evaluation = [
    (" Accuracy:", accuracy_score(y_test, y_pred)),
    ("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("\n Classification Report:\n", classification_report(y_test, y_pred)),
]
for heading, result in evaluation:
    print(heading, result)


 Accuracy: 0.6666666666666666

 Confusion Matrix:
 [[17  2]
 [ 8  3]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.89      0.77        19
           1       0.60      0.27      0.38        11

    accuracy                           0.67        30
   macro avg       0.64      0.58      0.57        30
weighted avg       0.65      0.67      0.63        30



In [27]:
# Predict return probabilities for the test rows and export the highest-risk ones

# Column 1 of predict_proba is the probability of the positive class (Returned == 1)
return_probs = model.predict_proba(X_test)[:, 1]

# Add probabilities to a copy of the test set
X_test_copy = X_test.copy()
X_test_copy["Return Probability"] = return_probs
X_test_copy["Actual Return"] = y_test.values

# X_test holds only model features: the ID columns were dropped before training.
# Without them the exported rows cannot be traced back to an order or product,
# so re-attach the identifiers from the original frame. X_test keeps df's row
# labels, which makes a label-based lookup and index join line the rows up.
id_columns = ["Order ID", "Product ID", "Customer ID"]
scored_orders = df.loc[X_test_copy.index, id_columns].join(X_test_copy)

# Get top 10 high-risk products (those with highest return probability)
high_risk_products = scored_orders.sort_values(by="Return Probability", ascending=False).head(10)

# Export to CSV (the row index is omitted; the ID columns identify each row)
high_risk_products.to_csv("high_risk_products.csv", index=False)

print(" Exported high-risk products to 'high_risk_products.csv'")


 Exported high-risk products to 'high_risk_products.csv'
