DATA CREATION

In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)
n = 1000

df = pd.DataFrame({
    'Order_ID': range(1, n+1),
    'Customer_ID': np.random.randint(1000, 1100, size=n),
    'Order_Value': np.round(np.random.exponential(scale=80, size=n), 2),
    'Items_Ordered': np.random.randint(1, 5, size=n),
    'Payment_Method': np.random.choice(['Credit Card', 'Debit Card', 'UPI', 'Wallet', 'COD'], size=n),
    'Shipping_Days': np.random.randint(1, 10, size=n),
    'Returned': np.random.choice([0, 1], size=n, p=[0.85, 0.15])
})

df.to_csv("returns_feature_engineering.csv", index=False)


## Tasks:

1. Engineer at least 3 new features (numerical or categorical). Ideas:

    - Order value per item
    - Fast shipping flag
    - High-risk payment method flag

2. Encode categorical variables if needed

3. Use any model (e.g., logistic regression, decision tree) to assess feature impact on Returned

4. Rank features by importance (e.g., .coef_ or .feature_importances_)

5. Share:

    - Which engineered feature was most important?
    - How much did it improve prediction?



In [2]:
df.head()

Unnamed: 0,Order_ID,Customer_ID,Order_Value,Items_Ordered,Payment_Method,Shipping_Days,Returned
0,1,1051,70.71,2,COD,6,0
1,2,1092,109.46,1,Wallet,5,0
2,3,1014,45.2,2,Debit Card,8,0
3,4,1071,10.92,1,Credit Card,1,0
4,5,1060,26.7,1,COD,7,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Order_ID        1000 non-null   int64  
 1   Customer_ID     1000 non-null   int32  
 2   Order_Value     1000 non-null   float64
 3   Items_Ordered   1000 non-null   int32  
 4   Payment_Method  1000 non-null   object 
 5   Shipping_Days   1000 non-null   int32  
 6   Returned        1000 non-null   int64  
dtypes: float64(1), int32(3), int64(2), object(1)
memory usage: 43.1+ KB


In [12]:
df['Returned'].value_counts(sort=True)

Returned
0    863
1    137
Name: count, dtype: int64

In [13]:
df.head(3)

Unnamed: 0,Order_ID,Customer_ID,Order_Value,Items_Ordered,Payment_Method,Shipping_Days,Returned
0,1,1051,70.71,2,COD,6,0
1,2,1092,109.46,1,Wallet,5,0
2,3,1014,45.2,2,Debit Card,8,0


In [18]:
df['Payment_Method'].unique()

array(['COD', 'Wallet', 'Debit Card', 'Credit Card', 'UPI'], dtype=object)

In [19]:
df['Order_value_per_item'] = df['Order_Value'] / df['Items_Ordered']
df['Fast_shipping'] = (df['Shipping_Days'] <= 2).astype(int)
df['High_risk_payment'] = df['Payment_Method'].isin(['COD', 'Wallet']).astype(int)

In [28]:
df_encoded = pd.get_dummies(df, columns=['Payment_Method'])
df_encoded = df_encoded.drop(columns=['Payment_Method_COD', 'Payment_Method_Wallet'])

In [29]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Order_ID                    1000 non-null   int64  
 1   Customer_ID                 1000 non-null   int32  
 2   Order_Value                 1000 non-null   float64
 3   Items_Ordered               1000 non-null   int32  
 4   Shipping_Days               1000 non-null   int32  
 5   Returned                    1000 non-null   int64  
 6   Fast_shipping               1000 non-null   int64  
 7   Order_value_per_item        1000 non-null   float64
 8   High_risk_payment           1000 non-null   int64  
 9   Payment_Method_Credit Card  1000 non-null   bool   
 10  Payment_Method_Debit Card   1000 non-null   bool   
 11  Payment_Method_UPI          1000 non-null   bool   
dtypes: bool(3), float64(2), int32(3), int64(4)
memory usage: 61.6 KB


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Features
X = df_encoded.drop(columns=['Order_ID', 'Customer_ID', 'Returned'])
y = df_encoded['Returned']

# Standardize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model
model = LogisticRegression(class_weight='balanced')  # This tells the model to pay more attention to the minority class
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.86      0.44      0.58       174
           1       0.12      0.50      0.19        26

    accuracy                           0.45       200
   macro avg       0.49      0.47      0.39       200
weighted avg       0.76      0.45      0.53       200



In [32]:
import numpy as np

feature_names = X.columns
coefficients = model.coef_[0]

# Create DataFrame for ranking
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': np.abs(coefficients)
}).sort_values(by='Importance', ascending=False)

print(importance_df)


                      Feature  Importance
4        Order_value_per_item    0.263796
2               Shipping_Days    0.242377
7   Payment_Method_Debit Card    0.226552
6  Payment_Method_Credit Card    0.153582
1               Items_Ordered    0.096009
3               Fast_shipping    0.091231
5           High_risk_payment    0.056003
0                 Order_Value    0.049041
8          Payment_Method_UPI    0.012406
