In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# === LOAD DATASET ===
# Path is resolved relative to the notebook's working directory.
df = pd.read_csv('ecommerce_returns_with_return_rates.csv')

# === FEATURES AND TARGET ===
# Columns are declared once, grouped by how they are preprocessed; `features`
# is derived from the two groups so the lists cannot drift out of sync.
numerical_features = ['Price', 'Order_Quantity', 'Discount_Applied', 'User_Age']
categorical_features = ['User_Gender', 'User_Location', 'Shipping_Method', 'Payment_Method', 'Product_Category']
features = numerical_features + categorical_features
target = 'Is_Returned'

# === CHECK COLUMNS EXIST ===
# Fail early with the full list of absent columns instead of a KeyError later.
missing_cols = [col for col in features + [target] if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns: {missing_cols}")

# === HANDLE TARGET FIRST (Ensure it's clean) ===
# Drop rows without a label. `.copy()` makes the filtered frame independent of
# the original, so the assignment below does not raise SettingWithCopyWarning.
df = df[df[target].notnull()].copy()
if df[target].dtype == object:
    # Text labels are converted to 0/1. Any label other than 'Yes'/'No' would
    # become NaN and only fail later inside `fit`, so reject it explicitly here.
    mapped_target = df[target].map({'Yes': 1, 'No': 0})
    unexpected_labels = df.loc[mapped_target.isnull(), target].unique().tolist()
    if unexpected_labels:
        raise ValueError(f"Unexpected {target} labels: {unexpected_labels}")
    df[target] = mapped_target.astype(int)

# === DEFINE PREPROCESSORS ===
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric columns: fill gaps with the column mean (no scaling is applied).
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features)
])

# === DEFINE PIPELINE ===
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# === SPLIT DATA ===
# 80/20 split with a fixed seed so the evaluation is reproducible.
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === TRAIN MODEL ===
pipeline.fit(X_train, y_train)

# === PREDICT & EVALUATE ===
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]  # probability of class 1

print("\n--- Classification Report ---\n")
print(classification_report(y_test, y_pred))

print("\n--- ROC AUC Score ---")
print(roc_auc_score(y_test, y_proba))

# === EXPORT PREDICTIONS ===
risk_df = X_test.copy()
risk_df['Predicted_Probability'] = y_proba
risk_df['Predicted_Return'] = y_pred
# Assign positionally: `risk_df` keeps the original row labels from `X_test`,
# so a Series re-indexed from 0 would be aligned by label and land on the
# wrong rows (or become NaN). `y_test` is in the same row order as `X_test`.
risk_df['Actual_Return'] = y_test.values

risk_df.to_csv('return_risk_predictions.csv', index=False)
print("\n✅ CSV exported: 'return_risk_predictions.csv'")



--- Classification Report ---

              precision    recall  f1-score   support

           0       0.49      0.42      0.45      1012
           1       0.48      0.55      0.52       989

   micro avg       0.49      0.49      0.49      2001
   macro avg       0.49      0.49      0.48      2001
weighted avg       0.49      0.49      0.48      2001


--- ROC AUC Score ---
0.4749207687727053

✅ CSV exported: 'return_risk_predictions.csv'




In [25]:
from sklearn.ensemble import RandomForestClassifier

# Replace the last pipeline step with a seeded random forest. This only swaps
# the estimator object; the pipeline is not refit in this cell.
forest_step = ('classifier', RandomForestClassifier(random_state=42))
pipeline.steps[-1] = forest_step


In [22]:
# Relative frequency of each target value, printed to check class balance.
returned_share = df['Is_Returned'].value_counts(normalize=True)
print(returned_share)


1    0.505199
0    0.494801
Name: Is_Returned, dtype: float64


In [24]:
import warnings

# Install an "ignore" filter so DeprecationWarning messages are no longer shown.
warnings.simplefilter("ignore", category=DeprecationWarning)


In [28]:
# NOTE: LogisticRegression is already imported in the first cell; this import
# is repeated here so the cell can run on its own.
from sklearn.linear_model import LogisticRegression

# Scratch cell: constructs an unfitted estimator with the 'liblinear' solver so
# that its parameter repr is displayed as the cell output. The object is not
# assigned to a name and is not attached to `pipeline`.
LogisticRegression(max_iter=1000, solver='liblinear')  # good for small/medium datasets


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
# Scratch cell: constructs an unfitted estimator with the 'lbfgs' solver so that
# its parameter repr is displayed as the cell output. The object is not assigned
# to a name and is not attached to `pipeline`.
LogisticRegression(max_iter=1000, solver='lbfgs')


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)