In [1]:
pip install pandas numpy scikit-learn


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('ecommerce_dataset_updated.csv')

In [4]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor


In [7]:
df1 = pd.read_csv('orders_table.csv')

In [9]:
df.columns

Index(['User_ID', 'Product_ID', 'Category', 'Price (Rs.)', 'Discount (%)',
       'Final_Price(Rs.)', 'Payment_Method', 'Purchase_Date', 'order_id',
       'transaction_id', 'final_price', 'quantity', 'customer_country',
       'customer_age_group', 'customer_signup_date', 'brand', 'price_band',
       'net_revenue'],
      dtype='object')

In [10]:
# Making copies
sales_df = df.copy()
orders_df = df1.copy()

# Merge on order_id + transaction_id 
data = sales_df.merge(
    orders_df,
    on=['order_id', 'transaction_id', 'User_ID', 'Payment_Method'],
    how='left'
)


In [11]:
# Keeping delivered orders only
data = data[data['order_status'] == 'Delivered']

# Dropping rows with missing target
data = data.dropna(subset=['net_revenue'])

# Removing duplicates if any
data = data.drop_duplicates()


In [21]:
# Target
y = data['net_revenue']

# Features (remove all leakage + identifiers)
X = data.drop(columns=[
    'net_revenue',           # target
    'order_id',              # identifier
    'transaction_id',        # identifier
    'Purchase_Date',         # high-cardinality date
    'customer_signup_date',  # not useful for this objective

    # ---- LEAKAGE COLUMNS (REMOVED) ----
    'Price (Rs.)',
    'Final_Price(Rs.)',
    'final_price',
    'order_total'
], errors='ignore')



In [22]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [25]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])


In [26]:
pipeline.fit(X_train, y_train)


In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

y_pred = pipeline.predict(X_test)

print("MAE:", round(mean_absolute_error(y_test, y_pred), 2))
print("RMSE:", round(np.sqrt(mean_squared_error(y_test, y_pred)), 2))
print("R2 Score:", round(r2_score(y_test, y_pred), 3))


MAE: 79.99
RMSE: 114.66
R2 Score: 0.815


In [35]:
numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()


In [38]:
def prepare_new_orders(df_new, expected_columns):
    # Ensure all expected columns exist
    for col in expected_columns:
        if col not in df_new.columns:
            df_new[col] = np.nan  # placeholder
    
    # Reorder
    df_new = df_new[expected_columns]
    
    # Fill numeric NaNs with 0 (or mean if you prefer)
    for col in numeric_cols:
        if col in df_new.columns:
            df_new[col] = pd.to_numeric(df_new[col], errors='coerce').fillna(0)
    
    # Fill categorical NaNs with 'Unknown'
    for col in categorical_cols:
        if col in df_new.columns:
            df_new[col] = df_new[col].fillna('Unknown')
    
    return df_new


In [42]:
# Example new order with minimal info
new_orders = pd.DataFrame([{
    'Category': 'Electronics',
    'brand': 'Brand A',
    'Payment_Method': 'Credit Card',
    'customer_country': 'India',
    'customer_age_group': '26-35',
    'price_band': 'High',
    'quantity': 4,
    'Discount (%)': 20,
    'shipping_cost': 15.0,
    'total_items': 2
}])

# Prepare new orders so it matches pipeline
new_orders_prepared = prepare_new_orders(new_orders, expected_columns)

# Predict
predicted_revenue = pipeline.predict(new_orders_prepared)
new_orders_prepared['predicted_net_revenue'] = predicted_revenue

print(new_orders_prepared[['predicted_net_revenue']])


   predicted_net_revenue
0             910.229833
