In [1]:
# Step 1: Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook display configuration: allow wide frames to show up to 100 columns
# and use seaborn's "whitegrid" style for plots.
pd.options.display.max_columns = 100
sns.set(style="whitegrid")

# Step 2: Read the e-commerce sales CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="ecommerce_sales_data.csv")

# Preview the first five rows
df.head(n=5)


Unnamed: 0,Order Date,Product Name,Category,Region,Quantity,Sales,Profit
0,2024-12-31,Printer,Office,North,4,3640,348.93
1,2022-11-27,Mouse,Accessories,East,7,1197,106.53
2,2022-05-11,Tablet,Electronics,South,5,5865,502.73
3,2024-03-16,Mouse,Accessories,South,2,786,202.87
4,2022-09-10,Mouse,Accessories,West,1,509,103.28


**Observation:**  
We can see the dataset columns and a few sample records. It contains both numeric and categorical features.


# Part C: Multiple Linear Regression

## 1. Select TWO or MORE features
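We use the numeric features `Quantity` and `Profit`, add `Order_Year` and `Order_Month` derived from `Order Date`, and one-hot encode the categorical columns (`Product Name`, `Category`, `Region`). The target is `Sales`.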


In [13]:
# NOTE(review): train_test_split, LinearRegression, mean_squared_error and
# r2_score are not imported in the import cell at the top of the notebook;
# confirm an earlier cell brings them into scope.

# Parse "Order Date" as datetime, then derive year and month features from it.
# These columns are written back onto `df` itself.
df["Order Date"] = pd.to_datetime(df["Order Date"])
df["Order_Year"] = df["Order Date"].dt.year
df["Order_Month"] = df["Order Date"].dt.month

# Modelling frame: the raw date column is removed and the categorical columns
# are one-hot encoded. drop_first=True omits one level per categorical column,
# so each dummy coefficient is measured relative to that omitted level.
df_ml = pd.get_dummies(
    df.drop(columns=["Order Date"]),
    columns=["Product Name", "Category", "Region"],
    drop_first=True,
)

# Target is Sales; every remaining column is used as a predictor.
# NOTE(review): Profit is among the predictors -- confirm it is legitimately
# known at prediction time and is not leaking the target.
y = df_ml["Sales"]
X = df_ml.drop(columns=["Sales"])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the multiple linear regression and predict on the held-out rows
multi_reg = LinearRegression().fit(X_train, y_train)
y_pred = multi_reg.predict(X_test)

# Evaluation metrics on the test split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as Sales

print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r2)


MSE: 1754854.7635348828
RMSE: 1324.7093128437207
R2 Score: 0.7202933867924615


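**Observation:**  
Using multiple features usually improves prediction accuracy compared to simple regression. Here the model explains about 72% of the variance in Sales on the test set (R² ≈ 0.72), with an RMSE of about 1,325.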


## 5. Interpret coefficients


In [14]:
# Pair each feature name with its fitted coefficient, largest coefficient first
coeff_df = (
    pd.DataFrame({"Feature": X.columns, "Coefficient": multi_reg.coef_})
    .sort_values(by="Coefficient", ascending=False)
)

# Show the ten largest coefficients
coeff_df.head(10)


Unnamed: 0,Feature,Coefficient
0,Quantity,258.500915
10,Product Name_Smartphone,121.554101
17,Region_West,112.579825
14,Category_Office,58.3897
9,Product Name_Printer,58.3897
12,Product Name_Tablet,57.925417
15,Region_North,56.032601
2,Order_Year,34.319496
7,Product Name_Monitor,19.307197
8,Product Name_Mouse,16.190711


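**Interpretation:**  
A positive coefficient means predicted Sales increase as that feature increases, holding the other features fixed; a negative coefficient means predicted Sales decrease. For example, each additional unit of `Quantity` is associated with about 258.5 more in predicted Sales. For one-hot encoded features, the coefficient is the difference relative to the dropped baseline category. Note that `Category_Office` and `Product Name_Printer` have identical coefficients (58.3897), which suggests these two columns are perfectly collinear; their individual coefficients should not be interpreted separately.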


**Part C Summary:**  
Multiple regression captures more complexity and usually improves model performance.
