In [8]:
# PART VI CODE: Amazon.csv
# Import Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score

In [9]:
# Load Data

# Make sure Amazon.csv is in the same folder as your notebook, or give full path
df = pd.read_csv('C:/Users/DKell/Downloads/Amazon.csv')

In [10]:
# QUICK LOOK
print("Shape:", df.shape)
print(df.head())
print(df.dtypes)

Shape: (100000, 20)
      OrderID   OrderDate  CustomerID   CustomerName ProductID  \
0  ORD0000001  2023-01-31  CUST001504  Vihaan Sharma    P00014   
1  ORD0000002  2023-12-30  CUST000178    Pooja Kumar    P00040   
2  ORD0000003  2022-05-10  CUST047516    Sneha Singh    P00044   
3  ORD0000004  2023-07-18  CUST030059   Vihaan Reddy    P00041   
4  ORD0000005  2023-02-04  CUST048677  Aditya Kapoor    P00029   

           ProductName        Category       Brand  Quantity  UnitPrice  \
0           Drone Mini           Books   BrightLux         3     106.59   
1           Microphone  Home & Kitchen  UrbanStyle         1     251.37   
2  Power Bank 20000mAh        Clothing  UrbanStyle         3      35.03   
3       Webcam Full HD  Home & Kitchen      Zenith         5      33.58   
4              T-Shirt        Clothing    KiddoFun         2     515.64   

   Discount    Tax  ShippingCost  TotalAmount     PaymentMethod OrderStatus  \
0      0.00   0.00          0.09       319.86        

In [11]:
# 1. DEFINE FEATURES FOR REGRESSION
target_reg = "TotalAmount"

numeric_features_reg = ["Quantity", "UnitPrice", "Discount", "Tax", "ShippingCost"]
categorical_features_reg = ["Category", "Brand", "PaymentMethod", "City", "State", "Country"]

In [12]:
# Keep only rows where target is not missing
df_reg = df.dropna(subset=[target_reg]).copy()

X_reg = df_reg[numeric_features_reg + categorical_features_reg]
y_reg = df_reg[target_reg]

print("Regression dataset shape:", X_reg.shape, y_reg.shape)

Regression dataset shape: (100000, 11) (100000,)


In [13]:
# Preprocessor for Regression

numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler())
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocessor_reg = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_reg),
        ("cat", categorical_transformer, categorical_features_reg),
    ]
)

In [14]:
# This is assumming preprocessor_reg is already defined

linreg_model = Pipeline(
    steps=[
        ("preprocess", preprocessor_reg),
        ("model", LinearRegression())
    ]
)

tree_model = Pipeline(
    steps=[
        ("preprocess", preprocessor_reg),
        ("model", DecisionTreeRegressor(
            max_depth=10,
            min_samples_leaf=50,
            random_state=42
        ))
    ]
)

In [15]:
# Train and test split

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

print("Train size:", X_train_reg.shape, "Test size:", X_test_reg.shape)

Train size: (70000, 11) Test size: (30000, 11)


In [16]:
# 5. FIT MODELS & EVALUATE R² (8.1.1)

linreg_model.fit(X_train_reg, y_train_reg)
tree_model.fit(X_train_reg, y_train_reg)

y_pred_lin = linreg_model.predict(X_test_reg)
y_pred_tree = tree_model.predict(X_test_reg)

r2_lin = r2_score(y_test_reg, y_pred_lin)
r2_tree = r2_score(y_test_reg, y_pred_tree)

print(f"Linear Regression R² on test set: {r2_lin:.4f}")
print(f"Decision Tree R² on test set:     {r2_tree:.4f}")

Linear Regression R² on test set: 0.9096
Decision Tree R² on test set:     0.9963


In [29]:
# NEW RECORD PREDICTION (8.1.2)

# Create a realistic new order. EDIT values & categories as needed.
new_order = {
    "Quantity": 2,
    "UnitPrice": 499.99,
    "Discount": 50.00,
    "Tax": 80.00,
    "ShippingCost": 20.00,
    "Category": "Electronics",
    "Brand": "CoreTech",
    "PaymentMethod": "Credit Card",
    "City": "Austin",
    "State": "TX",
    "Country": "United States",
}

new_order_df = pd.DataFrame([new_order])

pred_lin_new = linreg_model.predict(new_order_df)[0]
pred_tree_new = tree_model.predict(new_order_df)[0]

print("\n=== New Order Prediction (Regression) ===")
print("Linear Regression prediction:", pred_lin_new)
print("Decision Tree prediction:     ", pred_tree_new)



=== New Order Prediction (Regression) ===
Linear Regression prediction: -38788.22925688919
Decision Tree prediction:      922.4004


In [14]:
# 7. 10-FOLD CROSS-VALIDATION (8.1.3)

# Use a subset if your data is huge (optional)
X_reg_small = X_reg
y_reg_small = y_reg

kfold_reg = KFold(n_splits=10, shuffle=True, random_state=42)

# Linear Regression CV R² and RMSE
cv_r2_lin = cross_val_score(
    linreg_model, X_reg_small, y_reg_small,
    cv=kfold_reg, scoring="r2"
)

cv_mse_lin = cross_val_score(
    linreg_model, X_reg_small, y_reg_small,
    cv=kfold_reg, scoring="neg_mean_squared_error"
)

# Decision Tree CV R² and RMSE
cv_r2_tree = cross_val_score(
    tree_model, X_reg_small, y_reg_small,
    cv=kfold_reg, scoring="r2"
)

cv_mse_tree = cross_val_score(
    tree_model, X_reg_small, y_reg_small,
    cv=kfold_reg, scoring="neg_mean_squared_error"
)

rmse_lin = np.sqrt(-cv_mse_lin)
rmse_tree = np.sqrt(-cv_mse_tree)

print("\n=== 10-fold CV: Linear Regression ===")
print(f"Mean R²:  {cv_r2_lin.mean():.4f} ± {cv_r2_lin.std():.4f}")
print(f"Mean RMSE:{rmse_lin.mean():.4f} ± {rmse_lin.std():.4f}")

print("\n=== 10-fold CV: Decision Tree ===")
print(f"Mean R²:  {cv_r2_tree.mean():.4f} ± {cv_r2_tree.std():.4f}")
print(f"Mean RMSE:{rmse_tree.mean():.4f} ± {rmse_tree.std():.4f}")


=== 10-fold CV: Linear Regression ===
Mean R²:  0.9100 ± 0.0010
Mean RMSE:217.3216 ± 1.4561

=== 10-fold CV: Decision Tree ===
Mean R²:  0.9967 ± 0.0001
Mean RMSE:41.8765 ± 0.3276


In [18]:
# 8.2 CLASSIFICATION: Logistic Regression vs KNN

# === EDIT THIS to your actual classification target column ===
target_clf = "OrderStatus"  # e.g. "OrderStatus", "Status", etc.

# Drop rows with missing target
df_clf = df.dropna(subset=[target_clf]).copy()

In [34]:
# Reuse many of the same predictors; you can add/remove as needed
numeric_features_clf = ["Quantity", "UnitPrice", "Discount", "Tax", "ShippingCost", "TotalAmount"]
categorical_features_clf = ["Category", "Brand", "PaymentMethod", "City", "State", "Country"]

# Define X_clf and y_clf
X_clf = df_clf[numeric_features_clf + categorical_features_clf]
y_clf = df_clf[target_clf]

# Define X_clf_small and y_clf_small
X_clf_small = X_clf
y_clf_small = y_clf

print("Classification dataset shape:", X_clf.shape, y_clf.shape)
print("Class distribution:\n", y_clf.value_counts(normalize=True))

Classification dataset shape: (100000, 12) (100000,)
Class distribution:
 OrderStatus
Delivered    0.74628
Shipped      0.15192
Pending      0.04103
Returned     0.03049
Cancelled    0.03028
Name: proportion, dtype: float64


In [35]:
# Preprocessor for classification

numeric_transformer_clf = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer_clf = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor_clf = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_clf, numeric_features_clf),
        ("cat", categorical_transformer_clf, categorical_features_clf),
    ]
)

In [36]:
# Train and test split

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf_small, y_clf_small,
    test_size=0.3, random_state=42, stratify=y_clf_small
)

print("Train size (clf):", X_train_clf.shape, "Test size (clf):", X_test_clf.shape)

Train size (clf): (70000, 12) Test size (clf): (30000, 12)


In [37]:
# Logistic Regression & KNN Pipelines

logreg_model = Pipeline(
    steps=[
        ("preprocess", preprocessor_clf),
        ("model", LogisticRegression(
            multi_class="multinomial",
            max_iter=200,
            n_jobs=-1
        ))
    ]
)

knn_model = Pipeline(
    steps=[
        ("preprocess", preprocessor_clf),
        ("model", KNeighborsClassifier(n_neighbors=7))
    ]
)

In [38]:
# FIT MODELS & EVALUATE ACCURACY (8.2.1)

logreg_model.fit(X_train_clf, y_train_clf)
knn_model.fit(X_train_clf, y_train_clf)

y_pred_logreg = logreg_model.predict(X_test_clf)
y_pred_knn = knn_model.predict(X_test_clf)

acc_logreg = accuracy_score(y_test_clf, y_pred_logreg)
acc_knn = accuracy_score(y_test_clf, y_pred_knn)

print(f"\nLogistic Regression accuracy: {acc_logreg:.4f}")
print(f"KNN (k=7) accuracy:           {acc_knn:.4f}")




Logistic Regression accuracy: 0.7463
KNN (k=7) accuracy:           0.7337


In [46]:
new_order_clf = {
    "Quantity": 2,
    "UnitPrice": 499.99,
    "Discount": 50.00,
    "Tax": 80.00,
    "ShippingCost": 20.00,
    "Category": "Electronics",
    "Brand": "CoreTech",          # MUST exist in your dataset
    "PaymentMethod": "Credit Card",
    "City": "Austin",
    "State": "TX",
    "Country": "United States",
}

manual_total = (
    new_order_clf["Quantity"] * new_order_clf["UnitPrice"]
    - new_order_clf["Discount"]
    + new_order_clf["Tax"]
    + new_order_clf["ShippingCost"]
)

new_order_clf["TotalAmount"] = manual_total
new_order_clf_df = pd.DataFrame([new_order_clf])

print("\nNew classification record:")
print(new_order_clf_df)

pred_logreg_new = logreg_model.predict(new_order_clf_df)[0]
pred_knn_new = knn_model.predict(new_order_clf_df)[0]

print("\n=== New Order Prediction (Classification) ===")
print("Logistic Regression prediction:", pred_logreg_new)
print("KNN prediction:                ", pred_knn_new)


New classification record:
   Quantity  UnitPrice  Discount   Tax  ShippingCost     Category     Brand  \
0         2     499.99      50.0  80.0          20.0  Electronics  CoreTech   

  PaymentMethod    City State        Country  TotalAmount  
0   Credit Card  Austin    TX  United States      1049.98  

=== New Order Prediction (Classification) ===
Logistic Regression prediction: Delivered
KNN prediction:                 Delivered


In [45]:
# NEW RECORD PREDICTION (8.2.2)
# Reuse the same new_order and add TotalAmount

new_order_clf = new_order.copy()
new_order_clf["TotalAmount"] = pred_tree_new  # or compute manually

new_order_clf_df = pd.DataFrame([new_order_clf])

pred_logreg_new = logreg_model.predict(new_order_clf_df)[0]
pred_knn_new = knn_model.predict(new_order_clf_df)[0]

print("\n=== New Order Prediction (Classification) ===")
print("Logistic Regression prediction:", pred_logreg_new)
print("KNN prediction:                ", pred_knn_new)


=== New Order Prediction (Classification) ===
Logistic Regression prediction: Delivered
KNN prediction:                 Delivered


In [43]:
# NEW RECORD PREDICTION (8.2.2)
# Reuse the same new_order and add TotalAmount

new_order_clf = new_order.copy()
new_order_clf["TotalAmount"] = pred_tree_new  # or compute manually

new_order_clf_df = pd.DataFrame([new_order_clf])

pred_logreg_new = logreg_model.predict(new_order_clf_df)[0]
pred_knn_new = knn_model.predict(new_order_clf_df)[0]

print("\n=== New Order Prediction (Classification) ===")
print("Logistic Regression prediction:", pred_logreg_new)
print("KNN prediction:                ", pred_knn_new)


=== New Order Prediction (Classification) ===
Logistic Regression prediction: Delivered
KNN prediction:                 Delivered


In [42]:
# 5-FOLD CROSS-VALIDATION FOR CLASSIFICATION (8.2.3)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_acc_logreg = cross_val_score(
    logreg_model, X_clf_small, y_clf_small,
    cv=skf, scoring="accuracy"
)

cv_acc_knn = cross_val_score(
    knn_model, X_clf_small, y_clf_small,
    cv=skf, scoring="accuracy"
)

print("\n=== 5-fold CV: Logistic Regression ===")
print(f"Mean accuracy: {cv_acc_logreg.mean():.4f} ± {cv_acc_logreg.std():.4f}")

print("\n=== 5-fold CV: KNN (k=7) ===")
print(f"Mean accuracy: {cv_acc_knn.mean():.4f} ± {cv_acc_knn.std():.4f}")




=== 5-fold CV: Logistic Regression ===
Mean accuracy: 0.7463 ± 0.0000

=== 5-fold CV: KNN (k=7) ===
Mean accuracy: 0.7347 ± 0.0005
