In [6]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Load data
df = pd.read_csv('../data/processed/final_merged_data.csv')

# Features and target
X = df.drop('is_late', axis=1)
y = df['is_late']

# Preprocessing
num_cols = ['shipping_days', 'estimated_days']
cat_cols = ['product_category_name', 'customer_state', 'seller_state']

preprocessor = ColumnTransformer([
    ('num', 'passthrough', num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True)  # SVM included with probability estimates for consistency
}

# Train and evaluate each model
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    print(f"\n🧠 Training: {name}")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"📈 Accuracy for {name}: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    
    results.append((name, acc))

# Summary comparison
comparison = pd.DataFrame(results, columns=["Model", "Accuracy"]).sort_values(by="Accuracy", ascending=False)
print("\n🔍 Model Accuracy Comparison:")
print(comparison)

# Save the trained model pipeline
joblib.dump(best_model, 'C:/Users/bhavi/Desktop/delivery-delay-prediction/models/train_model.pkl')



🧠 Training: Logistic Regression
📈 Accuracy for Logistic Regression: 0.9944
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20254
           1       0.98      0.95      0.96      1786

    accuracy                           0.99     22040
   macro avg       0.99      0.97      0.98     22040
weighted avg       0.99      0.99      0.99     22040


🧠 Training: Random Forest
📈 Accuracy for Random Forest: 0.9901
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     20254
           1       0.96      0.91      0.94      1786

    accuracy                           0.99     22040
   macro avg       0.98      0.95      0.97     22040
weighted avg       0.99      0.99      0.99     22040


🧠 Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📈 Accuracy for XGBoost: 0.9943
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20254
           1       0.98      0.95      0.96      1786

    accuracy                           0.99     22040
   macro avg       0.99      0.98      0.98     22040
weighted avg       0.99      0.99      0.99     22040


🧠 Training: KNN
📈 Accuracy for KNN: 0.9921
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20254
           1       0.96      0.94      0.95      1786

    accuracy                           0.99     22040
   macro avg       0.98      0.97      0.97     22040
weighted avg       0.99      0.99      0.99     22040


🧠 Training: SVM
📈 Accuracy for SVM: 0.9943
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20254
           1       0.98      0.95      0.96      1786

    accuracy                           0.99     22040
   macro 

['C:/Users/bhavi/Desktop/delivery-delay-prediction/models/train_model.pkl']