In [37]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from imblearn.over_sampling import SMOTE
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate, false_positive_rate
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference

In [1]:
from read_and_clean import read_and_clean

In [15]:
# Load the orders table through the project's read_and_clean helper (imported in
# the previous cell). Later cells use the result as a pandas DataFrame.
# NOTE(review): what cleaning the helper performs is not visible from this notebook.
orders_df = read_and_clean()

In [18]:
# Print each column's dtype to see which columns are numeric, object (text)
# or datetime before choosing model features.
print(orders_df.dtypes)

Row ID                           int64
Order Priority                  object
Discount                       float64
Unit Price                     float64
Shipping Cost                  float64
Customer ID                      int64
Customer Name                   object
Ship Mode                       object
Customer Segment                object
Product Category                object
Product Sub-Category            object
Product Container               object
Product Name                    object
Product Base Margin            float64
Country                         object
Region                          object
State or Province               object
City                            object
Postal Code                      int64
Order Date              datetime64[ns]
Ship Date               datetime64[ns]
Profit                         float64
Quantity ordered new             int64
Sales                          float64
Order ID                         int64
Year                     

In [16]:
# Display the loaded DataFrame (the notebook renders the cell's last expression).
orders_df

Unnamed: 0,Row ID,Order Priority,Discount,Unit Price,Shipping Cost,Customer ID,Customer Name,Ship Mode,Customer Segment,Product Category,...,City,Postal Code,Order Date,Ship Date,Profit,Quantity ordered new,Sales,Order ID,Year,Month
0,20847,High,0.01,2.84,0.93,3,Bonnie Potter,Express Air,Corporate,Office Supplies,...,Anacortes,98221,2015-01-07,2015-01-08,4.5600,4,13.01,88522,2015,1
1,20228,Not Specified,0.02,500.98,26.00,5,Ronnie Proctor,Delivery Truck,Home Office,Furniture,...,San Gabriel,91776,2015-06-13,2015-06-15,4390.3665,12,6362.85,90193,2015,6
2,21776,Critical,0.06,9.48,7.29,11,Marcus Dunlap,Regular Air,Home Office,Furniture,...,Roselle,7203,2015-02-15,2015-02-17,-53.8096,22,211.15,90192,2015,2
3,24844,Medium,0.09,78.69,19.99,14,Gwendolyn F Tyson,Regular Air,Small Business,Furniture,...,Prior Lake,55372,2015-05-12,2015-05-14,803.4705,16,1164.45,86838,2015,5
4,24846,Medium,0.08,3.28,2.31,14,Gwendolyn F Tyson,Regular Air,Small Business,Office Supplies,...,Prior Lake,55372,2015-05-12,2015-05-13,-24.0300,7,22.23,86838,2015,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1947,19842,High,0.01,10.90,7.46,3397,Andrea Shaw,Regular Air,Small Business,Office Supplies,...,Danville,61832,2015-03-11,2015-03-12,-116.7600,18,207.31,87536,2015,3
1948,19843,High,0.10,7.99,5.03,3397,Andrea Shaw,Regular Air,Small Business,Technology,...,Danville,61832,2015-03-11,2015-03-12,-160.9520,22,143.12,87536,2015,3
1949,26208,Not Specified,0.08,11.97,5.81,3399,Marvin Reid,Regular Air,Small Business,Office Supplies,...,Des Plaines,60016,2015-03-29,2015-03-31,-41.8700,5,59.98,87534,2015,3
1950,24911,Medium,0.10,9.38,4.93,3400,Florence Gold,Express Air,Small Business,Furniture,...,Fairmont,26554,2015-04-04,2015-04-04,-24.7104,15,135.78,87537,2015,4


In [24]:

# Turn 'Profit' into a binary label: 1 for orders with a strictly positive
# profit, 0 for everything else. The label is stored on orders_df itself.
orders_df['Profitable'] = orders_df['Profit'].gt(0).astype(int)

# Target vector and feature frame (the raw profit and the label are both
# removed from the features).
y = orders_df['Profitable']
X = orders_df.drop(columns=['Profit', 'Profitable'])

# Keep only the numeric columns; text and datetime columns are left out
# instead of being encoded.
X_numeric = X.select_dtypes(include=[np.number])

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# Fill missing feature values with the per-column mean; the means are learned
# from the training split only and then applied to the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Baseline classifier for the binary 'Profitable' label.
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_imputed, y_train)

# Hard class predictions feed F1/precision/recall; the positive-class
# probability feeds ROC-AUC.
y_pred_logreg = logreg.predict(X_test_imputed)
y_proba_logreg = logreg.predict_proba(X_test_imputed)[:, 1]

logreg_reports = (
    ("F1", f1_score, y_pred_logreg),
    ("Precision", precision_score, y_pred_logreg),
    ("Recall", recall_score, y_pred_logreg),
    ("ROC-AUC", roc_auc_score, y_proba_logreg),
)
for metric_name, metric_fn, metric_input in logreg_reports:
    print(f"Logistic Regression {metric_name}:", metric_fn(y_test, metric_input))

Logistic Regression F1: 0.6714285714285714
Logistic Regression Precision: 0.6409090909090909
Logistic Regression Recall: 0.705
Logistic Regression ROC-AUC: 0.7036910994764398


In [29]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score



# Fit a gradient-boosted regressor directly on the 0/1 'Profitable' label,
# so its predictions are continuous scores rather than class labels.
xgb_reg = XGBRegressor(eval_metric='rmse', random_state=42)
xgb_reg.fit(X_train_imputed, y_train)

y_pred_xgb = xgb_reg.predict(X_test_imputed)

# Regression-style evaluation on the held-out split.
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_r2 = r2_score(y_test, y_pred_xgb)
print("XGBoost RMSE:", np.sqrt(xgb_mse))
print("XGBoost R2:", xgb_r2)

XGBoost RMSE: 0.42086882166212325
XGBoost R2: 0.29110217094421387


In [38]:


# Candidate regressors and the hyper-parameter grid to search for each one,
# keyed by a display name. Each entry holds the unfitted estimator under
# 'model' and its grid (parameter name -> list of values) under 'params'.
models_params_reg = dict(
    CatBoost=dict(
        model=CatBoostRegressor(random_state=42, verbose=0),
        params=dict(
            depth=[6, 7, 10],
            iterations=[100, 200, 250, 300],
            learning_rate=[0.01, 0.03, 0.05, 0.1],
            l2_leaf_reg=[5, 7, 9, 12, 15],
            border_count=[32, 64, 128, 256],
        ),
    ),
    GBM=dict(
        model=GradientBoostingRegressor(random_state=42),
        params=dict(
            n_estimators=[100, 200, 250, 300],
            learning_rate=[0.01, 0.05, 0.1],
            max_depth=[3, 5, 7],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    XGBoost=dict(
        model=XGBRegressor(random_state=42, eval_metric='rmse'),
        params=dict(
            n_estimators=[70, 100, 200, 300],
            learning_rate=[0.05, 0.1, 0.2],
            max_depth=[3, 5, 7, 10],
            subsample=[0.6, 0.7, 0.8, 1.0],
            colsample_bytree=[0.7, 0.8, 1.0],
            reg_alpha=[0, 0.1, 0.05, 1],
            reg_lambda=[1, 5, 10, 12],
        ),
    ),
    RandomForest=dict(
        model=RandomForestRegressor(random_state=42),
        params=dict(
            n_estimators=[200, 250, 300],
            max_depth=[None, 10, 15, 20, 30],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
            bootstrap=[True, False],
        ),
    ),
    AdaBoost=dict(
        model=AdaBoostRegressor(random_state=42),
        params=dict(
            n_estimators=[100, 150, 200, 250],
            learning_rate=[0.01, 0.1, 0.3, 0.5, 1.0],
        ),
    ),
)


In [None]:
def thresholded_f1(estimator, X, y, threshold=0.5):
    """Score a regressor on a 0/1 target with F1 after binarising its output.

    The candidates in ``models_params_reg`` are regressors, so ``predict``
    returns continuous values. The built-in ``'f1'`` scorer cannot handle
    continuous predictions, so this scorer cuts them at ``threshold`` first.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``.
    X : feature matrix to predict on.
    y : true 0/1 labels.
    threshold : predictions ``>= threshold`` count as class 1.

    Returns
    -------
    float
        F1 score of the thresholded predictions.
    """
    y_label = (np.asarray(estimator.predict(X)) >= threshold).astype(int)
    return f1_score(y, y_label)


results = []
best_models = {}

# Iterate over the grid dictionary defined above (models_params_reg).
for name, config in models_params_reg.items():
    # A callable scorer with signature (estimator, X, y) is used instead of the
    # string 'f1' because the estimators are regressors.
    grid = GridSearchCV(config['model'], config['params'], scoring=thresholded_f1, cv=3, n_jobs=-1)
    grid.fit(X_train_imputed, y_train)
    best = grid.best_estimator_
    best_models[name] = best

    # Continuous scores are kept for ROC-AUC (a ranking metric); the
    # thresholded labels are used for F1 / precision / recall.
    y_score = np.asarray(best.predict(X_test_imputed))
    y_pred = (y_score >= 0.5).astype(int)
    results.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "Test F1": f1_score(y_test, y_pred),
        "Test Precision": precision_score(y_test, y_pred),
        "Test Recall": recall_score(y_test, y_pred),
        "Test ROC-AUC": roc_auc_score(y_test, y_score)
    })

results_df = pd.DataFrame(results).sort_values(by="Test F1", ascending=False)
print("\n===  Model Selection Results ===")
print(results_df)