<a href="https://colab.research.google.com/github/Asakeblessing/Data-Science-Project/blob/main/FinalSupplier_Performance_Data_Science_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Preprocessing

In [None]:
# Libraries Import

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
#Install Packages

!pip install pandas numpy scikit-learn matplotlib seaborn


In [None]:
# Mount Google Drive at /content/drive (google.colab is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the raw supplier dataset into a DataFrame
dataset_path = "Supplier selection and performance Dataset.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows to check that the file parsed as expected
df.head()

Data Cleaning

In [None]:
# Parse both date columns as datetimes
for date_col in ('Order_Date', 'Delivery_Date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Impute missing delivery dates as: order date + mean observed delivery time (days)
observed_days = (df['Delivery_Date'] - df['Order_Date']).dt.days
avg_delivery_days = observed_days.mean()
estimated_delivery = df['Order_Date'] + pd.to_timedelta(avg_delivery_days, unit='D')
df['Delivery_Date'] = df['Delivery_Date'].fillna(estimated_delivery)

# Impute missing defect counts with the column mean
mean_defective_units = df['Defective_Units'].mean()
df['Defective_Units'] = df['Defective_Units'].fillna(mean_defective_units)

# Inspect dtypes and the remaining missing-value count per column
df.info()
df.isna().sum()


In [None]:
# Remove rows that duplicate an earlier row in every column; df is rebound to the result.
df = df.drop_duplicates()


KPI Engineering

In [None]:
# Cost saved per order: price reduction per unit times the units ordered
df['Cost_Savings'] = (df['Unit_Price'] - df['Negotiated_Price']) * df['Quantity']

# Whole days between placing the order and its delivery
df['Delivery_Duration'] = (df['Delivery_Date'] - df['Order_Date']).dt.days

# Share of ordered units that were defective, in percent.
# A zero Quantity is mapped to NaN first so the division yields NaN rather than
# +/-inf; NaN is skipped by mean() and removed by dropna(), whereas inf would
# propagate into the per-supplier averages used for ranking.
ordered_units = df['Quantity'].replace(0, np.nan)
df['Defect_Rate_Percent'] = (df['Defective_Units'] / ordered_units) * 100


Data Preparation for TOPSIS

In [None]:
# Per-order KPI columns that feed the TOPSIS ranking
topsis_df = df[['Supplier', 'Cost_Savings', 'Delivery_Duration', 'Defect_Rate_Percent']]

# One row per supplier: the mean of each KPI over that supplier's orders
supplier_means = topsis_df.groupby('Supplier').mean()
topsis_grouped = supplier_means.reset_index()


In [None]:
#Normalize and Weight the KPIs

In [None]:

# Rescale every KPI to the [0, 1] range with min-max scaling
kpi_matrix = topsis_grouped[['Cost_Savings', 'Delivery_Duration', 'Defect_Rate_Percent']]
scaler = MinMaxScaler()
normalized = scaler.fit(kpi_matrix).transform(kpi_matrix)

# Relative importance of each KPI, in the same column order as kpi_matrix
weights = np.array([0.4, 0.3, 0.3])
# Direction of each KPI: +1 for beneficial, -1 for non-beneficial
impacts = np.array([1, -1, -1])

# Weighted normalized matrix: each column multiplied by its weight
weighted = np.multiply(normalized, weights)



Ideal and Negative-Ideal Solutions

In [None]:
# Boolean masks selecting the beneficial and the non-beneficial KPI columns
is_benefit = impacts > 0
is_cost = impacts < 0

# Column-wise extremes of the weighted matrix
col_max = weighted.max(axis=0)
col_min = weighted.min(axis=0)

# Ideal point: highest value for beneficial KPIs, lowest for non-beneficial ones
ideal = col_max * is_benefit + col_min * is_cost
# Negative-ideal point: the opposite extreme for every KPI
negative_ideal = col_min * is_benefit + col_max * is_cost



Euclidean Distance Calculation

In [None]:
def _row_distances(matrix, reference):
    """Return the Euclidean distance from each row of `matrix` to `reference`."""
    return np.linalg.norm(matrix - reference, axis=1)


# Distance of every supplier to the ideal and to the negative-ideal point
dist_to_ideal = _row_distances(weighted, ideal)
dist_to_negative_ideal = _row_distances(weighted, negative_ideal)


TOPSIS Score

In [None]:
# Relative closeness: the share of a supplier's total distance that separates it
# from the negative-ideal point (a larger value means closer to the ideal)
total_distance = dist_to_ideal + dist_to_negative_ideal
topsis_scores = np.divide(dist_to_negative_ideal, total_distance)


Ranking For TOPSIS Score

In [None]:
# Attach the scores to the per-supplier table
topsis_grouped['TOPSIS_Score'] = topsis_scores

# Rank suppliers so that the highest score receives rank 1
score_column = topsis_grouped['TOPSIS_Score']
topsis_grouped['Rank'] = score_column.rank(ascending=False)

# Best-ranked suppliers first
topsis_ranked = topsis_grouped.sort_values('Rank')


In [None]:
# Visualize TOPSIS scores as a horizontal bar chart

# Ascending order: barh draws rows bottom-up, so the best score ends up on top
topsis_ranked_sorted = topsis_ranked.sort_values(by='TOPSIS_Score', ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    y=topsis_ranked_sorted['Supplier'],
    width=topsis_ranked_sorted['TOPSIS_Score'],
    color='mediumseagreen',
)
ax.set_xlabel('TOPSIS Score')
ax.set_title('Supplier Ranking Based on TOPSIS Score')
fig.tight_layout()
plt.show()

In [None]:
# Final ranking: show the ranked table with a fresh 0-based index.
# The result is only displayed; it is not assigned back to topsis_ranked.

topsis_ranked.reset_index(drop=True)

In [None]:
# Split suppliers into three equal-sized groups (tertiles) of TOPSIS score and
# name the groups from the lowest-scoring to the highest-scoring
tertile_labels = ['Low', 'Medium', 'High']
performance_class = pd.qcut(topsis_ranked['TOPSIS_Score'], q=3, labels=tertile_labels)
topsis_ranked['Performance_Class'] = performance_class


In [None]:
# Pie chart of how many suppliers fall into each performance class.
# The counts are put into a fixed Low -> Medium -> High order and each wedge
# looks up its colour by class name. value_counts() on its own orders the
# classes by frequency, so a positional colour list could paint e.g. "High"
# in the colour intended for "Low".
class_colors = {'Low': 'lightcoral', 'Medium': 'gold', 'High': 'mediumseagreen'}
class_counts = (
    topsis_ranked['Performance_Class']
    .astype(str)
    .value_counts()
    .reindex(list(class_colors), fill_value=0)
)

plt.figure(figsize=(6, 6))
plt.pie(
    class_counts.values,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=[class_colors[label] for label in class_counts.index],
)
plt.title('Supplier Performance Class Distribution')
plt.tight_layout()
plt.show()

In [None]:
# Supplier / class table, ordered by performance class
supplier_classes = topsis_ranked[['Supplier', 'Performance_Class']]
grouped_suppliers = supplier_classes.sort_values('Performance_Class')
grouped_suppliers


In [None]:
# Export the ranked TOPSIS table to CSV in the working directory, without the index column.
topsis_ranked.to_csv("TOPSIS_final_ranking.csv", index=False)


Machine Learning Pipeline Using All 3 KPIs for Performance Class (Independent of TOPSIS)

In [None]:
# Independent copy of the order-level KPI columns for the ML pipeline,
# so later edits to kpi_df do not touch df
ml_columns = ['Supplier', 'Cost_Savings', 'Delivery_Duration', 'Defect_Rate_Percent']
kpi_df = df[ml_columns].copy()


In [None]:
# Drop rows with missing or invalid KPI values.
# np.isfinite is False for NaN as well as for +/-inf, so this also removes the
# infinite values (e.g. from a division by zero) that dropna() would keep.
kpi_columns = ['Cost_Savings', 'Delivery_Duration', 'Defect_Rate_Percent']
has_valid_kpis = np.isfinite(kpi_df[kpi_columns]).all(axis=1)

# Rebind instead of mutating in place; .copy() keeps later column assignments
# on kpi_df free of chained-assignment warnings.
kpi_df = kpi_df.loc[has_valid_kpis].copy()


Binary performance label using all 3 KPIs: Cost_Savings, Delivery_Duration and Defect_Rate_Percent

In [None]:
# Define binary performance label
def classify_binary(row, min_cost_savings=3500, max_delivery_days=8, max_defect_rate=7):
    """Label one order as high-performing (1) or not (0).

    An order is high-performing only when all three KPI rules hold at once.
    All comparisons are strict, so a value equal to a threshold fails its rule.
    A NaN KPI fails its comparison and therefore yields 0.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'Cost_Savings', 'Delivery_Duration' and
        'Defect_Rate_Percent' (for example a DataFrame row or a dict).
    min_cost_savings : number, default 3500
        Cost savings must be strictly greater than this value.
    max_delivery_days : number, default 8
        Delivery duration must be strictly less than this value.
    max_defect_rate : number, default 7
        Defect rate (percent) must be strictly less than this value.

    Returns
    -------
    int
        1 when every rule holds, otherwise 0.
    """
    meets_all_rules = (
        row['Cost_Savings'] > min_cost_savings
        and row['Delivery_Duration'] < max_delivery_days
        and row['Defect_Rate_Percent'] < max_defect_rate
    )
    return 1 if meets_all_rules else 0

# Label every order: axis='columns' passes each row to classify_binary
row_labels = kpi_df.apply(classify_binary, axis='columns')
kpi_df['Performance_Binary'] = row_labels


In [None]:
# Show the column names currently present in kpi_df.
print(kpi_df.columns)

In [None]:
# Preview the three KPI columns next to the binary label, first five rows only.
kpi_df[['Cost_Savings', 'Delivery_Duration', 'Defect_Rate_Percent', 'Performance_Binary']].head()


Features and Binary Labels

In [None]:
# Feature matrix: the three KPI columns
feature_columns = ['Cost_Savings', 'Delivery_Duration', 'Defect_Rate_Percent']
X = kpi_df[feature_columns]

# Target vector: the binary performance label
y = kpi_df['Performance_Binary']

Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both parts; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Define Regularized Models With Class Balancing

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from collections import Counter

# Negative-to-positive class ratio, intended for XGBoost's scale_pos_weight.
# Counted on the training labels only, so the held-out test labels play no part
# in configuring the model. Falls back to 1.0 (no re-weighting) when the
# training labels contain no positive class instead of dividing by zero.
class_counts = Counter(y_train)
scale_ratio = class_counts[0] / class_counts[1] if class_counts[1] else 1.0


ML Model Training with Regularization and Class Weights

In [None]:
# Logistic regression: L2 penalty with a small C (stronger regularization)
# and balanced class weights
logistic_params = {
    'penalty': 'l2',
    'C': 0.05,
    'solver': 'liblinear',
    'class_weight': 'balanced',
    'max_iter': 1000,
    'random_state': 42,
}
logistic_model = LogisticRegression(**logistic_params)


# Random forest: deliberately small and shallow to limit overfitting
rf_params = {
    'n_estimators': 30,            # fewer trees to reduce overfitting
    'max_depth': 2,                # very shallow trees (cannot memorize)
    'min_samples_split': 10,
    'min_samples_leaf': 10,        # prevent learning from noise
    'max_features': 'sqrt',        # use sqrt(n_features) at each split
    'bootstrap': True,             # ensure sampling randomness
    'class_weight': 'balanced',    # helps handle class imbalance
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

# Gradient-boosted trees, heavily regularized. The positive class is re-weighted
# with scale_ratio (the negative/positive ratio computed from the labels) rather
# than a hard-coded constant, so the weight follows the actual class balance.
xgb_model = XGBClassifier(
    n_estimators=50,               # moderate number of boosting rounds: learn patterns without overfitting
    max_depth=2,                   # shallow trees: favour simple, general rules over memorising noise
    learning_rate=0.01,            # small step size for stable, gradual learning
    subsample=0.6,                 # each tree sees 60% of the rows, which increases diversity
    colsample_bytree=0.6,          # each tree sees 60% of the features, which reduces overfitting
    reg_lambda=10,                 # L2 regularization
    reg_alpha=5,                   # L1 regularization
    scale_pos_weight=scale_ratio,  # balance class distribution
    eval_metric='logloss',
    random_state=42
)


# Model dictionary: display name -> estimator, in evaluation order
models = {}
models["Logistic Regression"] = logistic_model
models["Random Forest"] = rf_model
models["XGBoost"] = xgb_model

Evaluation and Cross-Validation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Class ids and their display names, pinned so that the confusion matrix is
# always 2x2 and the report names line up with the right class.
class_ids = [0, 1]
class_names = ['Low', 'High']

# Shuffled, seeded 5-fold splitter shared by every cross-validation below
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reference point: a random forest with default settings. It is seeded so the
# score is reproducible and has its own name so the loop variable `model`
# below does not overwrite it.
baseline_model = RandomForestClassifier(random_state=42)
scores = cross_val_score(baseline_model, X, y, scoring='f1_macro', cv=skf)
print("Stratified 5-Fold F1 Macro:", scores.mean())

for name, model in models.items():
    print(f"\n {name} Evaluation")

    # Fit on the training split and predict the held-out test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Confusion Matrix (rows = actual class, columns = predicted class)
    print(" Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=class_ids))

    # Classification Report
    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

    # Cross-validated F1 macro on the full data set; cross_val_score fits
    # clones, so the model fitted above is left untouched
    scores = cross_val_score(model, X, y, cv=skf, scoring='f1_macro')
    print(f" Stratified 5-Fold F1 Macro Avg: {scores.mean():.4f}")


Confusion Matrices

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrices computed from the fitted models on the held-out test set,
# so the figure always reflects the current run instead of numbers typed in
# from an earlier one. The three estimators must already have been fitted.
cm_logistic = confusion_matrix(y_test, logistic_model.predict(X_test), labels=[0, 1])
cm_rf = confusion_matrix(y_test, rf_model.predict(X_test), labels=[0, 1])
cm_xgb = confusion_matrix(y_test, xgb_model.predict(X_test), labels=[0, 1])

# Model titles, in the same order as the matrices
model_titles = ['Logistic Regression', 'Random Forest', 'XGBoost']
cms = [cm_logistic, cm_rf, cm_xgb]

# Plot confusion matrices side-by-side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, cm, title in zip(axes, cms, model_titles):
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Low', 'High'])
    disp.plot(ax=ax, cmap='Blues', colorbar=False)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()


Best Performing Models

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

# Cross-validated F1 macro per model, computed with the shared splitter `skf`
# so the chart always reflects the current data and model settings rather than
# scores typed in from an earlier run.
model_scores = {
    model_name: cross_val_score(estimator, X, y, cv=skf, scoring='f1_macro').mean()
    for model_name, estimator in models.items()
}

# Convert to DataFrame
score_df = pd.DataFrame(list(model_scores.items()), columns=['Model', 'F1 Macro Score'])

# Plot F1 Macro Scores
plt.figure(figsize=(9, 5))
bars = plt.bar(score_df['Model'], score_df['F1 Macro Score'], color=['#4c78a8', '#72b7b2', '#f58518'])
plt.title('Model Performance Comparison (F1 Macro Score)', fontsize=14)

plt.ylim(0, 1)
plt.ylabel('F1 Macro Score')
plt.grid(axis='y', linestyle='--', alpha=0.6)

# Annotate bars with values, centred just above each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f'{yval:.4f}', ha='center', fontsize=11)

plt.tight_layout()
plt.show()


In [None]:
# Write kpi_df to CSV in the working directory, without the index column
kpi_df.to_csv("final_supplier_results.csv", index=False)

# Hand the file to the browser as a download (google.colab is Colab-only)
from google.colab import files
files.download("final_supplier_results.csv")

Supplier Predictions across Models

In [None]:
import os, joblib
import pandas as pd

# Configuration
# Feature columns, in the order they are passed to the models
FEATURES = [
    "Cost_Savings",
    "Delivery_Duration",
    "Defect_Rate_Percent",
]
# Probability cut-off per model: a row is predicted as class 1 at or above it
THRESHOLDS = dict.fromkeys(("LogisticRegression", "RandomForest", "XGBoost"), 0.5)
# Pickle file looked up for each model when models are loaded from disk
PKL_PATHS = dict(
    LogisticRegression="lr_model.pkl",
    RandomForest="rf_model.pkl",
    XGBoost="xgb_model.pkl",
)

# Rows to score.
# A df_all that already exists is used as-is. Otherwise the cleaned order-level
# KPI table (kpi_df) is scored, so supplier names match the rest of the analysis
# and can be joined with the TOPSIS ranking. The five-row demo frame is only a
# last resort for running this cell without any data loaded.
if "df_all" not in globals():
    if "kpi_df" in globals():
        df_all = kpi_df[["Supplier"] + FEATURES].copy()
    else:
        df_all = pd.DataFrame({
            "Supplier":["Alpha_Inc","Beta_Supplies","Gamma_Co","Delta_Logistics","Epsilon_Group"],
            "Cost_Savings":[4000,3600,3100,2900,3700],
            "Delivery_Duration":[7,6,5,10,9],
            "Defect_Rate_Percent":[5.5,4.2,3.8,8.1,6.0]
        })

#  checks/casting
# Fail early when an expected column is absent
req = ["Supplier"] + FEATURES
missing = [c for c in req if c not in df_all.columns]
if missing:
    raise ValueError(f"df_all must contain: {missing}")

# Coerce to numeric first, then impute from the coerced columns. Taking the
# median before coercion skips text-typed columns, which would leave the NaNs
# created by errors="coerce" unfilled.
numeric_features = df_all[FEATURES].apply(pd.to_numeric, errors="coerce")
df_all[FEATURES] = numeric_features.fillna(numeric_features.median())

# Models: reuse the estimators already in memory, otherwise load the pickles
if "models" in globals():
    print("Using models from memory.")
else:
    print("Loading models from disk...")
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only load .pkl files that come from a trusted source.
    available_paths = {key: pkl for key, pkl in PKL_PATHS.items() if os.path.exists(pkl)}
    models = {key: joblib.load(pkl) for key, pkl in available_paths.items()}
    if not models:
        raise RuntimeError("No models provided and no .pkl files found.")

# Scoring needs class probabilities, so every model must offer predict_proba
for n, m in models.items():
    if not hasattr(m, "predict_proba"):
        raise AttributeError(f"{n} lacks predict_proba().")

#  Row-level scoring
def score_rows(df, model_dict, feats=None, thr=None):
    """Score every row of `df` with every model and stack the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Supplier" column and every column named in `feats`.
    model_dict : dict
        Maps a model name to a fitted estimator exposing predict_proba().
    feats : list of str, optional
        Feature columns passed to the models. Defaults to the module-level
        FEATURES, looked up at call time.
    thr : dict, optional
        Maps a model name to its probability cut-off. Defaults to the
        module-level THRESHOLDS, looked up at call time. A name is tried as
        given and then with spaces removed ("Random Forest" -> "RandomForest"),
        so both naming styles used in this notebook find their entry; names
        with no entry use 0.5.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, model) with columns "Supplier", "Model",
        "Prob_High" (probability from column 1 of predict_proba) and "Pred"
        (1 when Prob_High >= cut-off, else 0). Empty, with the same columns,
        when `model_dict` is empty.
    """
    feats = FEATURES if feats is None else feats
    thr = THRESHOLDS if thr is None else thr

    X = df[feats].copy()
    per_model = []
    for name, model in model_dict.items():
        prob_high = model.predict_proba(X)[:, 1]
        cutoff = thr.get(name, thr.get(str(name).replace(" ", ""), 0.5))
        per_model.append(pd.DataFrame({
            "Supplier": df["Supplier"],
            "Model": name,
            "Prob_High": prob_high,
            "Pred": (prob_high >= cutoff).astype(int),
        }))

    # pd.concat raises on an empty list; return an empty, well-formed frame instead
    if not per_model:
        return pd.DataFrame(columns=["Supplier", "Model", "Prob_High", "Pred"])
    return pd.concat(per_model, ignore_index=True)

# Score every row with every model
row_preds = score_rows(df_all, models)

# Supplier aggregation: mean probability and share of rows predicted as class 1,
# per model and supplier
by_model_supplier = row_preds.groupby(["Model", "Supplier"], as_index=False)
sup_agg = by_model_supplier.agg(
    Prob_High_mean=("Prob_High", "mean"),
    High_rate=("Pred", "mean"),
)

# Ranking within each model: highest mean probability gets rank 1 (dense ranks)
dense_rank = sup_agg.groupby("Model")["Prob_High_mean"].rank(ascending=False, method="dense")
sup_agg["Model_Rank"] = dense_rank.astype(int)


def _supplier_by_model(value_column):
    """Pivot sup_agg to one row per supplier with one `value_column` column per model."""
    wide = sup_agg.pivot(index="Supplier", columns="Model", values=value_column)
    return wide.rename_axis(None, axis=1).reset_index()


# Wide tables joined on supplier; the suffixes tell rank and probability columns apart
rank_wide = _supplier_by_model("Model_Rank")
prob_wide = _supplier_by_model("Prob_High_mean")
comparison = rank_wide.merge(prob_wide, on="Supplier", suffixes=("_Rank", "_Prob"))

print("\n=== Supplier ranking & mean probability by model (no TOPSIS) ===")
print(comparison.to_string(index=False))

# Persist both the wide comparison and the long per-model scores
comparison.to_csv("supplier_model_comparison_no_topsis.csv", index=False)
sup_agg.to_csv("supplier_model_scores_long_no_topsis.csv", index=False)


In [None]:
# 5) Visualize ranks across models

# One group of bars per supplier, one bar per model rank column
rank_cols = [col for col in comparison.columns if col.endswith("_Rank")]
if not rank_cols:
    print("No rank columns to plot.")
else:
    rank_table = comparison.set_index("Supplier")[rank_cols]
    ax = rank_table.plot(kind="bar", figsize=(12,6))
    ax.set_title("Supplier Ranks by Model (LogReg, RF, XGB)")
    ax.set_ylabel("Rank (1 = best)")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

TOPSIS vs. Machine Learning





In [None]:
# Visualize XGBoost vs TOPSIS ranking
# Inputs: `comparison` (per-model ML ranks) and `topsis_grouped` (TOPSIS ranks)

# Merge ML comparison with TOPSIS ranks; the inner join keeps only suppliers
# present in both tables
topsis = topsis_grouped[["Supplier", "Rank"]].rename(columns={"Rank": "TOPSIS_Rank"})
comp_with_topsis = comparison.merge(topsis, on="Supplier", how="inner")

# Plot XGBoost vs TOPSIS rank side by side
target_model = "XGBoost"
if f"{target_model}_Rank" not in comp_with_topsis.columns:
    print(f"No {target_model}_Rank column found in comparison table.")
elif comp_with_topsis.empty:
    # Without this check an empty join would silently draw a blank chart
    print("No supplier appears in both the ML comparison and the TOPSIS table; nothing to plot.")
else:
    plt.figure(figsize=(12,6))
    idx = np.arange(len(comp_with_topsis))
    bar_w = 0.4
    # Two bars per supplier: the ML rank at idx, the TOPSIS rank shifted right by one bar width
    plt.bar(idx, comp_with_topsis[f"{target_model}_Rank"], bar_w, label=f'{target_model} Rank')
    plt.bar(idx+bar_w, comp_with_topsis["TOPSIS_Rank"], bar_w, label='TOPSIS Rank')
    # Centre each supplier label between its two bars
    plt.xticks(idx+bar_w/2, comp_with_topsis["Supplier"], rotation=45, ha="right")
    plt.ylabel("Rank (1 = best)")
    plt.title(f"Supplier Rankings: {target_model} vs TOPSIS")
    plt.legend()
    plt.tight_layout()
    plt.show()

Power BI Deployment Pipeline

In [None]:
import os, json, joblib
import pandas as pd
import numpy as np

# File paths
MODEL_FILE = "xgb_model.pkl"
META_FILE  = "model_meta.json"
OUTPUT_FILE = "suppliers_scored.csv"    # predictions output

# The KPI data scored below is the `df` already in memory; no input file is read.

# Load model: prefer the saved file, otherwise fall back to the xgb_model held
# in memory, so the cell also works when no model has been written to disk.
if os.path.exists(MODEL_FILE):
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only load a model file that comes from a trusted source.
    model = joblib.load(MODEL_FILE)
elif "xgb_model" in globals():
    print(f"'{MODEL_FILE}' not found; using the in-memory xgb_model instead.")
    model = xgb_model
else:
    raise FileNotFoundError(
        f"Model file '{MODEL_FILE}' not found and no in-memory xgb_model is available."
    )

# Load metadata: optional, because every value read from it has a default
if os.path.exists(META_FILE):
    with open(META_FILE, "r") as f:
        meta = json.load(f)
else:
    print(f"'{META_FILE}' not found; using default feature order and thresholds.")
    meta = {}


required = ["Cost_Savings", "Delivery_Duration", "Defect_Rate_Percent"]
# Column order expected by the model; a missing or null entry falls back to `required`
feat_order = meta.get("feature_order") or required

# Rule thresholds: start from the defaults and let the metadata override single
# entries, so a partial or null "thresholds" object cannot cause a KeyError
# when the rule-based label is computed.
default_thr = {"cost_savings": 3500, "delivery_duration": 8, "defect_rate": 7}
thr = {**default_thr, **(meta.get("thresholds") or {})}

# Fail early, naming every absent column at once
needed_columns = list(dict.fromkeys(list(required) + list(feat_order)))
absent = [c for c in needed_columns if c not in df.columns]
if absent:
    raise KeyError(f"df is missing column(s) needed for scoring: {absent}")

#  numeric: coerce the KPI columns, then fill gaps with each column's median
for c in required:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df[required] = df[required].fillna(df[required].median(numeric_only=True))

# Prediction
X = df[feat_order]
df["ML_Pred"] = model.predict(X).astype(int)

# Probability of class 1 (column 1 of predict_proba), also as a rounded percent.
# Only a model without predict_proba gets NaN; a genuine failure inside
# predict_proba now surfaces instead of being silently replaced by NaN.
if hasattr(model, "predict_proba"):
    df["Prob_High"] = model.predict_proba(X)[:, 1]
    df["Prob_High_Pct"] = (df["Prob_High"] * 100).round(1)
else:
    df["Prob_High"] = np.nan
    df["Prob_High_Pct"] = np.nan


# Human-readable label for the predicted class
df["ML_Pred_Label"] = np.where(df["ML_Pred"] == 1, "High", "Low")

# Rule-based label for comparison: 1 only when all three KPI rules hold
saves_enough = df["Cost_Savings"] > thr["cost_savings"]
fast_enough = df["Delivery_Duration"] < thr["delivery_duration"]
clean_enough = df["Defect_Rate_Percent"] < thr["defect_rate"]
df["Rule_Based"] = (saves_enough & fast_enough & clean_enough).astype(int)

# Save predictions (all columns of df, without the index)
df.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Predictions saved to {OUTPUT_FILE}")

In [None]:
# Hand the scored CSV to the browser as a download (google.colab is Colab-only).
from google.colab import files
files.download("suppliers_scored.csv")
