<a href="https://colab.research.google.com/github/Abhi-nand-shaji/Optimal_Transport_Mode_Selector/blob/main/SER_ML_(Final).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV under MyDrive can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Step 1: Load the sheet and give its columns stable names ---
# Expected layout: seven descriptive fields, then Distance/Time/Cost/Carbon for
# each of the four transport modes, then two trailing columns that are dropped.
_modes = ["Rail", "Road", "Air", "Hyperloop"]
_column_names = (
    ["Perishable", "Commodity", "Category", "Source", "Destination", "Density", "Demand"]
    + [f"{metric}_{mode}" for metric in ["Distance", "Time", "Cost", "Carbon"] for mode in _modes]
    + ["Unused1", "Unused2"]
)

# skiprows=1 discards the first line of the file; the next line is parsed as the
# header and then replaced by the names above.
df = pd.read_csv("/content/drive/MyDrive/ML data/DataforML.csv", skiprows=1)
df.columns = _column_names
df = df.drop(columns=["Unused1", "Unused2"])

In [None]:
# Columns that must be numeric: every per-mode metric plus the shipment attributes.
cols_to_convert = [
    f"{metric}_{mode}"
    for metric in ["Distance", "Time", "Cost", "Carbon"]
    for mode in ["Rail", "Road", "Air", "Hyperloop"]
] + ["Demand", "Density", "Perishable"]

# Coerce all of them in one pass; cells that cannot be parsed become NaN.
df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Drop rows with a NaN in any column (not only the converted ones).
df.dropna(inplace=True)

In [None]:
# Min-max normalise each Cost/Time/Carbon column independently to [0, 1].
from sklearn.preprocessing import MinMaxScaler

scaler_minmax = MinMaxScaler()
_metric_columns = [
    f"{metric}_{mode}"
    for metric in ["Cost", "Time", "Carbon"]
    for mode in ["Rail", "Road", "Air", "Hyperloop"]
]

# The single scaler is refit for every column, so once the loop ends it only
# remembers the last column; the unscaled values in df are overwritten in place.
for _column in _metric_columns:
    df[_column] = scaler_minmax.fit_transform(df[[_column]])

In [None]:
# Preview the first 10 rows after cleaning and normalisation.
print(df.head(10))

   Perishable              Commodity        Category   Source Destination  \
0         1.0                   Milk            FMCG  Chennai   Bangalore   
1         1.0        Packaged Snacks            FMCG  Chennai   Bangalore   
2         1.0                   Meat            FMCG  Chennai   Bangalore   
3         1.0               Seafoods            FMCG  Chennai   Bangalore   
4         1.0                Pickles            FMCG  Chennai   Bangalore   
5         1.0           Leafy Greens  fruits/veggies  Chennai   Bangalore   
6         1.0              Mushrooms  fruits/veggies  Chennai   Bangalore   
7         1.0             Cut Fruits  fruits/veggies  Chennai   Bangalore   
8         1.0               Vaccines          pharma  Chennai   Bangalore   
9         1.0  Rapid Diagnostic Kits          pharma  Chennai   Bangalore   

   Density  Demand  Distance_Rail  Distance_Road  Distance_Air  ...  Time_Air  \
0   1030.0     9.0          347.0          326.0         284.0  ...  0.

In [None]:
# Compute weighted scores for each mode
def get_adjusted_weights(demand_0_to_10, perishable):
    """Return the (cost, carbon, time) weights for one shipment.

    The time weight starts at 0.281, gains 0.1 for perishable goods and up to
    0.2 more with demand (demand / 10, clamped to [0, 1]). The carbon weight
    starts at 0.267 and gains 0.05 for perishable goods. Cost receives whatever
    is left, so the three weights add up to 1 (up to float rounding).

    Args:
        demand_0_to_10: Demand level; values outside 0-10 are clamped.
        perishable: Truthy when the commodity is perishable.

    Returns:
        Tuple ``(w_cost, w_carbon, w_time)``.
    """
    time_base, carbon_base = 0.281, 0.267
    perish_bonus = 0.1 if perishable else 0.0
    demand_share = min(max(demand_0_to_10 / 10.0, 0), 1)

    w_time = time_base + perish_bonus + demand_share * 0.2
    w_carbon = carbon_base + 0.5 * perish_bonus
    # Cost is the remainder rather than an independent constant.
    w_cost = 1.0 - w_time - w_carbon
    return w_cost, w_carbon, w_time

# One (w_cost, w_carbon, w_time) triple per row, derived from Demand and Perishable.
_row_weights = pd.DataFrame(
    [get_adjusted_weights(d, p) for d, p in zip(df["Demand"], df["Perishable"])],
    index=df.index,
    columns=["w_cost", "w_carbon", "w_time"],
)

# Weighted sum of the normalised metrics, evaluated column-wise for each mode
# instead of cell by cell.
for mode in ["Rail", "Road", "Air", "Hyperloop"]:
    df[f"Score_{mode}"] = (
        _row_weights["w_cost"] * df[f"Cost_{mode}"]
        + _row_weights["w_carbon"] * df[f"Carbon_{mode}"]
        + _row_weights["w_time"] * df[f"Time_{mode}"]
    )

In [None]:
# Define best mode: the mode with the lowest weighted score wins.
_score_columns = [f"Score_{m}" for m in ("Rail", "Road", "Air", "Hyperloop")]
_winning_column = df[_score_columns].idxmin(axis=1)

# "Score_Rail" -> "rail", "Score_Hyperloop" -> "hyperloop", ...
df["Best_Mode"] = _winning_column.str[len("Score_"):].str.lower()

In [None]:
# Feature selection: the normalised per-mode metrics followed by the shipment attributes.
feature_cols = [
    f"{metric}_{mode}"
    for metric in ("Cost", "Time", "Carbon")
    for mode in ("Rail", "Road", "Air", "Hyperloop")
]
feature_cols += ["Demand", "Density", "Perishable"]

# y is the rule-derived label for the matching row of X.
X, y = df[feature_cols], df["Best_Mode"]

In [None]:
# Ensure all features are numeric and drop any row that fails to parse.
X = X.apply(pd.to_numeric, errors='coerce').dropna()
y = y.loc[X.index]

# Encode labels (class ids follow the sorted order in label_encoder.classes_).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train/test split on the UNSCALED features first, so held-out rows cannot
# influence the scaler statistics (fitting the scaler on all rows leaks
# test-set information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)

# Scale features: learn mean/std from the training rows only, then apply the
# same transform to both partitions. X_train / X_test stay scaled arrays, as
# the cells below expect.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility: the whole feature matrix under the
# train-fitted scaler.
X_scaled = scaler.transform(X)


In [None]:
# Final check before training.
# NOTE(review): in cell order this runs after the scaler was already fit, so the
# "before scaling" wording in the message describes X, not the pipeline stage.
print("Checking for NaNs in X before scaling...")
_nan_counts = X.isna().sum()
print(_nan_counts)

# Defensive cleanup: remove any row that still contains a NaN.
X = X.dropna()
y = y.loc[X.index]  # Ensure y aligns with cleaned X

Checking for NaNs in X before scaling...
Cost_Rail           0
Cost_Road           0
Cost_Air            0
Cost_Hyperloop      0
Time_Rail           0
Time_Road           0
Time_Air            0
Time_Hyperloop      0
Carbon_Rail         0
Carbon_Road         0
Carbon_Air          0
Carbon_Hyperloop    0
Demand              0
Density             0
Perishable          0
dtype: int64


In [None]:
# Row count of the cleaned frame (training and test rows together).
print(f"Final row count: {len(df)}")

Final row count: 2496


In [None]:
# Define and train model: feed-forward network with two hidden layers (400 and 300 units).
mlffnn = MLPClassifier(
    random_state=42,
    max_iter=1000,
    alpha=0.0005,
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(400, 300),
).fit(X_train, y_train)

# Evaluate on the held-out rows; target_names turns class ids back into mode names.
y_pred = mlffnn.predict(X_test)
print("\n📊 MLFFNN Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=label_encoder.classes_,
    )
)




📊 MLFFNN Classification Report:
              precision    recall  f1-score   support

         air       0.80      1.00      0.89         4
   hyperloop       1.00      0.98      0.99       555
        rail       0.96      0.99      0.98       182
        road       1.00      1.00      1.00         8

    accuracy                           0.99       749
   macro avg       0.94      0.99      0.96       749
weighted avg       0.99      0.99      0.99       749



In [None]:


# Undo the standard scaling so the exported features are the values that were
# fed to the scaler.
X_original = scaler.inverse_transform(X_test)
original_features_df = pd.DataFrame(X_original, columns=feature_cols)

# X_test is a bare array with no row labels, so recover the labels of the test
# partition by repeating the split (same size, stratification and seed) on X's
# index. Selecting with X.index instead would return every row, not the test rows.
_, test_index = train_test_split(
    X.index.to_numpy(), test_size=0.3, stratify=y_encoded, random_state=42
)
if len(test_index) != len(X_test):
    raise ValueError(
        "Recomputed test split does not match X_test; re-run the train/test split cell first."
    )

# Get source and destination columns corresponding to the test set.
source_dest_df = df.loc[test_index, ["Source", "Destination"]].reset_index(drop=True)

# Decode class ids back to mode names and give the column a header.
results_df = pd.DataFrame({"Predicted_Mode": label_encoder.inverse_transform(y_pred)})

# Combine all parts (all three now have one row per test sample, in the same order).
final_df = pd.concat([
    original_features_df.reset_index(drop=True),
    source_dest_df,
    results_df
], axis=1)

# Save to CSV
final_df.to_csv("ml_predictions_with_originals.csv", index=False)

print("\n✅ ML predictions with original features saved to 'ml_predictions_with_originals.csv'")




✅ ML predictions with original features saved to 'ml_predictions_with_originals.csv'


In [None]:
# One MinMaxScaler per metric family, each fit on that family's four mode columns.
# NOTE(review): df's Cost/Time/Carbon columns were already min-max scaled by the
# earlier normalisation cell, so these scalers are fit on normalised values
# rather than raw ones — confirm that is intended.
cost_scaler, time_scaler, carbon_scaler = (
    MinMaxScaler().fit(
        df[[f"{metric}_{mode}" for mode in ("Rail", "Road", "Air", "Hyperloop")]]
    )
    for metric in ("Cost", "Time", "Carbon")
)


In [None]:
import joblib
# Persist the three fitted scalers to the working directory; a later cell
# reloads them with joblib.load under the same file names.
joblib.dump(cost_scaler, "cost_scaler.pkl")
joblib.dump(time_scaler, "time_scaler.pkl")
joblib.dump(carbon_scaler, "carbon_scaler.pkl")


['carbon_scaler.pkl']

In [None]:
# Normalize Cost, Time, Carbon
from sklearn.preprocessing import MinMaxScaler

# 👇 Define these outside so we can reuse them later.
# Bind fresh scalers to the names themselves; assigning to `<name>.pkl` would
# only attach an unused attribute to the existing objects.
cost_scaler = MinMaxScaler()
time_scaler = MinMaxScaler()
carbon_scaler = MinMaxScaler()

# The loop variable is deliberately not called `scaler`, so the feature scaler
# bound to that name elsewhere in the notebook is not overwritten.
for metric, metric_scaler in zip(["Cost", "Time", "Carbon"], [cost_scaler, time_scaler, carbon_scaler]):
    cols = [f"{metric}_{mode}" for mode in ["Rail", "Road", "Air", "Hyperloop"]]
    # NOTE(review): these columns were already min-max scaled by an earlier
    # cell, so this fits on normalised values — confirm that is intended.
    df[cols] = metric_scaler.fit_transform(df[cols])


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

# ---- Step 1: Estimate raw cost, time, and carbon metrics ----
def estimate_metrics_from_distance(distance_km):
    """Estimate per-mode cost, time and carbon for a trip of ``distance_km``.

    Cost and carbon are a fixed per-kilometre rate times the distance; time is
    the distance divided by a fixed per-mode speed constant.

    Args:
        distance_km: Trip distance.

    Returns:
        ``{"cost": {...}, "time": {...}, "carbon": {...}}`` where each inner
        dict is keyed by ``"rail"``, ``"road"``, ``"air"`` and ``"hyperloop"``.
    """
    cost_per_km = {"rail": 0.00136, "road": 0.0028, "air": 0.025, "hyperloop": 0.00415242}
    speed = {"rail": 60, "road": 50, "air": 850, "hyperloop": 765}
    carbon_per_km = {"rail": 0.00996, "road": 0.062, "air": 0.6, "hyperloop": 0.006}

    return {
        "cost": {mode: rate * distance_km for mode, rate in cost_per_km.items()},
        "time": {mode: distance_km / divisor for mode, divisor in speed.items()},
        "carbon": {mode: rate * distance_km for mode, rate in carbon_per_km.items()},
    }

In [None]:
# ---- Step 2: Normalize metrics using fitted scalers ----
def normalize_metrics(metrics, cost_scaler, time_scaler, carbon_scaler):
    """Scale raw per-mode metrics with the supplied fitted scalers.

    For each metric family a one-row DataFrame is built with the columns
    ``<Metric>_Rail``, ``<Metric>_Road``, ``<Metric>_Air`` and
    ``<Metric>_Hyperloop`` and passed to that family's ``transform``.

    Args:
        metrics: Mapping with ``"cost"``, ``"time"`` and ``"carbon"`` entries,
            each keyed by ``"rail"``, ``"road"``, ``"air"`` and ``"hyperloop"``.
        cost_scaler: Object whose ``transform`` scales the cost row.
        time_scaler: Object whose ``transform`` scales the time row.
        carbon_scaler: Object whose ``transform`` scales the carbon row.

    Returns:
        Dict with the same two-level layout as ``metrics`` holding the scaled values.
    """
    modes = ["rail", "road", "air", "hyperloop"]
    scalers_by_metric = {
        "cost": cost_scaler,
        "time": time_scaler,
        "carbon": carbon_scaler,
    }

    normalized = {}
    for metric_name, fitted in scalers_by_metric.items():
        # Column names follow the training frame, e.g. "Cost_Rail".
        single_row = {
            f"{metric_name.capitalize()}_{mode.capitalize()}": metrics[metric_name][mode]
            for mode in modes
        }
        scaled_row = fitted.transform(pd.DataFrame([single_row]))[0]
        normalized[metric_name] = dict(zip(modes, scaled_row))

    return normalized

In [None]:
# ---- Step 3: Calculate weighted scores and return best mode ----
def weighted_score_mode_selection(normalized_metrics, demand, perishable):
    """Score every mode with demand/perishability-adjusted weights; lowest wins.

    NOTE(review): the weights here differ from ``get_adjusted_weights`` in this
    notebook (non-perishable shift is -0.1 here vs 0.0 there; the carbon weight
    uses 0.1x the shift here vs 0.5x there) — confirm which rule is intended.

    Args:
        normalized_metrics: Mapping with ``"cost"``, ``"time"`` and ``"carbon"``
            entries, each keyed by ``"rail"``, ``"road"``, ``"air"``, ``"hyperloop"``.
        demand: Demand level; ``demand / 10`` is clamped to [0, 1].
        perishable: Truthy when the commodity is perishable.

    Returns:
        ``(best_mode, scores)`` where ``scores`` maps each mode to its weighted
        sum and ``best_mode`` is the first mode with the minimum score.
    """
    perish_shift = 0.1 if perishable else -0.1
    demand_boost = min(max(demand / 10.0, 0), 1) * 0.2

    weights = {}
    weights["time"] = 0.281 + perish_shift + demand_boost
    weights["carbon"] = 0.267 + 0.1 * perish_shift
    # Cost takes the remainder so the three weights add up to 1.
    weights["cost"] = 1.0 - weights["time"] - weights["carbon"]

    scores = {
        mode: (
            weights["cost"] * normalized_metrics["cost"][mode]
            + weights["time"] * normalized_metrics["time"][mode]
            + weights["carbon"] * normalized_metrics["carbon"][mode]
        )
        for mode in ("rail", "road", "air", "hyperloop")
    }

    best_mode, _ = min(scores.items(), key=lambda item: item[1])
    return best_mode, scores


In [None]:
# ---- Step 4: Wrapper to run rule-based prediction ----
def rule_based_predict_mode(distance_km, demand, density, perishable, cost_scaler, time_scaler, carbon_scaler):
    """Run the rule-based pipeline: estimate, normalise, score, pick the best mode.

    Args:
        distance_km: Trip distance passed to ``estimate_metrics_from_distance``.
        demand: Demand level forwarded to the scoring step.
        density: Accepted but not used by this function.
        perishable: Perishability flag forwarded to the scoring step.
        cost_scaler, time_scaler, carbon_scaler: Fitted scalers forwarded to
            ``normalize_metrics``.

    Returns:
        ``(best_mode, score_dict, raw_metrics)``.
    """
    raw_metrics = estimate_metrics_from_distance(distance_km)
    best_mode, score_dict = weighted_score_mode_selection(
        normalize_metrics(raw_metrics, cost_scaler, time_scaler, carbon_scaler),
        demand,
        perishable,
    )
    return best_mode, score_dict, raw_metrics

# ---- Load your previously trained scalers ----
# Read the cost, time and carbon scaler files back in that order.
cost_scaler, time_scaler, carbon_scaler = (
    joblib.load(path)
    for path in ("cost_scaler.pkl", "time_scaler.pkl", "carbon_scaler.pkl")
)


In [None]:
# ---- Predict best mode for Chennai → Bangalore ----
# NOTE(review): 1000 km is passed here, while the data preview earlier in the
# notebook lists roughly 284-347 km for this pair — confirm the intended distance.
mode, scores, raw = rule_based_predict_mode(
    1000,   # distance_km
    9,      # demand
    600,    # density: Not used but kept for API consistency
    False,  # perishable
    cost_scaler,
    time_scaler,
    carbon_scaler,
)

# ---- Display results ----
for _label, _value in (
    ("🚚 Rule-Based Best Mode:", mode),
    ("📊 Normalized Scores:", scores),
    ("🧾 Raw Metrics (Unnormalized):", raw),
):
    print(_label, _value)

🚚 Rule-Based Best Mode: hyperloop
📊 Normalized Scores: {'rail': np.float64(9.09590666666667), 'road': np.float64(24.223599999999998), 'air': np.float64(164.17470588235295), 'hyperloop': np.float64(3.6001198648366017)}
🧾 Raw Metrics (Unnormalized): {'cost': {'rail': 1.36, 'road': 2.8, 'air': 25.0, 'hyperloop': 4.15242}, 'time': {'rail': 16.666666666666668, 'road': 20.0, 'air': 1.1764705882352942, 'hyperloop': 1.3071895424836601}, 'carbon': {'rail': 9.96, 'road': 62.0, 'air': 600.0, 'hyperloop': 6.0}}


# LightGBM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

# 1. Stratified, shuffled 70/30 split of the unscaled features. The labels in
#    this cell are the mode-name strings held in y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y, test_size=0.3
)

# 2. Standardise with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Train LightGBM with library defaults and a fixed seed
model = LGBMClassifier(random_state=42).fit(X_train_scaled, y_train)

# 4. Evaluate on the held-out rows
y_pred = model.predict(X_test_scaled)
print(
    classification_report(
        y_test,
        y_pred,
        target_names=label_encoder.classes_,
    )
)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 1747, number of used features: 15
[LightGBM] [Info] Start training from score -5.268431
[LightGBM] [Info] Start training from score -0.300162
[LightGBM] [Info] Start training from score -1.411216
[LightGBM] [Info] Start training from score -4.575284
              precision    recall  f1-score   support

         air       0.80      1.00      0.89         4
   hyperloop       1.00      1.00      1.00       555
        rail       1.00      0.99      1.00       182
        road       1.00      1.00      1.00         8

    accuracy                           1.00       749
   macro avg       0.95      1.00      0.97       749
weighted avg       1.00      1.00      1.00       74



# **XGBoost**

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib

from sklearn.preprocessing import LabelEncoder

# ----- Step 1: Encode labels and split -----
label_encoder = LabelEncoder()
# Mode names → integer ids in sorted class order (see label_encoder.classes_).
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)

# ----- Step 2: Scale Features -----
# Fit on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for future use
joblib.dump(scaler, "xgb_scaler.pkl")

# ----- Step 3: Train XGBoost Model -----
# `use_label_encoder` is no longer passed: the run recorded in this notebook
# reports it as unused, and the labels are already integer-encoded above.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),  # number of classes
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    eval_metric='mlogloss',
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)

# Save the model
joblib.dump(xgb_model, "xgb_model.pkl")

# ----- Step 4: Evaluate -----
y_pred = xgb_model.predict(X_test_scaled)

print("📊 Classification Report for XGBoost:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

📊 Classification Report for XGBoost:
              precision    recall  f1-score   support

         air       0.80      1.00      0.89         4
   hyperloop       0.99      1.00      1.00       555
        rail       1.00      0.98      0.99       182
        road       1.00      1.00      1.00         8

    accuracy                           0.99       749
   macro avg       0.95      1.00      0.97       749
weighted avg       0.99      0.99      0.99       749



Parameters: { "use_label_encoder" } are not used.



# **RandomForest**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


# === Step 3: Label Encode target ===
# Mode names become integer ids; label_encoder.classes_ holds the id → name order.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# === Step 4: Train-test split ===
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, stratify=y_encoded, test_size=0.3
)

# === Step 5: Scale features ===
# Min/max are learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Step 6: Train Random Forest model ===
rf_model = RandomForestClassifier(
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train_scaled, y_train)

# === Step 7: Evaluate model ===
y_pred = rf_model.predict(X_test_scaled)
print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print(
    "📊 Classification Report:\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)

# === Step 8: Save model and encoders ===
for _artifact, _path in (
    (rf_model, "random_forest_model.pkl"),
    (scaler, "rf_scaler.pkl"),
):
    joblib.dump(_artifact, _path)
# Kept as the cell's final expression so the notebook still displays its return value.
joblib.dump(label_encoder, "rf_label_encoder.pkl")

🎯 Accuracy: 0.9946595460614153
📊 Classification Report:
               precision    recall  f1-score   support

         air       0.80      1.00      0.89         4
   hyperloop       0.99      1.00      1.00       555
        rail       1.00      0.98      0.99       182
        road       1.00      1.00      1.00         8

    accuracy                           0.99       749
   macro avg       0.95      1.00      0.97       749
weighted avg       0.99      0.99      0.99       749



['rf_label_encoder.pkl']

# **SVM**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib



# === Step 3: Encode target classes ===
# Mode names become integer ids; label_encoder.classes_ holds the id → name order.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# === Step 4: Train-test split ===
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, stratify=y_encoded, test_size=0.3
)

# === Step 5: Normalize features ===
# Min/max are learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Step 6: Train SVM classifier ===
# RBF kernel with probability estimates enabled.
svm_model = SVC(
    random_state=42,
    probability=True,
    gamma='scale',
    C=1.0,
    kernel='rbf',
).fit(X_train_scaled, y_train)

# === Step 7: Evaluation ===
y_pred = svm_model.predict(X_test_scaled)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(
    "📊 Classification Report:\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)

# === Step 8: Save model and preprocessing ===
for _artifact, _path in (
    (svm_model, "svm_model.pkl"),
    (scaler, "svm_scaler.pkl"),
):
    joblib.dump(_artifact, _path)
# Kept as the cell's final expression so the notebook still displays its return value.
joblib.dump(label_encoder, "svm_label_encoder.pkl")


✅ Accuracy: 0.9559412550066756
📊 Classification Report:
               precision    recall  f1-score   support

         air       0.00      0.00      0.00         4
   hyperloop       0.96      0.99      0.97       555
        rail       0.95      0.86      0.90       182
        road       1.00      1.00      1.00         8

    accuracy                           0.96       749
   macro avg       0.73      0.71      0.72       749
weighted avg       0.95      0.96      0.95       749



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['svm_label_encoder.pkl']

# **Logistic Regression**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib


# === Step 3: Encode target labels ===
# Mode names become integer ids; label_encoder.classes_ holds the id → name order.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# === Step 4: Train-test split ===
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, stratify=y_encoded, test_size=0.3
)

# === Step 5: Normalize features ===
# Min/max are learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Step 6: Train logistic regression model ===
# max_iter is raised to 1000 to give the solver room to converge.
log_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# === Step 7: Evaluation ===
y_pred = log_model.predict(X_test_scaled)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(
    "📊 Classification Report:\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)

# === Step 8: Save model and tools ===
for _artifact, _path in (
    (log_model, "log_model.pkl"),
    (scaler, "log_scaler.pkl"),
):
    joblib.dump(_artifact, _path)
# Kept as the cell's final expression so the notebook still displays its return value.
joblib.dump(label_encoder, "log_label_encoder.pkl")


✅ Accuracy: 0.9305740987983978
📊 Classification Report:
               precision    recall  f1-score   support

         air       0.00      0.00      0.00         4
   hyperloop       0.92      1.00      0.96       555
        rail       0.97      0.77      0.86       182
        road       1.00      0.25      0.40         8

    accuracy                           0.93       749
   macro avg       0.72      0.50      0.55       749
weighted avg       0.93      0.93      0.92       749



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['log_label_encoder.pkl']

# KNN





In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# === Step 3: Encode target labels ===
# Mode names become integer ids; label_encoder.classes_ holds the id → name order.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# === Step 4: Train-test split ===
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, stratify=y_encoded, test_size=0.3
)

# === Step 5: Normalize features ===
# Min/max are learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Step 6: Train KNN classifier ===
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# === Step 7: Evaluation ===
y_pred = knn_model.predict(X_test_scaled)
print("✅ KNN Accuracy:", accuracy_score(y_test, y_pred))
print(
    "📊 Classification Report:\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)

# === Step 8: Save model and tools ===
for _artifact, _path in (
    (knn_model, "knn_model.pkl"),
    (scaler, "knn_scaler.pkl"),
):
    joblib.dump(_artifact, _path)
# Kept as the cell's final expression so the notebook still displays its return value.
joblib.dump(label_encoder, "knn_label_encoder.pkl")


✅ KNN Accuracy: 0.9572763684913218
📊 Classification Report:
               precision    recall  f1-score   support

         air       1.00      1.00      1.00         4
   hyperloop       0.96      0.98      0.97       555
        rail       0.93      0.89      0.91       182
        road       1.00      1.00      1.00         8

    accuracy                           0.96       749
   macro avg       0.97      0.97      0.97       749
weighted avg       0.96      0.96      0.96       749



['knn_label_encoder.pkl']

# **Ensemble learning**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


# Encode labels (class ids follow the sorted order in label_encoder.classes_)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split. Stratified like every other model cell in this notebook, so the rare
# classes keep the same share in train and test and the reports are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)

# Scale: fit on the training rows only, then apply to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 🎯 Define base models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=5)
# `use_label_encoder` is no longer passed: the run recorded in this notebook
# reports it as unused, and the labels are already integer-encoded above.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Soft voting averages the predicted class probabilities of the three models.
ensemble_model = VotingClassifier(
    estimators=[('rf', rf_model), ('knn', knn_model), ('xgb', xgb_model)],
    voting='soft'
)


# Train
ensemble_model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = ensemble_model.predict(X_test_scaled)
acc = accuracy_score(y_test, y_pred)
print("✅ Ensemble Voting Accuracy:", acc)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


✅ Ensemble Voting Accuracy: 0.9959946595460614
              precision    recall  f1-score   support

         air       1.00      1.00      1.00         3
   hyperloop       0.99      1.00      1.00       559
        rail       1.00      0.98      0.99       182
        road       1.00      1.00      1.00         5

    accuracy                           1.00       749
   macro avg       1.00      1.00      1.00       749
weighted avg       1.00      1.00      1.00       749



Parameters: { "use_label_encoder" } are not used.

