In [1]:
import pandas as pd

df = pd.read_csv("carbon_emission_dataset_with_Industry.csv")

print("COLUMNS EXACTLY AS READ:")
for col in df.columns:
    print(f"'{col}'")

print("\nFIRST 5 ROWS:")
print(df.head())


COLUMNS EXACTLY AS READ:
'Company_ID'
'Date'
'Sector'
'Total_Energy_Consumption_kWh'
'Renewable_Energy_Consumption_kWh'
'NonRenewable_Energy_Consumption_kWh'
'Production_Output_Units'
'Supply_Chain_Transport_km'
'Supply_Chain_Transport_Mode'
'Raw_Material_Usage_kg'
'Carbon_Emission_tCO2e_TARGET'
'Energy_Cost_USD'
'Carbon_Tax_USD'
'Process_Efficiency_Percent'
'Employment_Count'
'Public_Acceptance_Index'
'Carbon_Reduction_Strategy'
'Strategy_Implementation_Cost_USD'
'Expected_Carbon_Reduction_Percent'
'Expected_Renewable_Share_Percent'
'Social_Impact_Score'
'Industry_Sectors'

FIRST 5 ROWS:
  Company_ID        Date         Sector  Total_Energy_Consumption_kWh  \
0       C001  2024-01-01  Manufacturing                      53751.61   
1       C001  2024-01-02  Manufacturing                     170872.89   
2       C001  2024-01-03  Manufacturing                     142777.96   
3       C001  2024-01-04  Manufacturing                     152532.15   
4       C001  2024-01-05  Manufacturing

In [2]:
# Remove extra spaces & standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

print("CLEANED COLUMNS:")
print(df.columns)


CLEANED COLUMNS:
Index(['company_id', 'date', 'sector', 'total_energy_consumption_kwh',
       'renewable_energy_consumption_kwh',
       'nonrenewable_energy_consumption_kwh', 'production_output_units',
       'supply_chain_transport_km', 'supply_chain_transport_mode',
       'raw_material_usage_kg', 'carbon_emission_tco2e_target',
       'energy_cost_usd', 'carbon_tax_usd', 'process_efficiency_percent',
       'employment_count', 'public_acceptance_index',
       'carbon_reduction_strategy', 'strategy_implementation_cost_usd',
       'expected_carbon_reduction_percent', 'expected_renewable_share_percent',
       'social_impact_score', 'industry_sectors'],
      dtype='object')


In [3]:
X = df[[
    "supply_chain_transport_km",
    "supply_chain_transport_mode"
]]

y = df["carbon_emission_tco2e_target"]


In [4]:
df["supply_chain_transport_mode"] = (
    df["supply_chain_transport_mode"]
    .astype(str)
    .str.lower()
    .str.strip()
)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
    transformers=[
        (
            "transport",
            OneHotEncoder(handle_unknown="ignore"),
            ["supply_chain_transport_mode"]
        )
    ],
    remainder="passthrough"
)

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

print("✅ Model trained successfully using REAL spreadsheet columns")


✅ Model trained successfully using REAL spreadsheet columns


In [7]:
test_data = pd.DataFrame({
    "supply_chain_transport_km": [350, 350, 350],
    "supply_chain_transport_mode": ["train", "truck", "plane"]
})

predictions = model.predict(test_data)

for t, d, e in zip(
    test_data["supply_chain_transport_mode"],
    test_data["supply_chain_transport_km"],
    predictions
):
    print(f"{t.upper()} | {d} km → {e:.4f} tCO₂e")


TRAIN | 350 km → 32.4116 tCO₂e
TRUCK | 350 km → 32.4116 tCO₂e
PLANE | 350 km → 32.4116 tCO₂e


In [8]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Predictions on test set
y_pred = model.predict(X_test)

# Metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score  : {r2:.4f}")
print(f"MAE       : {mae:.4f} tCO₂e")
print(f"RMSE      : {rmse:.4f} tCO₂e")


R² Score  : -0.0016
MAE       : 11.7852 tCO₂e
RMSE      : 14.5144 tCO₂e


In [9]:
import pandas as pd

df = pd.read_csv("ecommerce_logistics_carbon_emissions_v1.csv")

print(df.head())
print(df.columns)


  Transaction_ID        Date             Origin_Facility   Destination_City  \
0   TRX-F775953B  2025-02-18  Jakarta Fulfillment Center         Waynehaven   
1   TRX-B4C50003  2025-09-11  Jakarta Fulfillment Center  East Patriciaside   
2   TRX-F7C71F39  2025-06-15              Semarang Depot        Pottershire   
3   TRX-A60E6AAA  2025-06-23              Semarang Depot    North Christina   
4   TRX-D0CFAF87  2025-02-15  Jakarta Fulfillment Center          Lisamouth   

          Vehicle_Type       Route_Type  Distance_KM  Package_Weight_KG  \
0    Electric Van (EV)      Mixed Route         80.4               15.6   
1  Diesel Van (Euro 6)      Mixed Route        135.3               18.8   
2       Drone Delivery  Urban Last Mile          9.0               25.4   
3       Drone Delivery  Urban Last Mile         11.8               13.5   
4          Heavy Truck       Inter-City        156.5             1792.6   

  Traffic_Conditions  Carbon_Emission_kgCO2e  Is_Eco_Friendly  
0         

In [10]:
df = df.drop(columns=[
    "transaction_id",
    "date",
    "destination_city"
], errors="ignore")


In [11]:
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

print(df.columns)


Index(['transaction_id', 'date', 'origin_facility', 'destination_city',
       'vehicle_type', 'route_type', 'distance_km', 'package_weight_kg',
       'traffic_conditions', 'carbon_emission_kgco2e', 'is_eco_friendly'],
      dtype='object')


In [13]:
print("EXACT COLUMNS:")
for col in df.columns:
    print(f"'{col}'")


EXACT COLUMNS:
'transaction_id'
'date'
'origin_facility'
'destination_city'
'vehicle_type'
'route_type'
'distance_km'
'package_weight_kg'
'traffic_conditions'
'carbon_emission_kgco2e'
'is_eco_friendly'


In [14]:
df = df.drop(
    columns=["transaction_id", "date", "destination_city"],
    errors="ignore"
)


In [15]:
X = df[
    [
        "origin_facility",
        "vehicle_type",
        "route_type",
        "distance_km"
    ]
]

y = df["carbon_emission_kgco2e"]


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor


In [17]:
categorical_features = [
    "origin_facility",
    "vehicle_type",
    "route_type"
]

numeric_features = ["distance_km"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)


In [18]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

print("✅ Model trained successfully")


✅ Model trained successfully


In [19]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

y_pred = model.predict(X_test)

print("R² Score :", round(r2_score(y_test, y_pred), 4))
print("MAE      :", round(mean_absolute_error(y_test, y_pred), 4))
print("RMSE     :", round(np.sqrt(mean_squared_error(y_test, y_pred)), 4))


R² Score : 0.9465
MAE      : 12.9469
RMSE     : 41.0837


In [20]:
test_input = pd.DataFrame({
    "origin_facility": ["WH_Bangalore"],
    "vehicle_type": ["truck"],
    "route_type": ["highway"],
    "distance_km": [350]
})

prediction = model.predict(test_input)

print(f"Predicted Carbon Emission: {prediction[0]:.2f} kgCO₂e")


Predicted Carbon Emission: 303.88 kgCO₂e


In [21]:
import joblib

# Save model
joblib.dump(model, "carbon_emission_model.pkl")

print("✅ Model saved as carbon_emission_model.pkl")


✅ Model saved as carbon_emission_model.pkl
