In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# --- 1. Load the engineered dataset ---
df = pd.read_csv('final_engineered_dataset.csv')

# --- 2. Synthesize target columns (stand-ins for missing labels) ---
# Estimated cost is simulated as a weighted sum of the material code, the
# strength code and the weight capacity, plus Gaussian noise (mean 0, std 5).
# The global NumPy seed is fixed right before the draw so the noise is
# reproducible from a fresh kernel.
np.random.seed(42)
df['Estimated_Cost'] = (
    (df['Packaging_Material_Encoded'] * 0.5)
    + (df['Strength_Encoded'] * 10)
    + (df['Weight Capacity (kg)'] * 2)
    + np.random.normal(0, 5, len(df))
)

# Shipping category: weight capacity split into three equal-width bins
# (0: Light, 1: Standard, 2: Heavy Duty).
df['Shipping_Category'] = pd.cut(
    df['Weight Capacity (kg)'],
    bins=3,
    labels=[0, 1, 2],
).astype(int)

# --- 3. Feature / target selection ---
# `features` fixes the column order that the scaler and both models are fitted on.
features = [
    'Product_Category_Encoded',
    'Packaging_Material_Encoded',
    'Strength_Encoded',
    'Biodegradability score',
    'Recyclability %',
    'Weight Capacity (kg)',
]

X = df[features]
y_cost = df['Estimated_Cost']
y_co2 = df['CO2_Impact_Index']

# --- 4. Train/test split (80% train, 20% test) ---
# Both targets go through the same call so their rows stay aligned with X.
(
    X_train, X_test,
    y_train_cost, y_test_cost,
    y_train_co2, y_test_co2,
) = train_test_split(X, y_cost, y_co2, test_size=0.2, random_state=42)

# --- 5. Scaling pipeline ---
# A single StandardScaler step; it is fitted later, on the training split only.
pipeline = Pipeline(steps=[('scaler', StandardScaler())])

In [3]:
# Learn the scaling statistics from the training split only, then apply the
# same statistics to the test split (avoids leaking test data into the scaler).
# Both results are rounded to 5 decimal places.
X_train_scaled = pipeline.fit_transform(X_train).round(5)
X_test_scaled = pipeline.transform(X_test).round(5)

# Report the shapes of the two scaled matrices.
print(
    "Dataset Split Successful!",
    f"Training set size: {X_train_scaled.shape}",
    f"Testing set size: {X_test_scaled.shape}",
    sep="\n",
)

Dataset Split Successful!
Training set size: (6472, 6)
Testing set size: (1618, 6)


# RECOMMENDATION MODEL

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [18]:
# 1. TRAIN MODELS

# Model A: random forest regressor fitted on the (simulated) cost target.
rf_cost_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train_cost)

# Model B: XGBoost regressor fitted on the CO2_Impact_Index target.
xgb_co2_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train_scaled, y_train_co2)

#2. MODEL EVALUATION

def evaluate_model(y_true, y_pred, name):
    """Print RMSE, MAE and R2 for one set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    name : str
        Label shown in the report header.

    Returns
    -------
    None
        The metrics are only printed, each with four decimal places.
    """
    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    abs_err = mean_absolute_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)

    report_lines = (
        f"\n--- Metrics for {name} ---",
        f"RMSE: {root_mse:.4f}",
        f"MAE:  {abs_err:.4f}",
        f"R2 Score: {r_squared:.4f}",
    )
    for line in report_lines:
        print(line)

import joblib
from sklearn.preprocessing import LabelEncoder

# Predictions on the held-out test split
cost_preds = rf_cost_model.predict(X_test_scaled)
co2_preds = xgb_co2_model.predict(X_test_scaled)

evaluate_model(y_test_cost, cost_preds, "Random Forest (Cost)")
evaluate_model(y_test_co2, co2_preds, "XGBoost (CO2)")

# Fit the label encoders here, before they are persisted. Previously this cell
# dumped `le_cat` / `le_sub` without defining them, so it only worked when a
# later cell had already been executed in the same kernel and raised NameError
# on Restart & Run All.
le_cat = LabelEncoder().fit(df['Product category'])
le_sub = LabelEncoder().fit(df['Product sub category'])

# Save the scaler pipeline, both models and both encoders
joblib.dump(pipeline, 'pipeline.pkl')
joblib.dump(rf_cost_model, 'rf_cost_model.pkl')
joblib.dump(xgb_co2_model, 'xgb_co2_model.pkl')
joblib.dump(le_cat, 'le_cat.pkl')
joblib.dump(le_sub, 'le_sub.pkl')



--- Metrics for Random Forest (Cost) ---
RMSE: 5.4044
MAE:  4.2914
R2 Score: 0.9114

--- Metrics for XGBoost (CO2) ---
RMSE: 0.0163
MAE:  0.0103
R2 Score: 0.9948


['le_sub.pkl']

In [6]:
# 3. MATERIAL RANKING SYSTEM
# Gather the test-set predictions into one frame and attach a combined score.
# Score = 0.5 * predicted cost + 0.5 * predicted CO2; a lower score ranks higher.
# NOTE(review): the two predictions are combined on their raw scales. In the
# recorded output the cost is around 10 while CO2 is below 0.1, so the score is
# driven almost entirely by cost - consider normalising both before weighting.
test_results = pd.DataFrame(
    {
        'Actual_Cost': y_test_cost,
        'Predicted_Cost': cost_preds,
        'Predicted_CO2': co2_preds,
    }
).assign(
    Recommendation_Score=lambda frame: (frame['Predicted_Cost'] * 0.5)
    + (frame['Predicted_CO2'] * 0.5)
)

# Ascending sort puts the best (lowest) scores first.
ranked_materials = test_results.sort_values(by='Recommendation_Score')

print("\nTop 5 Recommended Packaging Options (Lowest Score = Best):")
ranked_materials.head()


Top 5 Recommended Packaging Options (Lowest Score = Best):


Unnamed: 0,Actual_Cost,Predicted_Cost,Predicted_CO2,Recommendation_Score
367,10.768652,9.664457,0.051426,4.857941
378,23.039772,10.164347,0.054847,5.109597
1025,14.606555,10.287741,0.05543,5.171586
4454,16.945377,10.514496,0.060032,5.287264
4534,14.403292,10.771078,0.07088,5.420979


GET RECOMMENDATION

In [7]:
from sklearn.preprocessing import LabelEncoder

In [16]:
# Label encoders fitted on the raw product category / sub-category columns.
# `le_cat` is what get_material_recommendations uses to encode the user's category.
le_cat = LabelEncoder()
le_cat.fit(df['Product category'])
le_sub = LabelEncoder()
le_sub.fit(df['Product sub category'])

def get_material_recommendations(input_category, input_subcategory, input_weight, is_fragile):
    """Return the five best-scoring packaging materials for a product.

    Every distinct packaging material in ``df`` is paired with the requested
    product category and weight, run through the fitted scaler and both
    models, and ranked by ``0.5 * Predicted_Cost + 0.5 * Predicted_CO2``
    (lower is better). For fragile products, materials with
    ``Strength_Encoded`` 1 or 2 receive an additional score penalty.

    Relies on notebook-level objects: ``df``, ``le_cat``, ``pipeline``,
    ``rf_cost_model`` and ``xgb_co2_model``.

    Parameters
    ----------
    input_category : str
        Product category; surrounding whitespace is stripped and the value is
        title-cased before it is encoded with ``le_cat``.
    input_subcategory : str
        Accepted for interface compatibility; it does not influence the ranking.
    input_weight : int or float
        Value written to the 'Weight Capacity (kg)' feature for every candidate.
    is_fragile : str or bool
        ``'yes'``, ``'y'`` or ``'true'`` (any case, surrounding whitespace
        ignored) or a truthy non-string marks the product as fragile; anything
        else is treated as not fragile.

    Returns
    -------
    pandas.DataFrame or str
        Up to five rows with the columns 'Packaging material', 'Strength',
        'Is_Biodegradable', 'Predicted_Cost' and 'Predicted_CO2', best first.
        If the category is unknown to ``le_cat`` an error message string is
        returned instead.
    """
    input_category = input_category.strip().title()

    # Accept real booleans as well as free-text answers; the old
    # `is_fragile.lower() == 'yes'` raised on a bool and missed ' yes '.
    if isinstance(is_fragile, str):
        fragile = is_fragile.strip().lower() in ('yes', 'y', 'true')
    else:
        fragile = bool(is_fragile)

    # Validate the category first so an unknown value exits before any
    # aggregation or prediction work is done.
    try:
        cat_encoded = le_cat.transform([input_category])[0]
    except ValueError:
        return f"Error: Category '{input_category}' not recognized."

    # One row per packaging material: codes and strength label from the first
    # occurrence, biodegradability / recyclability averaged over the material.
    sim_data = df.groupby('Packaging material').agg({
        'Packaging_Material_Encoded': 'first',
        'Strength_Encoded': 'first',
        'Strength': 'first',
        'Biodegradability score': 'mean',
        'Recyclability %': 'mean',
    }).reset_index()

    # Every candidate material is evaluated for the same product and weight.
    sim_data['Product_Category_Encoded'] = cat_encoded
    sim_data['Weight Capacity (kg)'] = input_weight

    # Column order must match the order the scaler and models were fitted on.
    model_features = [
        'Product_Category_Encoded', 'Packaging_Material_Encoded',
        'Strength_Encoded', 'Biodegradability score', 'Recyclability %',
        'Weight Capacity (kg)',
    ]
    X_sim_scaled = pipeline.transform(sim_data[model_features])
    sim_data['Predicted_Cost'] = rf_cost_model.predict(X_sim_scaled)
    sim_data['Predicted_CO2'] = xgb_co2_model.predict(X_sim_scaled)

    # Combined score, lower is better.
    sim_data['Final_Score'] = (sim_data['Predicted_Cost'] * 0.5) + (sim_data['Predicted_CO2'] * 0.5)

    if fragile:
        # Penalise the two strength codes treated as weaker when the item is fragile.
        # NOTE(review): assumes Strength_Encoded 1 and 2 are the low / medium
        # strength levels - confirm against the encoding used to build the dataset.
        sim_data.loc[sim_data['Strength_Encoded'] == 1, 'Final_Score'] += 15.0
        sim_data.loc[sim_data['Strength_Encoded'] == 2, 'Final_Score'] += 5.0

    # NOTE(review): the recorded run shows 'NO' for every material, including
    # bamboo fibre, so the score may not be on a 0-100 scale - confirm that
    # 50 is the right cut-off.
    sim_data['Is_Biodegradable'] = np.where(sim_data['Biodegradability score'] > 50, 'YES', 'NO')

    top_5 = sim_data.sort_values('Final_Score').head(5)
    return top_5[['Packaging material', 'Strength', 'Is_Biodegradable', 'Predicted_Cost', 'Predicted_CO2']]

# --- Interactive input section ---
print("--- AI PACKAGING ADVISOR ---")
user_category = input("Enter the product Category: ").strip().title()
user_subcategory = input("Enter the sub category: ").strip().title()
# float() rather than int() so fractional weights such as "2.5" are accepted.
user_weight = float(input("Weight of the product (kg): "))
user_fragile = input("Is the product fragile? (yes/no): ").strip().lower()

# The fourth argument tells the recommender whether to penalise weaker materials.
results = get_material_recommendations(user_category, user_subcategory, user_weight, user_fragile)

if isinstance(results, str):
    # A string result is an error message. The previous code called
    # `result.head(5)` here, which raised NameError on an undefined name.
    print(results)
else:
    print(f"\nTop 5 Recommendations for {user_subcategory} (Fragile: {user_fragile.upper()}):")
    print(results.to_string(index=False))

--- AI PACKAGING ADVISOR ---


Enter the product Category:  electronics
Enter the sub category:  smartphone
Weight of the product (kg):  5
Is the product fragile? (yes/no):  yes



Top 5 Recommendations for Smartphone (Fragile: YES):
   Packaging material Strength Is_Biodegradable  Predicted_Cost  Predicted_CO2
          ABS Plastic     high               NO       32.571841       0.796806
Bio-PE(Green Plastic)     high               NO       34.167262       0.049925
    Bio-Polycarbonate     high               NO       33.863590       0.428624
Bamboo Fiber (Molded)     high               NO       34.698028       0.078913
             Aluminum     high               NO       35.693619       0.060265
