# Waste Classifier Model Training
This notebook trains a model to predict the condition score of waste items using both images and form data.

In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.optimizers import Adam
import pickle




In [2]:
# Step 2: Load the dataset
df = pd.read_csv("../data/full_items_extended_dataset.csv")
df.head()

Unnamed: 0,item_type,years_used,condition,description,image_damage,condition_score,green_points,output,image_path
0,Blenders/Mixers,1,Working,"Motor works fine, jar slightly scratched",Low,0.95,93,Refurbish and Resell,data/images/blender_low.jpeg
1,Blenders/Mixers,2,Repairable,"Motor works fine, jar slightly scratched",Moderate,0.67,70,Salvage Components,waste_classifier/data/images/blender_moderate.jpg
2,Blenders/Mixers,3,Repairable,"Motor works fine, jar slightly scratched",High,0.45,47,Salvage Components,waste_classifier/data/images/blenders_high-1.jpeg
3,Blenders/Mixers,4,Repairable,"Motor works fine, jar slightly scratched",High,0.35,41,Salvage Components,waste_classifier/data/images/blenders_high-2.jpeg
4,Blenders/Mixers,5,Dead,"Motor doesn't work, jar slightly scratched",High,0.1,26,Recycle,waste_classifier/data/images/blenders_dead.png


In [3]:
# Step 2b: Standardize image paths
import re

def clean_image_path(path):
    # Remove any leading workspace or folder names
    path = re.sub(r'^(waste_classifier/)?', '', path)
    path = re.sub(r'^(data/)?', '', path)
    path = re.sub(r'^images/', 'data/images/', path)
    if not path.startswith('data/images/'):
        path = 'data/images/' + path.split('/')[-1]
    return path

df['image_path'] = df['image_path'].apply(clean_image_path)
print('Sample cleaned image paths:', df['image_path'].head())

# Step 3: Load and preprocess images
def process_image(image_path):
    try:
        img = load_img(image_path, target_size=(224, 224))
        img_array = img_to_array(img)
        return preprocess_input(img_array)
    except Exception as e:
        print(f"Error loading {image_path}: {e}")
        return np.zeros((224, 224, 3))  # fallback for missing images

images = np.array([process_image(path) for path in df["image_path"]])
print("Images shape:", images.shape)

Sample cleaned image paths: 0        data/images/blender_low.jpeg
1    data/images/blender_moderate.jpg
2    data/images/blenders_high-1.jpeg
3    data/images/blenders_high-2.jpeg
4       data/images/blenders_dead.png
Name: image_path, dtype: object
Error loading data/images/blender_low.jpeg: [Errno 2] No such file or directory: 'data/images/blender_low.jpeg'
Error loading data/images/blender_moderate.jpg: [Errno 2] No such file or directory: 'data/images/blender_moderate.jpg'
Error loading data/images/blenders_high-1.jpeg: [Errno 2] No such file or directory: 'data/images/blenders_high-1.jpeg'
Error loading data/images/blenders_high-2.jpeg: [Errno 2] No such file or directory: 'data/images/blenders_high-2.jpeg'
Error loading data/images/blenders_dead.png: [Errno 2] No such file or directory: 'data/images/blenders_dead.png'
Error loading data/images/electric-kettle-best.jpg: [Errno 2] No such file or directory: 'data/images/electric-kettle-best.jpg'
Error loading data/images/electric-k

In [4]:
# Step 4: Encode categorical columns
df["item_type"] = LabelEncoder().fit_transform(df["item_type"])
df["condition"] = LabelEncoder().fit_transform(df["condition"])

In [5]:
# Step 4b: Text preprocessing for description
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize descriptions
max_words = 1000
max_len = 30
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['description'])
desc_sequences = tokenizer.texts_to_sequences(df['description'])
desc_padded = pad_sequences(desc_sequences, maxlen=max_len, padding='post')
print('Description padded shape:', desc_padded.shape)

with open("../model/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved to ../model/tokenizer.pkl")

Description padded shape: (50, 30)
Tokenizer saved to ../model/tokenizer.pkl


**Note:** After running the cell above, make sure that `tokenizer.pkl` exists in the `model` folder. If it does, you can now run your Streamlit app and it will work with the description input.

In [6]:
# Step 5: Prepare features and target
form_data = df[["item_type", "years_used", "condition"]].values
condition_score = df["condition_score"].values

In [7]:
# Step 5: Prepare features and target
form_data = df[["item_type", "years_used", "condition"]].values
condition_score = df["condition_score"].values
output_class = df["output"].map({"Refurbish and Resell": 0, "Salvage Components": 1, "Recycle": 2}).values

In [8]:
# Step 6: Build the model
# Image branch
image_input = Input(shape=(224, 224, 3))
base_model = MobileNetV2(include_top=False, input_tensor=image_input, weights="imagenet", pooling="avg")
image_features = base_model.output
image_features = Dense(128, activation='relu')(image_features)

# Form branch
form_input = Input(shape=(form_data.shape[1],))
form_features = Dense(64, activation='relu')(form_input)

# Merge branches
combined = Concatenate()([image_features, form_features])
combined = Dense(64, activation='relu')(combined)
combined = Dropout(0.3)(combined)

# Output
score_output = Dense(1, name="score")(combined)

# Final model
model = Model(inputs=[image_input, form_input], outputs=[score_output])
model.compile(loss='mse', optimizer=Adam(1e-4), metrics=['mae'])
model.summary()




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 Conv1 (Conv2D)              (None, 112, 112, 32)         864       ['input_1[0][0]']             
                                                                                                  
 bn_Conv1 (BatchNormalizati  (None, 112, 112, 32)         128       ['Conv1[0][0]']               
 on)                                                                                              
                                                                                                  
 Conv1_relu (ReLU)           (None, 112, 112, 32)         0         ['bn_Conv1[0][0]']     

In [9]:
# Step 6: Build the multi-input, multi-output model
from tensorflow.keras.layers import Embedding, LSTM, Flatten
from tensorflow.keras.utils import to_categorical

# Text branch
text_input = Input(shape=(desc_padded.shape[1],))
text_emb = Embedding(input_dim=max_words, output_dim=32, input_length=desc_padded.shape[1])(text_input)
text_lstm = LSTM(32)(text_emb)

# Image branch
image_input = Input(shape=(224, 224, 3))
base_model = MobileNetV2(include_top=False, input_tensor=image_input, weights="imagenet", pooling="avg")
image_features = base_model.output
image_features = Dense(128, activation='relu')(image_features)

# Form branch
form_input = Input(shape=(form_data.shape[1],))
form_features = Dense(64, activation='relu')(form_input)

# Merge all
combined = Concatenate()([image_features, form_features, text_lstm])
combined = Dense(64, activation='relu')(combined)
combined = Dropout(0.3)(combined)

# Outputs
score_output = Dense(1, name="score_output")(combined)
class_output = Dense(3, activation='softmax', name="class_output")(combined)

# Final model
model = Model(inputs=[image_input, form_input, text_input], outputs=[score_output, class_output])
model.compile(loss={"score_output": "mean_squared_error", "class_output": "sparse_categorical_crossentropy"},
              optimizer=Adam(1e-4),
              metrics={"score_output": "mae", "class_output": "accuracy"})
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 Conv1 (Conv2D)              (None, 112, 112, 32)         864       ['input_4[0][0]']             
                                                                                                  
 bn_Conv1 (BatchNormalizati  (None, 112, 112, 32)         128       ['Conv1[0][0]']               
 on)                                                                                              
                                                                                                  
 Conv1_relu (ReLU)           (None, 112, 112, 32)         0         ['bn_Conv1[0][0]']      

In [10]:
# Step 7: Train/test split
X_img_train, X_img_val, X_form_train, X_form_val, y_train, y_val = train_test_split(
    images, form_data, condition_score, test_size=0.2, random_state=42
)

In [11]:
# Step 7: Train/test split
X_img_train, X_img_val, X_form_train, X_form_val, X_text_train, X_text_val, y_train, y_val, y_class_train, y_class_val = train_test_split(
    images, form_data, desc_padded, condition_score, output_class, test_size=0.2, random_state=42
)

In [12]:
# Step 8: Train the model
history = model.fit(
    [X_img_train, X_form_train, X_text_train],
    {"score_output": y_train, "class_output": y_class_train},
    validation_data=([X_img_val, X_form_val, X_text_val], {"score_output": y_val, "class_output": y_class_val}),
    epochs=10, batch_size=16
)

Epoch 1/10




Epoch 2/10
Epoch 2/10
Epoch 3/10
Epoch 3/10
Epoch 4/10
Epoch 4/10
Epoch 5/10
Epoch 5/10
Epoch 6/10
Epoch 6/10
Epoch 7/10
Epoch 7/10
Epoch 8/10
Epoch 8/10
Epoch 9/10
Epoch 9/10
Epoch 10/10
Epoch 10/10


In [13]:
# Step 8: Train the model
history = model.fit(
    [X_img_train, X_form_train, X_text_train],
    {"score_output": y_train, "class_output": y_class_train},
    validation_data=([X_img_val, X_form_val, X_text_val], {"score_output": y_val, "class_output": y_class_val}),
    epochs=10, batch_size=16
)

Epoch 1/10
Epoch 2/10
Epoch 2/10
Epoch 3/10
Epoch 3/10
Epoch 4/10
Epoch 4/10
Epoch 5/10
Epoch 5/10
Epoch 6/10
Epoch 6/10
Epoch 7/10
Epoch 7/10
Epoch 8/10
Epoch 8/10
Epoch 9/10
Epoch 9/10
Epoch 10/10
Epoch 10/10


In [14]:
# Step 9: Save the trained model
model.save("../model/waste_model.h5")
print("Model saved to ../model/waste_model.h5")

  saving_api.save_model(


Model saved to ../model/waste_model.h5


In [15]:
# Step 9: Save the trained model
model.save("../model/waste_model.h5")
print("Model saved to ../model/waste_model.h5")

Model saved to ../model/waste_model.h5


In [17]:
# Multi-input model: Tabular + Image features for waste classification

# 1. Install required packages (if needed)
# !pip install tensorflow scikit-learn pandas pillow

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import os

# 2. Load and preprocess tabular data
csv_path = '../data/full_items_extended_dataset.csv'
df = pd.read_csv(csv_path)

for col in ['item_type', 'condition', 'image_damage', 'output']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Select tabular features
tabular_features = ['item_type', 'years_used', 'condition', 'image_damage', 'condition_score', 'green_points']
X_tabular = df[tabular_features].values
scaler = StandardScaler()
X_tabular = scaler.fit_transform(X_tabular)

# Target variable
y = LabelEncoder().fit_transform(df['output'])
y = to_categorical(y)

# 3. Load and preprocess images
IMG_SIZE = 224
def load_image(path):
    if not os.path.exists(path):
        # fallback for missing images
        return np.zeros((IMG_SIZE, IMG_SIZE, 3))
    img = image.load_img(path, target_size=(IMG_SIZE, IMG_SIZE))
    img = image.img_to_array(img)
    img = preprocess_input(img)
    return img

X_images = np.array([load_image(p) for p in df['image_path']])

# 4. Build the multi-input model
image_input = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=image_input)
for layer in base_model.layers:
    layer.trainable = False
x_img = Flatten()(base_model.output)
x_img = Dense(128, activation='relu')(x_img)
x_img = Dropout(0.5)(x_img)

tabular_input = Input(shape=(X_tabular.shape[1],))
x_tab = Dense(64, activation='relu')(tabular_input)
x_tab = Dropout(0.5)(x_tab)

combined = Concatenate()([x_img, x_tab])
x = Dense(64, activation='relu')(combined)
output = Dense(y.shape[1], activation='softmax')(x)

model = Model(inputs=[image_input, tabular_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# 5. Train the model
model.fit([X_images, X_tabular], y, epochs=10, batch_size=8, validation_split=0.2)


A local file was found, but it seems to be incomplete or outdated because the auto file hash does not match the original value of 4d473c1dd8becc155b73f8504c6f6626 so we will re-download the data.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_7[0][0]']             
    

<keras.src.callbacks.History at 0x14ee41fcd90>