<a href="https://colab.research.google.com/github/Chairsama578/replica2/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install dependencies
!pip install kagglehub tensorflow keras opencv-python joblib matplotlib seaborn scikit-learn

In [None]:
# Cell 2: Import libraries
import os
import shutil
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2, EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import joblib
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

In [None]:
# Cell 3: Download and map dataset to 5 classes
DATASET_SLUG = "sumn2u/garbage-classification-v2"
path = kagglehub.dataset_download(DATASET_SLUG)
print("Dataset downloaded to:", path)

# Original classes (based on dataset)
original_classes = ['Metal', 'Glass', 'Biological', 'Paper', 'Battery', 'Trash', 'Cardboard', 'Shoes', 'Clothes', 'Plastic']

# Mapping to 5 classes as per report
class_mapping = {
    'Paper': 'paper',
    'Cardboard': 'paper',
    'Plastic': 'plastic',
    'Metal': 'metal',
    'Glass': 'glass',
    'Biological': 'organic'
    # Exclude: Battery, Trash, Shoes, Clothes
}

# File extensions (lower-case) that are treated as images.
IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')


def _find_class_dir(root, class_name):
    """Locate the source folder of one original class inside the download.

    The download root is walked top-down and the first directory whose name
    equals `class_name` ignoring case is returned. This covers both the
    `root/<Class>/image.jpg` layout and layouts where the class folders are
    nested one or more levels deeper or use different capitalisation.

    Args:
        root: Directory returned by `kagglehub.dataset_download`.
        class_name: Original class name, e.g. 'Paper'.

    Returns:
        Path of the matching directory, or None when there is no match.
    """
    wanted = class_name.lower()
    for dirpath, dirnames, _ in os.walk(root):
        for dirname in sorted(dirnames):
            if dirname.lower() == wanted:
                return os.path.join(dirpath, dirname)
    return None


# Create images_raw/ with 5 subfolders
# NOTE: if images_raw/ still holds files from an earlier run that used a
# different naming scheme, delete the folder first to avoid duplicate images.
images_raw = 'images_raw'
os.makedirs(images_raw, exist_ok=True)
# Sorted keys give a stable bar order in the plot below.
class_counts = {c: 0 for c in sorted(set(class_mapping.values()))}
missing_classes = []

for orig_class in original_classes:
    if orig_class not in class_mapping:
        continue  # Battery, Trash, Shoes, Clothes are excluded on purpose

    src_dir = _find_class_dir(path, orig_class)
    if src_dir is None:
        missing_classes.append(orig_class)
        continue

    target_class = class_mapping[orig_class]
    target_dir = os.path.join(images_raw, target_class)
    os.makedirs(target_dir, exist_ok=True)

    for img in sorted(os.listdir(src_dir)):
        if not img.lower().endswith(IMAGE_EXTENSIONS):
            continue
        # Several original classes can feed one target class (Paper and
        # Cardboard -> paper). Prefixing with the original class keeps equal
        # file names from overwriting each other, so the count stays honest.
        target_name = f"{orig_class.lower()}__{img}"
        shutil.copy(os.path.join(src_dir, img), os.path.join(target_dir, target_name))
        class_counts[target_class] += 1

if missing_classes:
    print("WARNING: no source folder found for classes:", missing_classes)

# Fail here with a clear message instead of letting training fail later on an
# empty images_raw/ directory.
if not any(class_counts.values()):
    raise FileNotFoundError(
        f"No images were copied from {path!r}; check the dataset folder layout."
    )

print("Mapped class counts (approximate as per report):", class_counts)
# Expected: paper ~4100+ (Paper+Cardboard), plastic ~1439, metal ~1077, glass ~3199, organic ~997

# Plot distribution
fig, ax = plt.subplots()
ax.bar(list(class_counts.keys()), list(class_counts.values()))
ax.set_title("Phân bố dataset 5 lớp")
ax.set_xlabel("Class")
ax.set_ylabel("Number of images")
plt.show()

In [None]:
# Cell 4: Data augmentation and generators
VALIDATION_SPLIT = 0.2  # 80% train, 20% val
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32

# Training images are rescaled to [0, 1] and randomly augmented.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=VALIDATION_SPLIT
)

# Validation images must NOT be augmented, otherwise validation metrics are
# measured on randomly distorted images and change from epoch to epoch.
# Only the rescaling is shared with the training pipeline. Using the same
# validation_split on the same directory selects the complementary subset.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

train_generator = train_datagen.flow_from_directory(
    images_raw,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training'
)

validation_generator = val_datagen.flow_from_directory(
    images_raw,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',
    shuffle=False  # fixed order makes validation/evaluation reproducible
)

# Stop early with a clear message if the image folder is empty.
if train_generator.samples == 0 or validation_generator.samples == 0:
    raise ValueError(f"No images found under {images_raw!r}; run the dataset cell first.")

# Save class indices (labels)
os.makedirs('models', exist_ok=True)
joblib.dump(train_generator.class_indices, 'models/labels.pkl')

In [None]:
# Cell 5: Build and train primary model (MobileNetV2)
# Take the number of output units from the data instead of hard-coding 5, so
# the head always matches the labels the generator actually produces.
num_classes = len(train_generator.class_indices)

base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Freeze base

model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Both callbacks monitor val_loss (their default) and are reused for the
# fine-tuning phase below.
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.2, patience=3)
]

# Phase 1: train only the new classification head on top of the frozen base.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=50,
    callbacks=callbacks
)

# Fine-tuning: Unfreeze base and retrain with low LR
base_model.trainable = True
# Keep BatchNormalization layers frozen: a non-trainable BatchNormalization
# layer runs in inference mode, so its ImageNet moving statistics are not
# overwritten while the convolution weights are fine-tuned.
for layer in base_model.layers:
    if isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = False

# Re-compile so the changed trainable flags take effect.
model.compile(optimizer=Adam(1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
fine_history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=20,
    callbacks=callbacks
)

# Save model
os.makedirs('models', exist_ok=True)  # do not rely on another cell having created it
model.save('models/waste_model.h5')

# Plot training curves (head training followed by fine-tuning)
head_epochs = len(history.history['loss'])
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (axes[0], 'accuracy', 'Accuracy', 'Train Acc', 'Val Acc'),
    (axes[1], 'loss', 'Loss', 'Train Loss', 'Val Loss'),
)
for ax, metric, title, train_label, val_label in panels:
    ax.plot(history.history[metric] + fine_history.history[metric], label=train_label)
    ax.plot(history.history['val_' + metric] + fine_history.history['val_' + metric], label=val_label)
    # Dashed line marks the boundary between the two training phases.
    ax.axvline(head_epochs - 0.5, color='gray', linestyle='--', linewidth=1)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.legend()
plt.show()

In [None]:
# Cell 6: Evaluation (confusion matrix, classification report)
# Class names ordered by their integer index so they line up with argmax ids.
class_names = [
    name for name, _ in sorted(train_generator.class_indices.items(), key=lambda item: item[1])
]
label_ids = list(range(len(class_names)))

# Get predictions on validation set
# len(generator) is the number of batches INCLUDING the final partial one;
# `samples // batch_size` would silently drop the remainder images.
val_steps = len(validation_generator)
# Rewind the iterator and address batches by index, so every validation image
# is visited exactly once regardless of where training left the iterator.
validation_generator.reset()

y_true = []
y_pred = []
for batch_idx in range(val_steps):
    x, y = validation_generator[batch_idx]
    pred = model.predict(x, verbose=0)  # verbose=0: no progress bar per batch
    y_true.extend(np.argmax(y, axis=1))
    y_pred.extend(np.argmax(pred, axis=1))

# Report and confusion matrix
# Passing `labels` keeps the report and matrix at full size even when a class
# never occurs in y_true / y_pred (otherwise target_names would not match).
print(classification_report(
    y_true, y_pred, labels=label_ids, target_names=class_names, zero_division=0
))
cm = confusion_matrix(y_true, y_pred, labels=label_ids)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Cell 7: Comparison model (EfficientNetB0) - Optional, run if needed
base_eff = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_eff.trainable = False

# Match the output layer to the number of classes the generator produces.
num_classes_eff = len(train_generator.class_indices)

model_eff = Sequential([
    tf.keras.Input(shape=(224, 224, 3)),
    # The generators rescale pixels to [0, 1], but the Keras EfficientNet
    # models expect pixel values in [0, 255] and normalise them internally.
    # Undo the 1/255 rescale here so the pretrained weights see the input
    # range they were trained on.
    tf.keras.layers.Rescaling(255.0),
    base_eff,
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(num_classes_eff, activation='softmax')
])

model_eff.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Reuses the `callbacks` list defined in the MobileNetV2 training cell.
history_eff = model_eff.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=50,
    callbacks=callbacks
)

# Compare accuracies (simple print)
# NOTE: not a like-for-like comparison - the MobileNetV2 figure includes its
# fine-tuning epochs, while EfficientNetB0 is trained with a frozen base only.
print("MobileNetV2 Val Acc:", max(history.history['val_accuracy'] + fine_history.history['val_accuracy']))
print("EfficientNetB0 Val Acc:", max(history_eff.history['val_accuracy']))