# **Emoji Classification Project - Computer Vision**

### Team Members : `Daniil NOTKIN`, `Yuhan SU` & `Yassine ERRAJI`

## *Importations*

In [64]:
import os
import numpy as np
import pandas as pd 
import cv2
from pathlib import Path
from PIL import Image
from skimage import io, color
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from warnings import filterwarnings
filterwarnings("ignore")

## *Functions*

In [None]:
def load_single_image(path):
    """
    Load one image from disk and return it as a normalized RGB array.

    The image is decoded with PIL and forced to RGBA, so every input mode
    (grayscale, palette, RGB, RGBA) reaches the next step with the same
    4-channel layout. The alpha channel is then blended away with
    ``skimage.color.rgba2rgb``, the result is quantized to 8 bits and
    finally rescaled to floats.

    Parameters
    ----------
    path : str or path-like
        Location of the image file.

    Returns
    -------
    numpy.ndarray
        Array of shape (H, W, 3), dtype float32, values in [0, 1].
        The image is NOT resized here.
    """
    # The context manager releases the file handle as soon as the pixels
    # have been copied into the numpy array.
    with Image.open(path) as pil_img:
        rgba = np.array(pil_img.convert("RGBA"))

    # RGBA -> RGB (always applies: convert("RGBA") guarantees 4 channels,
    # so no separate grayscale branch is needed)
    rgb = color.rgba2rgb(rgba)

    # uint8 conversion (rgba2rgb returns floats; astype truncates)
    if rgb.dtype != np.uint8:
        rgb = (rgb * 255).astype(np.uint8)

    # here you could add additional preprocessing

    return rgb.astype("float32") / 255.0


# ============================================================
# Improved imageLoader
# - shuffles every epoch
# - resizes images
# - optional data augmentation
# ============================================================


def imageLoader(
    files,
    labels,
    batch_size,
    target_size=(72, 72),
    augment=False
):
    """
    Endless generator yielding (X, y) batches for training / validation.

    Each pass over the data starts with a fresh shuffle, so the batch
    composition differs from one pass to the next. The last batch of a pass
    may be smaller than ``batch_size``.

    Parameters
    ----------
    files : sequence of str
        Image file paths. The file stem (name without extension) is used as
        the key into ``labels``.
    labels : mapping
        Maps an image id (``Path(f).stem``) to its label. The label is then
        translated to a class index through the module-level
        ``label_to_index`` dict, which must exist before the first batch is
        requested.
    batch_size : int
        Maximum number of images per batch (must be >= 1).
    target_size : tuple of int
        Size passed to ``cv2.resize`` / ``cv2.warpAffine``, which interpret
        it as (width, height).
    augment : bool
        When True, apply a random horizontal flip (p=0.5) and a random
        rotation in [-15, 15] degrees (p=0.3) to each image.

    Yields
    ------
    X : numpy.ndarray
        float32 array of stacked images, values in [0, 1].
    y : numpy.ndarray
        int32 array of class indices, one per image.

    Raises
    ------
    ValueError
        If ``files`` is empty or ``batch_size`` is smaller than 1. Without
        this check an empty file list would spin forever without yielding.
        As with any generator, the error surfaces on the first ``next()``.
    """

    files = np.array(files)
    L = len(files)

    if L == 0:
        raise ValueError("imageLoader received an empty list of files")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Rotation centre only depends on target_size: compute it once
    center = (target_size[0] // 2, target_size[1] // 2)

    while True:
        # Shuffle at the beginning of each epoch
        files = files[np.random.permutation(L)]

        for batch_start in range(0, L, batch_size):
            batch_files = files[batch_start:batch_start + batch_size]

            X_batch = []
            y_batch = []

            for f in batch_files:
                img_id = Path(f).stem  # DO NOT CHANGE (constraint)

                # Load image (float32, [0,1])
                img = load_single_image(f)

                # Resize to fixed shape
                img = cv2.resize(img, target_size)

                # Data augmentation (train only)
                if augment:
                    # Horizontal flip
                    if np.random.rand() < 0.5:
                        img = cv2.flip(img, 1)

                    # Small rotation around the image centre, no scaling
                    if np.random.rand() < 0.3:
                        angle = np.random.uniform(-15, 15)
                        M = cv2.getRotationMatrix2D(center, angle, 1.0)
                        img = cv2.warpAffine(img, M, target_size)

                X_batch.append(img)
                y_batch.append(label_to_index[labels[img_id]])

            X = np.stack(X_batch).astype("float32")
            y = np.array(y_batch, dtype=np.int32)

            yield X, y

### Dataset

In [None]:
# Root folder of the dataset; the image folders and the label CSV used below
# are all resolved relative to it.
PATH = Path("data")

In [None]:
# Every regular file in the "train" folder, as string paths in sorted order
train_dir = PATH.joinpath("train")
train_files = sorted(str(entry) for entry in train_dir.iterdir() if entry.is_file())

In [None]:
# Every regular file in the "test" folder, as string paths in sorted order
test_dir = PATH.joinpath("test")
test_files = sorted(str(entry) for entry in test_dir.iterdir() if entry.is_file())
# Image id = file name without its extension, in the same order as test_files
test_ids = [Path(file_path).stem for file_path in test_files]

In [None]:
# Label table for the training set; the code below reads its "Id" and "Label" columns.
y_train_df = pd.read_csv(PATH / "train_labels.csv")

# Id -> Label lookup. Keys keep whatever type pandas parsed for the "Id" column.
y_train_dct = dict(zip(y_train_df["Id"], y_train_df["Label"]))

In [None]:
# Instantiate the loader on the full training set with default size / no augmentation.
# Generators are lazy, so no image is read at this point.
# NOTE(review): nothing in the visible notebook consumes `gen`; the training
# cell builds its own generators from `labels_str` instead — confirm whether
# this cell is still needed.
gen = imageLoader(
    files=train_files,
    labels=y_train_dct,
    batch_size=32,
)

## Training

In [None]:
# ============================================================
# Train / Validation split
# ============================================================

# Split the list of training file paths into train / validation subsets.
# The fixed random_state makes the split reproducible.
# NOTE(review): no `stratify` argument is passed, so class proportions are
# not guaranteed to match between the two subsets.
train_files_split, val_files_split = train_test_split(
    train_files,
    test_size=0.2,        # 80% train / 20% validation
    random_state=42,
    shuffle=True
)

# Sanity check on the subset sizes
print(f"Train samples: {len(train_files_split)}")
print(f"Validation samples: {len(val_files_split)}")

In [None]:
# Distinct label values found in the training labels (shown as the cell output).
# This list is rebuilt in sorted order in the label-encoding cell further down.
unique_labels = y_train_df["Label"].unique().tolist()
unique_labels

### Label encoding (strings → integers)

Keras requires numeric labels.

In [None]:
# ============================================================
# Convert label dict keys to STRING to match Path(f).stem
# ============================================================

# imageLoader looks labels up by file stem (a string), so each Id is
# re-keyed as a zero-padded, 5-character decimal string.
# NOTE(review): assumes image files are named with 5-digit zero-padded ids
# (e.g. "00042.png") — confirm against the data folder.
labels_str = {
    f"{int(k):05d}": v
    for k, v in y_train_dct.items()
}

In [None]:
# ============================================================
# Label encoding (string -> integer)
# ============================================================

# Sorted for reproducibility: the class index of a label does not depend on
# the row order of the CSV.
unique_labels = sorted(y_train_df["Label"].unique())

# Forward and reverse lookups between labels and class indices.
# imageLoader reads `label_to_index` as a global; the prediction function
# reads `index_to_label`.
label_to_index = {label: i for i, label in enumerate(unique_labels)}
index_to_label = {i: label for label, i in label_to_index.items()}

num_classes = len(unique_labels)

print("Label to index mapping:")
print(label_to_index)

In [None]:
# ============================================================
# Compute class weights to handle imbalance
# ============================================================

# Class index of every row in the label table.
# NOTE(review): this covers ALL labelled samples, i.e. including the files
# that end up in the validation split.
y_encoded = [label_to_index[label] for label in y_train_df["Label"]]

# One "balanced" weight per class index 0..num_classes-1
class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(num_classes),
    y=y_encoded
)

# {class index: weight}
class_weight = dict(enumerate(class_weights_array))

print("Class weights:")
print(class_weight)

In [None]:
# Encode string labels to integers
# NOTE(review): this repeats the label-encoding cell above and rebuilds the
# same mappings from the same `unique_labels`; it looks redundant.
label_to_index = {label: i for i, label in enumerate(unique_labels)}
index_to_label = {i: label for label, i in label_to_index.items()}

num_classes = len(unique_labels)
print(label_to_index)

### Wrap the existing generator for Keras

The imageLoader already yields (X, y) — we just adapt labels.

In [None]:
def keras_generator(files, labels_dict, batch_size, target_size=(128, 128)):
    """
    Yield Keras-ready (X, y) batches at the requested image size.

    Thin wrapper around ``imageLoader``, which already resizes the images
    and converts labels to integer class indices. The wrapper therefore only
    forwards ``target_size``:

    - images are resized once, straight to ``target_size`` (instead of being
      shrunk to imageLoader's default size and then resized a second time);
    - labels are passed through untouched. Looking the already-encoded
      indices up in ``label_to_index`` again would fail, because that dict
      is keyed by the original labels, not by class indices.

    Parameters
    ----------
    files : sequence of str
        Image file paths.
    labels_dict : mapping
        Maps an image id (file stem) to its label.
    batch_size : int
        Maximum number of images per batch.
    target_size : tuple of int
        Output image size, forwarded to ``imageLoader``.

    Yields
    ------
    (X, y)
        Exactly what ``imageLoader`` yields: a float32 image batch and an
        int32 array of class indices.
    """
    yield from imageLoader(
        files,
        labels_dict,
        batch_size,
        target_size=target_size,
    )

### Define a compact, efficient CNN

This model:

•   Trains fast

•   Fits in memory

•   Should be enough for this task

In [None]:
# ============================================================
# Build a stronger but still lightweight CNN
# ============================================================

def build_model(input_shape=(72, 72, 3), num_classes=7):
    """
    Build and compile the CNN classifier.

    Architecture: four identical stages (3x3 "same" convolution with ReLU,
    followed by max pooling) with 32, 64, 128 and 256 filters, then global
    average pooling, a 256-unit dense layer with dropout 0.4, and a softmax
    output layer.

    Parameters
    ----------
    input_shape : tuple of int
        Shape of one input image, (height, width, channels).
    num_classes : int
        Number of output classes (size of the softmax layer).

    Returns
    -------
    A compiled Keras ``Sequential`` model using the Adam optimizer,
    sparse categorical cross-entropy (labels are integer class indices,
    not one-hot vectors) and the accuracy metric.
    """
    # Explicit Input layer instead of passing `input_shape=` to the first
    # Conv2D, which is the form recommended for Sequential models.
    stack = [layers.Input(shape=input_shape)]

    # Convolutional blocks 1-4: same pattern, doubling the filter count
    for filters in (32, 64, 128, 256):
        stack.append(layers.Conv2D(filters, 3, padding="same", activation="relu"))
        stack.append(layers.MaxPooling2D())

    # Head
    stack.extend([
        layers.GlobalAveragePooling2D(),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.4),
        layers.Dense(num_classes, activation="softmax"),
    ])

    model = models.Sequential(stack)

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    return model

### **Instantiate the model**

In [None]:
# ============================================================
# Instantiate model
# ============================================================

# input_shape is left at build_model's default (72, 72, 3), which matches the
# default target_size=(72, 72) of imageLoader.
model = build_model(num_classes=num_classes)
model.summary()

### Train the model

We must define steps_per_epoch manually because we use a generator.

In [None]:
# ============================================================
# Training (with sample weights since class_weight isn't supported for Python generators)
# ============================================================

batch_size = 32

# 1) Base generators: imageLoader yields (X, y) with y as integer class
#    indices. Augmentation is enabled for training only.
train_gen_base = imageLoader(
    files=train_files_split,
    labels=labels_str,
    batch_size=batch_size,
    augment=True
)

val_gen = imageLoader(
    files=val_files_split,
    labels=labels_str,
    batch_size=batch_size,
    augment=False
)

# 2) Turn the class_weight dict {0: w0, 1: w1, ...} into a vector so that a
#    whole batch of labels can be mapped to weights with one indexing op.
class_weight_vec = np.array([class_weight[i] for i in range(num_classes)], dtype=np.float32)

def add_sample_weights(gen, class_weight_vector):
    """
    Wrap a (X, y) generator to yield (X, y, sample_weight).

    sample_weight[i] = class_weight_vector[y[i]], i.e. every sample gets
    the weight of its class. Runs for as long as `gen` keeps yielding.
    """
    for X, y in gen:  # y is shape (batch,)
        yield X, y, class_weight_vector[y]  # weights are shape (batch,)

train_gen = add_sample_weights(train_gen_base, class_weight_vec)

# 3) Steps: imageLoader also yields the final, smaller batch of each pass,
#    so one full pass is ceil(n / batch_size) batches. Rounding up keeps each
#    Keras epoch aligned with one pass of the loader, evaluates the whole
#    validation set every epoch, and never produces 0 steps for a small set.
steps_per_epoch = int(np.ceil(len(train_files_split) / batch_size))
validation_steps = int(np.ceil(len(val_files_split) / batch_size))

# 4) Train (no class_weight argument: the weights travel with each batch)
history = model.fit(
    train_gen,
    validation_data=val_gen,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    epochs=20,
    verbose=1
)

### **We now replace the dummy prediction function**

In [None]:
def predict_with_my_solid_model(test_files, batch_size=32, target_size=(72, 72)):
    """
    Predict labels for the test set using the trained CNN.

    Images go through the same preprocessing as during training:
    ``load_single_image`` followed by a resize to ``target_size``. Without
    the resize, any image whose native size differs from the model input
    would be rejected by the network. Images are sent to the model in
    batches rather than one ``predict`` call per file.

    Parameters
    ----------
    test_files : sequence of str
        Paths of the images to classify.
    batch_size : int
        Number of images per ``model.predict`` call.
    target_size : tuple of int
        Size passed to ``cv2.resize`` (width, height). The default matches
        the default ``target_size`` of ``imageLoader``.

    Returns
    -------
    list
        One predicted label per input file, in the same order as
        ``test_files``. Empty if ``test_files`` is empty.
    """
    predictions = []

    for start in range(0, len(test_files), batch_size):
        chunk = test_files[start:start + batch_size]

        # (batch, H, W, C) float32 batch, preprocessed like the training data
        batch = np.stack([
            cv2.resize(load_single_image(f), target_size) for f in chunk
        ]).astype("float32")

        probs = model.predict(batch, verbose=0)

        # int() turns the numpy integer into a plain dict key
        predictions.extend(
            index_to_label[int(idx)] for idx in np.argmax(probs, axis=1)
        )

    return predictions

### **Generate submission**

In [None]:
# Predicted label for every test file, in the order of test_files
y_test_pred = predict_with_my_solid_model(test_files)

# Named Series become the "Id" and "Label" columns of the submission table
test_ids_sr = pd.Series(test_ids, name="Id")
y_test_pred_sr = pd.Series(y_test_pred, name="Label")

# Side-by-side concatenation: one row per test image
submission_df = pd.concat([test_ids_sr, y_test_pred_sr], axis=1)
submission_df.head()

In [None]:
# Write the submission to the working directory, without the DataFrame index column
submission_df.to_csv("submission.csv", index=False)