# **Emoji Classification Project - Computer Vision**

### Team Members : `Daniil NOTKIN`, `Yuhan SU` & `Yassine ERRAJI`

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## *Importations*

In [25]:
from warnings import filterwarnings
filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
import cv2
from pathlib import Path
from PIL import Image
from skimage import io, color
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models, optimizers

## *Functions*

In [26]:
def load_single_image(path):
    """
    This will be the function you use to preprocess your image.

    """
    img = Image.open(path).convert("RGBA")
    img = np.array(img)

    # RGBA -> RGB
    if img.ndim == 3 and img.shape[2] == 4:
        img = color.rgba2rgb(img)
    # grayscale -> RGB
    elif img.ndim == 2:
        img = np.stack([img] * 3, axis=-1)

    # unint8 conversion
    if img.dtype != np.uint8:
        img = (img * 255).astype(np.uint8)

    # here you could add additional preprocessing

    return img.astype("float32") / 255.0


# ============================================================
# Improved imageLoader
# - shuffles every epoch
# - resizes images
# - optional data augmentation
# - optional sample weights (to replace class_weight)
# ============================================================


def imageLoader(
    files,
    labels,
    batch_size,
    target_size=(72, 72),
    augment=False,
    class_weight=None,   # NEW: dict like {class_index: weight}
    seed=42              # NEW: for reproducibility if you want it
):
    """
    Python generator yielding:
      - (X, y) if class_weight is None
      - (X, y, sample_weight) if class_weight is provided

    Notes:
    - labels must be a dict: {img_id (string): label_name (string)}
    - label_to_index must already exist in the notebook scope
    """

    rng = np.random.default_rng(seed)

    files = np.array(files)
    L = len(files)

    while True:
        # Shuffle at the beginning of each epoch
        indices = rng.permutation(L)
        files_shuffled = files[indices]

        batch_start = 0
        while batch_start < L:
            batch_files = files_shuffled[batch_start:batch_start + batch_size]

            X_batch = np.empty((len(batch_files), target_size[1], target_size[0], 3), dtype=np.float32)
            y_batch = np.empty((len(batch_files),), dtype=np.int32)

            for i, f in enumerate(batch_files):
                img_id = Path(f).stem  # DO NOT CHANGE (constraint)

                # Load image (float32, [0,1])
                img = load_single_image(f)

                # Resize to fixed shape
                img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)

                # Data augmentation (train only)
                if augment:
                    # Horizontal flip
                    if rng.random() < 0.5:
                        img = cv2.flip(img, 1)

                    # Small rotation
                    if rng.random() < 0.3:
                        angle = rng.uniform(-15, 15)
                        M = cv2.getRotationMatrix2D(
                            (target_size[0] // 2, target_size[1] // 2),
                            angle,
                            1.0
                        )
                        img = cv2.warpAffine(
                            img,
                            M,
                            (target_size[0], target_size[1]),
                            flags=cv2.INTER_LINEAR,
                            borderMode=cv2.BORDER_REFLECT_101
                        )

                # Store
                X_batch[i] = img
                y_batch[i] = label_to_index[labels[img_id]]

            if class_weight is None:
                yield X_batch, y_batch
            else:
                # Convert per-class weights into per-sample weights
                sample_weight = np.array([class_weight[int(c)] for c in y_batch], dtype=np.float32)
                yield X_batch, y_batch, sample_weight

            batch_start += batch_size

### Dataset

In [27]:
PATH = Path("data")

In [28]:
train_dir = PATH / "train"
train_files = sorted([str(p) for p in train_dir.iterdir() if p.is_file()])

In [29]:
test_dir = PATH / "test"
test_files = sorted([str(p) for p in test_dir.iterdir() if p.is_file()])
test_ids = [Path(f).stem for f in test_files]

In [30]:
y_train_df = pd.read_csv(PATH / "train_labels.csv")

y_train_dct = dict(zip(y_train_df["Id"], y_train_df["Label"]))

In [31]:
gen = imageLoader(
    files=train_files,
    labels=y_train_dct,
    batch_size=32,
)

## Training

In [32]:
# ============================================================
# Train / Validation split
# ============================================================

# Split train files into train / validation
train_files_split, val_files_split = train_test_split(
    train_files,
    test_size=0.2,        # 80% train / 20% validation
    random_state=42,
    shuffle=True
)

print(f"Train samples: {len(train_files_split)}")
print(f"Validation samples: {len(val_files_split)}")

Train samples: 7903
Validation samples: 1976


In [33]:
unique_labels = y_train_df["Label"].unique().tolist()
unique_labels

['samsung', 'apple', 'facebook', 'google', 'messenger', 'whatsapp', 'mozilla']

### Label encoding (strings → integers)

Keras requires numeric labels.

In [35]:
# ============================================================
# Convert label dict keys to STRING to match Path(f).stem
# ============================================================

labels_str = {
    f"{int(k):05d}": v
    for k, v in y_train_dct.items()
}

In [36]:
# ============================================================
# Label encoding (string → integer)
# ============================================================

# Sorted for reproducibility
unique_labels = sorted(y_train_df["Label"].unique())

label_to_index = {label: i for i, label in enumerate(unique_labels)}
index_to_label = {i: label for label, i in label_to_index.items()}

num_classes = len(unique_labels)

print("Label to index mapping:")
print(label_to_index)

Label to index mapping:
{'apple': 0, 'facebook': 1, 'google': 2, 'messenger': 3, 'mozilla': 4, 'samsung': 5, 'whatsapp': 6}


In [37]:
# ============================================================
# Compute class weights to handle imbalance
# ============================================================

y_encoded = [label_to_index[label] for label in y_train_df["Label"]]

class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(num_classes),
    y=y_encoded
)

class_weight = dict(enumerate(class_weights_array))

print("Class weights:")
print(class_weight)

Class weights:
{0: np.float64(0.7335164835164835), 1: np.float64(0.8466021081497986), 2: np.float64(0.7518837050003806), 3: np.float64(2.4332512315270938), 4: np.float64(3.5548758546239654), 5: np.float64(0.7884277733439745), 6: np.float64(0.8584462982273201)}


In [38]:
# Encode string labels to integers
label_to_index = {label: i for i, label in enumerate(unique_labels)}
index_to_label = {i: label for label, i in label_to_index.items()}

num_classes = len(unique_labels)
print(label_to_index)

{'apple': 0, 'facebook': 1, 'google': 2, 'messenger': 3, 'mozilla': 4, 'samsung': 5, 'whatsapp': 6}


### Define a compact, efficient CNN

This model:

•   Trains fast

•   Fits in memory

•   Should be enough for this task

In [39]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models, optimizers

def build_model(input_shape=(72, 72, 3), num_classes=7):
    """
    MobileNetV2-based classifier.
    Phase 1: backbone frozen
    Phase 2: partial fine-tuning
    """

    # Pretrained backbone
    base_model = MobileNetV2(
        input_shape=input_shape,
        include_top=False,
        weights="imagenet"
    )

    # Phase 1: freeze entire backbone
    base_model.trainable = False

    # Classification head
    x = base_model.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    model = models.Model(inputs=base_model.input, outputs=outputs)

    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    return model

### **Instantiate the model**

In [40]:
# ============================================================
# Instantiate model
# ============================================================

model = build_model(num_classes=num_classes)
model.summary()

### Train the model

We must define steps_per_epoch manually because we use a generator.

In [43]:
# ============================================================
# Training generators (consistent with 72x72 model input)
# ============================================================

batch_size = 32
target_size = (72, 72)

# plots the gradual improvement of accuracy/loss for training/validation
# sets as a line plot
liveplot1 = LivePlot()

train_gen = imageLoader(
    files=train_files_split,
    labels=labels_str,          # keys must match Path(f).stem
    batch_size=batch_size,
    target_size=target_size,
    augment=True,
    class_weight=class_weight,  # IMPORTANT: generator will output sample weights
    seed=42
)

val_gen = imageLoader(
    files=val_files_split,
    labels=labels_str,
    batch_size=batch_size,
    target_size=target_size,
    augment=False,
    class_weight=None,          # no weights needed for validation
    seed=123
)

# Use ceil division so you don't drop the tail of the dataset
steps_per_epoch = int(np.ceil(len(train_files_split) / batch_size))
validation_steps = int(np.ceil(len(val_files_split) / batch_size))

print("steps_per_epoch:", steps_per_epoch)
print("validation_steps:", validation_steps)

steps_per_epoch: 247
validation_steps: 62


In [44]:
# ============================================================
# Training - Phase 1 (frozen backbone)
# ============================================================



history_phase1 = model.fit(
    train_gen,
    validation_data=val_gen,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    epochs=15,
    verbose=1
)

Epoch 1/15
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 264ms/step - accuracy: 0.7438 - loss: 0.6550 - val_accuracy: 0.6953 - val_loss: 0.8883
Epoch 2/15
[1m151/247[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m21s[0m 222ms/step - accuracy: 0.7436 - loss: 0.6427

KeyboardInterrupt: 

In [None]:
# plotting the changes after the fact

plt.figure(figsize=(10,4))

plt.subplot(1,2,1)
plt.plot(history_phase1.history["loss"], label="train loss")
plt.plot(history_phase1.history.get("val_loss", []), label="val loss")
plt.legend(); plt.grid()

plt.subplot(1,2,2)
plt.plot(history_phase1.history["accuracy"], label="train acc")
plt.plot(history_phase1.history.get("val_accuracy", []), label="val acc")
plt.legend(); plt.grid()

plt.show()


In [None]:
# ============================================================
# Fine-tuning - Phase 2
# ============================================================

import tensorflow as tf

# Number of layers to fine-tune from the end
FINE_TUNE_LAYERS = 60

# Freeze all layers first
for layer in model.layers:
    layer.trainable = False

# Unfreeze last N layers (except BatchNorm)
for layer in model.layers[-FINE_TUNE_LAYERS:]:
    if not isinstance(layer, layers.BatchNormalization):
        layer.trainable = True

# Recompile with lower learning rate (critical)
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# plotting the accuracy/loss for fine-tuning
liveplot2 = LivePlot()

history_phase2 = model.fit(
    train_gen,
    validation_data=val_gen,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    epochs=10,
    verbose=1
)

### **We now replace the dummy prediction function**

In [None]:
def predict_with_my_solid_model(test_files):
    """
    Predict labels for the test set using the trained CNN.
    """
    predictions = []

    for f in test_files:
        img = load_single_image(f)
        img = np.expand_dims(img, axis=0)  # (1, H, W, C)

        probs = model.predict(img, verbose=0)
        pred_idx = np.argmax(probs, axis=1)[0]
        predictions.append(index_to_label[pred_idx])

    return predictions

### **Generate submission**

In [None]:
y_test_pred = predict_with_my_solid_model(test_files)

test_ids_sr = pd.Series(test_ids, name="Id")
y_test_pred_sr = pd.Series(y_test_pred, name="Label")

submission_df = pd.concat([test_ids_sr, y_test_pred_sr], axis=1)
submission_df.head()

In [None]:
submission_df.to_csv("submission.csv", index=False)