In [27]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
import copy

import torchvision.models as models
import torch.optim as optim

In [28]:
# Load the combined metadata/image frame and keep only rows that actually
# carry an image. (Pickle files execute code on load; only read trusted files.)
df = pd.read_pickle('combined_df.pkl').dropna(subset=['image'])

In [29]:
# Mapping dictionary: coarse engagement class -> fine-grained emotion labels
engagement_mapping = {
    "not engaged": [
        "isolation", "neglect", "pity", "sentimentality", "loneliness", "gloom", "alienation", "defeat", "anguish", "dejection",
        "hopelessness", "melancholy", "depression", "homesickness", "longing"
    ],
    "engaged-positive": [
        "lust", "desire", "infatuation", "passion", "attraction", "liking",
        "excitement", "hope", "optimism", "eagerness", "zeal", "arousal", "joy", "zest",
        "cheerfulness", "happiness", "elation", "rapture", "enjoyment", "gladness",
        "bliss", "gaiety", "jubilation", "delight", "euphoria", "jolliness", "joviality",
        "glee", "ecstasy", "caring", "love", "tenderness", "affection", "adoration",
        "fondness", "compassion", "sympathy", "pleasure", "pride", "satisfaction",
        "contentment", "relief", "triumph", "enthusiasm", "amusement", "surprise",
        "astonishment", "amazement", "shock", "thrill", "exhilaration", "enthrallment"
    ],
    "engaged-negative": [
        "irritation", "wrath", "annoyance", "rage", "aggravation", "anger", "resentment",
        "grumpiness", "frustration", "fury", "hostility", "exasperation", "outrage",
        "grouchiness", "spite", "unhappiness", "disappointment", "insult",
        "rejection", "agitation", "bitterness", "hate",
        "disgust", "dislike", "contempt", "scorn", "displeasure", "envy", "loathing",
        "jealousy", "revulsion", "nervousness", "alarm", "fear", "fright", "horror",
        "terror", "dread", "hysteria", "dismay", "apprehension", "worry", "panic",
        "tenseness", "uneasiness", "anxiety", "suffering", "hurt", "agony",
        "insecurity", "distress", "torment", "sadness", "grief", "glumness", "sorrow", "despair", "misery", "woe", "regret", "guilt", "shame", "embarrassment", "mortification",
        "remorse", "humiliation"
    ]
}

# Step 1: Flatten the mapping into {emotion label: engagement class}
flat_mapping = {}
for engagement_type, labels in engagement_mapping.items():
    for label in labels:
        flat_mapping[label] = engagement_type

# Step 2: Apply mapping to your DataFrame
df['engagement_type'] = df['label'].map(flat_mapping)

# Series.map leaves NaN for any label missing from flat_mapping. Report those
# rows explicitly so they are not lost unnoticed in later group-by steps.
unmapped_labels = df.loc[df['engagement_type'].isna(), 'label']
if not unmapped_labels.empty:
    print(f"Warning: {len(unmapped_labels)} rows have no engagement mapping; "
          f"labels: {sorted(unmapped_labels.dropna().astype(str).unique())}")

# Step 3: Optional - check distribution
print(df['engagement_type'].value_counts())

engagement_type
engaged-negative    13160
engaged-positive    12874
not engaged          3652
Name: count, dtype: int64


In [30]:
# Show the column names now present in df.
df.columns

Index(['name', 'description', 'label', 'base_name', 'emotion_category',
       'image', 'engagement_type'],
      dtype='object')

In [31]:
# Preview the first rows of the listed metadata columns (the 'image' column is left out).
df[['name', 'description', 'label', 'base_name', 'emotion_category', 'engagement_type']].head()

Unnamed: 0,name,description,label,base_name,emotion_category,engagement_type
0,bigstockphoto_irritation_portrait_450_147.jpg,Image of displeased beautiful woman talking on...,irritation,bigstockphoto_irritation_portrait_450_147,Anger / Irritation,engaged-negative
3,alamy_frustration_face_10_76.jpg,Cute little blue eyed european blond boy looks...,frustration,alamy_frustration_face_10_76,Anger / Irritation,engaged-negative
4,alamy_irritation_portrait_10_10.jpg,Portrait of young man with acne problem at home,irritation,alamy_irritation_portrait_10_10,Anger / Irritation,engaged-negative
5,shutterstock_wrath_look_10_89.jpg,Portrait Angry Business Man Fists Air Stock Ph...,wrath,shutterstock_wrath_look_10_89,Anger / Irritation,engaged-negative
6,alamy_irritation_face_11_98.jpg,,irritation,alamy_irritation_face_11_98,Anger / Irritation,engaged-negative


In [32]:
# Step 1: Define the target sample size (the size of the rarest class)
min_class_size = df['engagement_type'].value_counts().min()

# Step 2: Sample each class down to the minimum size.
# Sampling each group directly and concatenating avoids
# DataFrameGroupBy.apply operating on the grouping column, which is deprecated
# and would drop 'engagement_type' from the result in future pandas versions.
# Each group is still sampled with its own random_state=42 and concatenated
# in group-key order, so the selected rows match the previous apply() version.
df_balanced = pd.concat(
    [
        class_rows.sample(n=min_class_size, random_state=42)
        for _, class_rows in df.groupby('engagement_type')
    ]
).reset_index(drop=True)

# Step 3: Check the balance
print(df_balanced['engagement_type'].value_counts())

engagement_type
engaged-negative    3652
engaged-positive    3652
not engaged         3652
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=min_class_size, random_state=42))


In [33]:
# Encode the engagement class names as integer ids in a new 'label_encoded'
# column, and persist the fitted encoder to disk so class ids can be decoded
# back to names later (the webcam cell reloads this file).
le = LabelEncoder()
df_balanced['label_encoded'] = le.fit_transform(df_balanced['engagement_type'])
joblib.dump(le, 'engagement_label_encoder.joblib')

['engagement_label_encoder.joblib']

In [34]:
class EngagementDataset(Dataset):
    """Map-style dataset over a DataFrame with 'image' and 'label_encoded' columns.

    Both columns are copied into plain Python lists at construction time, so
    the dataset does not keep a reference to the DataFrame's index.
    """

    def __init__(self, dataframe):
        image_column = dataframe['image']
        label_column = dataframe['label_encoded']
        self.images = image_column.tolist()
        self.labels = label_column.tolist()

    def __len__(self):
        # One sample per stored image.
        return len(self.images)

    def __getitem__(self, idx):
        # Return the (image, encoded label) pair at position idx.
        sample = self.images[idx]
        target = self.labels[idx]
        return sample, target

In [35]:
# Hold out 20% of the balanced frame for validation, stratified on the encoded
# label so each class keeps the same share in both splits.
train_df, val_df = train_test_split(
    df_balanced,
    test_size=0.2,
    random_state=42,
    stratify=df_balanced['label_encoded'],
)

# Wrap each split in a dataset; only the training batches are shuffled.
train_dataset, val_dataset = EngagementDataset(train_df), EngagementDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [36]:
# One output unit per engagement class known to the label encoder.
num_classes = len(le.classes_)

# Load ImageNet-pretrained AlexNet. `pretrained=True` is deprecated in
# torchvision >= 0.13; the explicit weights enum requests the same checkpoint.
model = models.alexnet(weights=models.AlexNet_Weights.IMAGENET1K_V1)

# Swap the final classifier layer for one sized to our classes.
model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)



In [37]:
# Cross-entropy loss on the model outputs; Adam over all model parameters at lr=1e-4.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [38]:
num_epochs = 10

for epoch in range(num_epochs):
    # -------- TRAINING --------
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:
        # The DataLoader already yields labels as a tensor; wrapping it in
        # torch.tensor() again copies it and emits a UserWarning per batch.
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = running_loss / len(train_loader)

    # -------- VALIDATION --------
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # Predicted class = index of the largest logit per sample.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_accuracy = 100 * correct / total

    # -------- LOG RESULTS --------
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%")


  images, labels = images.to(device), torch.tensor(labels).to(device)
  images, labels = images.to(device), torch.tensor(labels).to(device)


Epoch 1/10 | Train Loss: 0.9789 | Val Accuracy: 56.71%
Epoch 2/10 | Train Loss: 0.8616 | Val Accuracy: 57.44%
Epoch 3/10 | Train Loss: 0.7700 | Val Accuracy: 57.71%
Epoch 4/10 | Train Loss: 0.6615 | Val Accuracy: 56.98%
Epoch 5/10 | Train Loss: 0.5427 | Val Accuracy: 58.67%
Epoch 6/10 | Train Loss: 0.4289 | Val Accuracy: 57.39%
Epoch 7/10 | Train Loss: 0.3264 | Val Accuracy: 57.12%
Epoch 8/10 | Train Loss: 0.2538 | Val Accuracy: 57.12%
Epoch 9/10 | Train Loss: 0.2169 | Val Accuracy: 56.89%
Epoch 10/10 | Train Loss: 0.1713 | Val Accuracy: 57.25%


In [39]:
import copy

num_epochs = 10

# Reuse the best checkpoint from an earlier run in this kernel if there is one;
# otherwise start fresh. Without this guard the cell raises NameError on a
# freshly restarted kernel, because nothing else defines these names first.
try:
    best_val_accuracy, best_model_state
except NameError:
    best_val_accuracy, best_model_state = 0.0, None

for epoch in range(num_epochs):
    # -------- TRAINING --------
    model.train()
    running_loss = 0.0
    correct_train, total_train = 0, 0

    for images, labels in train_loader:
        # labels is already a tensor from the DataLoader; re-wrapping it with
        # torch.tensor() copies it and emits a UserWarning per batch.
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    avg_train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    # -------- VALIDATION --------
    model.eval()
    correct_val, total_val = 0, 0
    val_loss = 0.0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct_val / total_val

    # -------- LOG RESULTS --------
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}%")

    # -------- SAVE BEST MODEL --------
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        # Deep-copy so later optimizer steps cannot modify the saved weights.
        best_model_state = copy.deepcopy(model.state_dict())
        print("✅ New best model saved.")

# -------- LOAD BEST MODEL AFTER TRAINING --------
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\n🏁 Best validation accuracy: {best_val_accuracy:.2f}%")
else:
    print("❌ No improvement during training.")


  images, labels = images.to(device), torch.tensor(labels).to(device)
  images, labels = images.to(device), torch.tensor(labels).to(device)


Epoch 1/10 | Train Loss: 0.1557 | Train Acc: 94.12% | Val Loss: 1.6248 | Val Acc: 57.80%
Epoch 2/10 | Train Loss: 0.1415 | Train Acc: 94.69% | Val Loss: 1.6574 | Val Acc: 58.49%
Epoch 3/10 | Train Loss: 0.1329 | Train Acc: 94.60% | Val Loss: 1.8441 | Val Acc: 56.98%
Epoch 4/10 | Train Loss: 0.1171 | Train Acc: 95.32% | Val Loss: 1.6399 | Val Acc: 56.89%
Epoch 5/10 | Train Loss: 0.1106 | Train Acc: 95.46% | Val Loss: 1.9287 | Val Acc: 55.93%
Epoch 6/10 | Train Loss: 0.1100 | Train Acc: 95.20% | Val Loss: 1.9009 | Val Acc: 57.03%
Epoch 7/10 | Train Loss: 0.1127 | Train Acc: 95.25% | Val Loss: 1.8682 | Val Acc: 59.35%
✅ New best model saved.
Epoch 8/10 | Train Loss: 0.1002 | Train Acc: 95.60% | Val Loss: 2.0391 | Val Acc: 58.53%
Epoch 9/10 | Train Loss: 0.1028 | Train Acc: 95.94% | Val Loss: 2.0941 | Val Acc: 54.84%
Epoch 10/10 | Train Loss: 0.0924 | Train Acc: 95.77% | Val Loss: 2.0452 | Val Acc: 58.85%

🏁 Best validation accuracy: 59.35%


In [40]:
# Build a fresh Adam optimizer over the current model parameters, replacing the one used so far.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [41]:
import copy

num_epochs = 10

# This cell continues training, so the best checkpoint from an earlier run is
# deliberately carried over. Fall back to a fresh start when none exists, so
# the cell does not raise NameError on a restarted kernel.
try:
    best_val_accuracy, best_model_state
except NameError:
    best_val_accuracy, best_model_state = 0.0, None

for epoch in range(num_epochs):
    # -------- TRAINING --------
    model.train()
    running_loss = 0.0
    correct_train, total_train = 0, 0

    for images, labels in train_loader:
        # labels is already a tensor from the DataLoader; re-wrapping it with
        # torch.tensor() copies it and emits a UserWarning per batch.
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    avg_train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    # -------- VALIDATION --------
    model.eval()
    correct_val, total_val = 0, 0
    val_loss = 0.0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct_val / total_val

    # -------- LOG RESULTS --------
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}%")

    # -------- SAVE BEST MODEL --------
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        # Deep-copy so later optimizer steps cannot modify the saved weights.
        best_model_state = copy.deepcopy(model.state_dict())
        print("✅ New best model saved.")

# -------- LOAD BEST MODEL AFTER TRAINING --------
# Note: if no epoch in this run beat the carried-over best, this restores the
# earlier checkpoint and discards the weights trained in this cell.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\n🏁 Best validation accuracy: {best_val_accuracy:.2f}%")
else:
    print("❌ No improvement during training.")


  images, labels = images.to(device), torch.tensor(labels).to(device)
  images, labels = images.to(device), torch.tensor(labels).to(device)


Epoch 1/10 | Train Loss: 0.1089 | Train Acc: 95.61% | Val Loss: 2.3149 | Val Acc: 57.39%
Epoch 2/10 | Train Loss: 0.1006 | Train Acc: 95.85% | Val Loss: 2.4829 | Val Acc: 58.03%
Epoch 3/10 | Train Loss: 0.1032 | Train Acc: 95.45% | Val Loss: 2.0431 | Val Acc: 58.30%
Epoch 4/10 | Train Loss: 0.0969 | Train Acc: 95.92% | Val Loss: 2.3320 | Val Acc: 56.89%
Epoch 5/10 | Train Loss: 0.0936 | Train Acc: 95.70% | Val Loss: 2.6946 | Val Acc: 57.76%
Epoch 6/10 | Train Loss: 0.0928 | Train Acc: 95.96% | Val Loss: 2.1022 | Val Acc: 58.03%
Epoch 7/10 | Train Loss: 0.0857 | Train Acc: 96.04% | Val Loss: 2.4626 | Val Acc: 58.26%
Epoch 8/10 | Train Loss: 0.0811 | Train Acc: 96.50% | Val Loss: 2.0467 | Val Acc: 56.30%
Epoch 9/10 | Train Loss: 0.0833 | Train Acc: 96.25% | Val Loss: 2.4335 | Val Acc: 58.44%
Epoch 10/10 | Train Loss: 0.0755 | Train Acc: 96.33% | Val Loss: 2.1407 | Val Acc: 57.85%

🏁 Best validation accuracy: 59.35%


In [42]:
# Serialize the whole model object (not just its state_dict) to disk.
# NOTE(review): loading this file later requires the same class definitions to be
# importable; consider saving model.state_dict() instead if portability matters.
torch.save(model, "alexnet_best_final_corrected.pth")

In [43]:
# Print the module structure of the model.
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

# Replace the whole classifier head with a freshly initialised one that uses
# stronger dropout (p=0.6) in front of the two hidden linear layers.
model.classifier = nn.Sequential(
    nn.Dropout(p=0.6),
    nn.Linear(256 * 6 * 6, 4096),
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.6),
    nn.Linear(4096, 4096),
    nn.ReLU(inplace=True),
    nn.Linear(4096, num_classes)
)

model.to(device)  # 🔁 Move full model to the correct device AFTER modifying

# Create the optimizer only after the new head exists. Built earlier, it would
# hold the old classifier's parameters and never update the new layers.
optimizer = optim.Adam(model.parameters(), lr=1e-5)


import copy

num_epochs = 30
# Start best-checkpoint tracking from scratch for this run.
best_val_accuracy = 0.0
best_model_state = None

for epoch in range(num_epochs):
    # -------- TRAINING --------
    model.train()
    running_loss = 0.0
    correct_train, total_train = 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Predicted class = index of the largest logit per sample.
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    avg_train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    # -------- VALIDATION --------
    model.eval()
    running_val_loss = 0.0
    correct_val, total_val = 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    avg_val_loss = running_val_loss / len(val_loader)
    val_accuracy = 100 * correct_val / total_val

    # -------- SAVE BEST MODEL --------
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model_state = copy.deepcopy(model.state_dict())  # Save model weights
        print(f"✅ New best model found at epoch {epoch+1} with Val Acc: {val_accuracy:.2f}%")

    # -------- LOG RESULTS --------
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}%")

# -------- LOAD BEST MODEL AFTER TRAINING --------
# best_model_state stays None if no epoch ever beat the initial 0.0 accuracy;
# load_state_dict(None) would crash, so guard it like the earlier training cells.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"🏁 Best validation accuracy was: {best_val_accuracy:.2f}%")
else:
    print("❌ No improvement during training.")


# TESTING THE MODEL 

In [14]:
import cv2
import torch
import torch.nn.functional as F
import time
from torchvision import transforms
import joblib

# ===== Load model (if not already) =====
#model.load_state_dict(best_model_state)
model.eval()
model.to(device)

# ===== Load label encoder =====
le = joblib.load("engagement_label_encoder.joblib")

# ===== Define preprocessing =====
# NOTE(review): this must match whatever preprocessing produced the training
# images stored in the DataFrame — that step is not visible here; confirm.
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# ===== Open webcam =====
cap = cv2.VideoCapture(0)

last_prediction_time = 0
prediction_interval = 2  # seconds
current_prediction = "..."

try:
    if not cap.isOpened():
        # Do not enter the capture loop at all when the device failed to open.
        print("❌ Cannot open webcam.")
    else:
        print("📷 Webcam started. Press 'q' to quit.")

        while True:
            ret, frame = cap.read()
            if not ret:
                print("❌ Failed to grab frame.")
                break

            # Only run prediction every 2 seconds
            if time.time() - last_prediction_time >= prediction_interval:
                # OpenCV delivers BGR frames; the preprocessing expects RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                input_tensor = preprocess(rgb_frame).unsqueeze(0).to(device)

                with torch.no_grad():
                    output = model(input_tensor)
                    pred_idx = output.argmax(dim=1).item()
                    current_prediction = le.inverse_transform([pred_idx])[0]

                last_prediction_time = time.time()

            # Draw prediction on frame
            cv2.putText(frame, f'Engagement: {current_prediction}', (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2, cv2.LINE_AA)

            # Show webcam feed with overlay
            cv2.imshow("Live Engagement Detection", frame)

            # Press 'q' to exit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

except KeyboardInterrupt:
    print("\n⏹️ Stopped by user.")

finally:
    # Always free the camera and close the preview window, whatever the exit path.
    cap.release()
    cv2.destroyAllWindows()


📷 Webcam started. Press 'q' to quit.

⏹️ Stopped by user.


## Debugging

In [15]:
# Decode the class index left in pred_idx by the webcam cell back into its class name.
le.inverse_transform([pred_idx])[0]

'not engaged'

In [16]:
# Show the class names known to the label encoder.
print(le.classes_)


['engaged-negative' 'engaged-positive' 'not engaged']


In [17]:
# Check how many samples each engagement class has in the balanced frame.
print(df_balanced['engagement_type'].value_counts())


engagement_type
engaged-negative    3652
engaged-positive    3652
not engaged         3652
Name: count, dtype: int64


All three classes are perfectly balanced at 3652 samples each, so a bias toward a majority class caused by class imbalance is not the issue. The model saw the same number of examples for every engagement type, yet it still defaults to predicting "not engaged" on webcam frames. So now we move on to:

#### Test the model on a known sample from validation data

In [18]:
model.eval()

# Take a known validation sample (index 0 — change the index to try another one)
image, true_label = val_dataset[0]

# Add a batch dimension and move the sample to the model's device
input_tensor = image.unsqueeze(0).to(device)

# Forward pass without gradient tracking, then take the top-scoring class
with torch.no_grad():
    output = model(input_tensor)
pred_idx = output.argmax(dim=1).item()

# Decode both class ids back to their names in a single encoder call
predicted_label, true_label_name = le.inverse_transform([pred_idx, true_label])

print(f"✅ True label:      {true_label_name}")
print(f"🤖 Model predicted: {predicted_label}")


✅ True label:      engaged-negative
🤖 Model predicted: engaged-negative


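Your model correctly predicted engaged-negative for a known validation sample. This suggests that:

🧠 The model itself is working

✅ Inference decoding is correct

🧪 Training and validation pipelines appear to be functioning

A single sample is weak evidence, and validation accuracy is only around 58%, so repeat this check over the whole validation set before drawing firm conclusions. Still, the problem is more likely in the webcam input pipeline than in the model.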

#### Debug the webcam input & preprocessing

In [19]:
import matplotlib.pyplot as plt

# Capture one frame.
# The live-preview cell releases `cap` in its finally block, so reading from
# that handle always fails. Open a fresh capture here and release it right
# after the single read.
cap = cv2.VideoCapture(0)
try:
    ret, frame = cap.read()
finally:
    cap.release()  # Release after capture

if not ret:
    print("❌ Failed to grab frame.")
else:
    # Convert BGR to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Preprocess
    input_tensor = preprocess(rgb_frame)

    # De-normalize for display: Normalize(mean=-m/s, std=1/s) computes
    # (x + m/s) * s = x*s + m, which inverts the Normalize step in `preprocess`.
    unnorm = transforms.Normalize(
        mean=[-m/s for m, s in zip([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])],
        std=[1/s for s in [0.229, 0.224, 0.225]]
    )
    # Reorder channels-first to channels-last and rescale to 8-bit for matplotlib.
    image_vis = unnorm(input_tensor).permute(1, 2, 0).cpu().numpy()
    image_vis = (image_vis * 255).clip(0, 255).astype('uint8')

    # Display using matplotlib
    plt.imshow(image_vis)
    plt.axis("off")
    plt.title("Preprocessed Image Sent to Model")
    plt.show()


❌ Failed to grab frame.
