In [7]:
!pip install faiss-cpu
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import kagglehub
import random
import faiss
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras import layers, models, optimizers, callbacks
from sentence_transformers import SentenceTransformer


CONFIG = {
    'IMG_SIZE': (48, 48),
    'BATCH_SIZE': 64,
    'EPOCHS': 25,
    'LEARNING_RATE': 0.001,
    'SEED': 42,
    'CLASSES': ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'],
    'VECTOR_MODEL': 'all-MiniLM-L6-v2' # Lightweight, fast sentence-transformer
}


np.random.seed(CONFIG['SEED'])
tf.random.set_seed(CONFIG['SEED'])
random.seed(CONFIG['SEED'])

print("="*50)
print("Pipeline Initiated: FER -> Review Gen -> Vector Store")
print("="*50)


print("\n--- [STAGE 1] DATASET PREPARATION ---")
try:
    dataset_path = kagglehub.dataset_download("msambare/fer2013")
    print(f"[SUCCESS] Dataset found at: {dataset_path}")
except Exception as e:
    print(f"[ERROR] Failed to download dataset: {e}")
    exit()

train_dir = os.path.join(dataset_path, 'train')
test_dir = os.path.join(dataset_path, 'test')


print("[INFO] Loading Data...")
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir, labels='inferred', label_mode='categorical',
    class_names=CONFIG['CLASSES'], color_mode='grayscale',
    batch_size=CONFIG['BATCH_SIZE'], image_size=CONFIG['IMG_SIZE'],
    shuffle=True, seed=CONFIG['SEED']
)

test_ds_raw = tf.keras.utils.image_dataset_from_directory(
    test_dir, labels='inferred', label_mode='categorical',
    class_names=CONFIG['CLASSES'], color_mode='grayscale',
    batch_size=CONFIG['BATCH_SIZE'], image_size=CONFIG['IMG_SIZE'],
    shuffle=False
)

# Get file paths and true labels from the raw dataset before caching/prefetching
file_paths = test_ds_raw.file_paths
true_labels_raw = np.concatenate([y for x, y in test_ds_raw], axis=0)

AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds_raw.cache().prefetch(buffer_size=AUTOTUNE)

print("\n--- [STAGE 1] MODEL TRAINING ---")
def build_fer_cnn(input_shape, num_classes):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1),
        layers.RandomZoom(0.1),
        layers.Rescaling(1./255),
        layers.Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal'),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.2),
        layers.Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal'),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.3),
        layers.Conv2D(256, (3, 3), padding='same', kernel_initializer='he_normal'),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.4),
        layers.Flatten(),
        layers.Dense(512, kernel_initializer='he_normal'),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ], name="FER_Custom_CNN")
    return model


if os.path.exists("best_fer_model.keras"):
    print("[INFO] Loading existing trained model...")
    model = models.load_model("best_fer_model.keras")
else:
    print("[INFO] Training new model...")
    model = build_fer_cnn((48, 48, 1), len(CONFIG['CLASSES']))
    model.compile(optimizer=optimizers.Adam(learning_rate=CONFIG['LEARNING_RATE']),
                  loss='categorical_crossentropy', metrics=['accuracy'])

    callbacks_list = [
        callbacks.ModelCheckpoint("best_fer_model.keras", save_best_only=True, monitor='val_accuracy'),
        callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
    ]
    history = model.fit(train_ds, epochs=CONFIG['EPOCHS'], validation_data=test_ds, callbacks=callbacks_list)

print("\n--- [STAGE 1] EVALUATION ---")
y_pred_probs = []
for images, labels in test_ds:
    y_pred_probs.extend(model.predict(images, verbose=0))

y_pred = np.argmax(np.array(y_pred_probs), axis=1)
y_true = np.argmax(true_labels_raw, axis=1) # Use true labels from the raw dataset
print(classification_report(y_true, y_pred, target_names=CONFIG['CLASSES']))


filenames = [os.path.basename(fp) for fp in file_paths] # Use filenames from the raw dataset
results_df = pd.DataFrame({
    'filename': filenames,
    'true_label': [CONFIG['CLASSES'][i] for i in y_true],
    'predicted_label': [CONFIG['CLASSES'][i] for i in y_pred],
    'confidence': [np.max(probs) for probs in y_pred_probs]
})


print("\n--- [STAGE 2] DATA GENERATION (REVIEWS) ---")

REVIEW_TEMPLATES = {
    'angry': [
        "I am absolutely furious with this service!",
        "This is completely unacceptable, I want a refund.",
        "Worst experience ever, made me so angry.",
        "Don't waste your time, it's frustrating and maddening."
    ],
    'disgust': [
        "That was absolutely revolting.",
        "I'm disgusted by the lack of hygiene here.",
        "Ugh, terrible quality, completely gross.",
        "I can't believe how awful this is, yuck."
    ],
    'fear': [
        "I felt very unsafe and anxious the whole time.",
        "Scary experience, I wouldn't go back.",
        "It was terrifying, I was literally shaking.",
        "Nervous wreck after dealing with this situation."
    ],
    'happy': [
        "Absolutely wonderful experience, loved it!",
        "Great service, made my day so much better.",
        "I'm so happy with the results, thank you!",
        "Delightful and pleasant, highly recommended."
    ],
    'neutral': [
        "It was okay, nothing special.",
        "Average experience, met expectations but didn't exceed them.",
        "Fine, no major complaints but nothing to rave about.",
        "Standard service, it was adequate."
    ],
    'sad': [
        "Really disappointing, made me feel down.",
        "Unfortunate outcome, I'm quite upset about it.",
        "Depressing atmosphere and poor results.",
        "Heartbreakingly bad service today."
    ],
    'surprise': [
        "Wow, I didn't expect that at all!",
        "Incredible! I'm genuinely shocked by how good it was.",
        "A very surprising turn of events, but interesting.",
        "Taken aback by the sudden change, wow."
    ]
}

def generate_review(row):
    """Generates a review based on predicted emotion and confidence."""
    emotion = row['predicted_label']
    confidence = row['confidence']

    base_review = random.choice(REVIEW_TEMPLATES[emotion])


    if confidence > 0.9:
        prefix = "Definitely! "
    elif confidence < 0.5:
        prefix = "I'm not entirely sure, but "
    else:
        prefix = ""

    return f"{prefix}{base_review}"

print("[INFO] Generating synthetic reviews based on predicted emotions...")
results_df['generated_review'] = results_df.apply(generate_review, axis=1)

print("\nSample Generated Reviews:")
print(results_df[['predicted_label', 'confidence', 'generated_review']].sample(5))


print("\n--- [STAGE 3] EMBEDDING & VECTOR STORE ---")

print(f"[INFO] Loading Sentence Transformer model: {CONFIG['VECTOR_MODEL']}...")

st_model = SentenceTransformer(CONFIG['VECTOR_MODEL'])

print("[INFO] Encoding reviews into vectors (this may take a moment)...")

embeddings = st_model.encode(results_df['generated_review'].tolist(), show_progress_bar=True)


faiss.normalize_L2(embeddings)

print(f"[INFO] Embeddings shape: {embeddings.shape}")

dimension = embeddings.shape[1] # e.g., 384 for all-MiniLM-L6-v2
print(f"[INFO] Creating FAISS index with dimension: {dimension}")


index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print(f"[SUCCESS] FAISS index populated with {index.ntotal} vectors.")


print("\n[INFO] Saving all outputs...")

results_df.to_csv('fer_predictions_with_reviews.csv', index=False)


faiss.write_index(index, 'fer_reviews.index')

print("[COMPLETED] Pipeline finished. Files generated:")
print("1. fer_predictions_with_reviews.csv (Data + Reviews)")
print("2. fer_reviews.index (Vector Store)")

Pipeline Initiated: FER -> Review Gen -> Vector Store

--- [STAGE 1] DATASET PREPARATION ---
Using Colab cache for faster access to the 'fer2013' dataset.
[SUCCESS] Dataset found at: /kaggle/input/fer2013
[INFO] Loading Data...
Found 28709 files belonging to 7 classes.
Found 7178 files belonging to 7 classes.

--- [STAGE 1] MODEL TRAINING ---
[INFO] Loading existing trained model...

--- [STAGE 1] EVALUATION ---
              precision    recall  f1-score   support

       angry       0.51      0.53      0.52       958
     disgust       0.74      0.21      0.32       111
        fear       0.56      0.26      0.36      1024
       happy       0.84      0.81      0.83      1774
     neutral       0.46      0.71      0.56      1233
         sad       0.48      0.47      0.47      1247
    surprise       0.74      0.74      0.74       831

    accuracy                           0.60      7178
   macro avg       0.62      0.53      0.54      7178
weighted avg       0.62      0.60      0.5

Batches:   0%|          | 0/225 [00:00<?, ?it/s]

[INFO] Embeddings shape: (7178, 384)
[INFO] Creating FAISS index with dimension: 384
[SUCCESS] FAISS index populated with 7178 vectors.

[INFO] Saving all outputs...
[COMPLETED] Pipeline finished. Files generated:
1. fer_predictions_with_reviews.csv (Data + Reviews)
2. fer_reviews.index (Vector Store)
