## Basic Imports

In [2]:
import nltk; nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
import os, pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from torchvision.datasets.utils import download_url
from tqdm import tqdm

## Step 1: Load Flickr30k

In [5]:
flickr_dir = "../data/flickr30k_images"
flickr_captions_path = "../data/captions.txt"  # Modify if using JSON

flickr_data = []
sia = SentimentIntensityAnalyzer()

with open(flickr_captions_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) < 2: continue
        img, caption = parts
        sentiment = sia.polarity_scores(caption)['compound']
        label = "positive" if sentiment > 0.2 else "negative" if sentiment < -0.2 else "neutral"
        flickr_data.append({
            "image_path": os.path.join(flickr_dir, img),
            "caption": caption,
            "text_sentiment": label,
            "image_sentiment": "neutral",  # Will use CLIP later
            "source": "flickr"
        })

df_flickr = pd.DataFrame(flickr_data)

## STEP 2: Load Memotion

In [7]:
memotion_dir = "../data/Memotion_Dataset/memes"
memotion_labels_path = "../data/Memotion_Dataset/labels.csv"

df_memotion_raw = pd.read_csv(memotion_labels_path)

memotion_data = []
for _, row in df_memotion_raw.iterrows():
    text = row['text_corrected'] if 'text_corrected' in row else row['text']
    sentiment = sia.polarity_scores(text)['compound']
    label = "positive" if sentiment > 0.2 else "negative" if sentiment < -0.2 else "neutral"
    memotion_data.append({
        "image_path": os.path.join(memotion_dir, row['image_name']),
        "caption": text,
        "text_sentiment": label,
        "image_sentiment": row.get("overall_sentiment", "unknown"),
        "humor": row.get("humour", 0),
        "sarcasm": row.get("sarcasm", 0),
        "offensive": row.get("offensive", 0),
        "motivation": row.get("motivational", 0),
        "source": "memotion"
    })

df_memotion = pd.DataFrame(memotion_data)

AttributeError: 'float' object has no attribute 'encode'

## 🧬 STEP 3: Normalize columns

shared_cols = ['image_path', 'caption', 'text_sentiment', 'image_sentiment', 'source']
df_memotion = df_memotion[shared_cols + ['humor', 'sarcasm', 'offensive', 'motivation']]
df_flickr = df_flickr[shared_cols]
for col in ['humor', 'sarcasm', 'offensive', 'motivation']:
    df_flickr[col] = 0

## 💾 STEP 4: Merge + Save

In [None]:
df_fusion = pd.concat([df_flickr, df_memotion], ignore_index=True)
df_fusion.to_csv("fusion_mood_dataset.csv", index=False)
print("✅ Fusion dataset saved: fusion_mood_dataset.csv")