## Basic Imports

In [2]:
# Fetch the VADER lexicon; nltk's SentimentIntensityAnalyzer (used below) relies on it.
# The call does not re-download when the package is already up to date.
import nltk; nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
import os, pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from torchvision.datasets.utils import download_url
from tqdm import tqdm

## Step 1: Load Flickr30k

In [4]:
# Read the Flickr30k caption table. The first line of the file is skipped
# (presumably its own header row) and the columns are named explicitly.
df_flickr_raw = pd.read_csv(
    "../data/captions.txt",
    names=["image_name", "comment_number", "comment"],
    skiprows=1,
)

# VADER analyzer; the same instance is reused for the Memotion captions.
sia = SentimentIntensityAnalyzer()

flickr_data = []
for image_name, raw_caption in zip(df_flickr_raw["image_name"], df_flickr_raw["comment"]):
    caption = str(raw_caption).strip()
    # A missing caption stringifies to "nan"; drop those and blank captions.
    if caption == "" or caption.lower() == "nan":
        continue

    # Bucket the compound score into three classes with a +/-0.2 neutral band.
    sentiment = sia.polarity_scores(caption)['compound']
    if sentiment > 0.2:
        label = "positive"
    elif sentiment < -0.2:
        label = "negative"
    else:
        label = "neutral"

    record = {
        "image_path": os.path.join("../data/flickr30k_images", image_name),
        "caption": caption,
        "text_sentiment": label,
        # Placeholder value; the CLIP step later in the notebook overwrites it.
        "image_sentiment": "neutral",
        "source": "flickr",
    }
    flickr_data.append(record)

df_flickr = pd.DataFrame(flickr_data)

In [5]:
# Preview the first rows of the labelled Flickr frame.
df_flickr.head()

Unnamed: 0,image_path,caption,text_sentiment,image_sentiment,source
0,../data/flickr30k_images\1000092795.jpg,Two young guys with shaggy hair look at their ...,neutral,neutral,flickr
1,../data/flickr30k_images\1000092795.jpg,Two young White males are outside near many b...,neutral,neutral,flickr
2,../data/flickr30k_images\1000092795.jpg,Two men in green shirts are standing in a yard .,neutral,neutral,flickr
3,../data/flickr30k_images\1000092795.jpg,A man in a blue shirt standing in a garden .,neutral,neutral,flickr
4,../data/flickr30k_images\1000092795.jpg,Two friends enjoy time spent together .,positive,neutral,flickr


## STEP 2: Load Memotion

In [6]:
# Load the Memotion label table and preview it. The next cell consumes its
# image_name column and its caption text column.
df_memotion_raw = pd.read_csv("../data/Memotion_Dataset/labels.csv")
df_memotion_raw.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,hilarious,general,not_offensive,not_motivational,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,not_funny,general,not_offensive,motivational,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,very_funny,twisted_meaning,very_offensive,motivational,positive
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [7]:
# Resolve the caption column once, before the loop: prefer "text_corrected"
# and fall back to "text" when that column is absent. The choice depends only
# on the frame's columns, so there is no reason to re-evaluate it per row.
caption_col = "text_corrected" if "text_corrected" in df_memotion_raw.columns else "text"

memotion_data = []
# Iterate only the two columns that are needed instead of materialising a
# full row Series per record with iterrows().
for image_name, raw_caption in zip(df_memotion_raw["image_name"], df_memotion_raw[caption_col]):
    caption = str(raw_caption).strip()
    # A missing caption stringifies to "nan"; skip those and blank captions.
    if caption.lower() == "nan" or caption == "":
        continue

    # Bucket the VADER compound score into three classes with a +/-0.2 neutral band.
    sentiment = sia.polarity_scores(caption)['compound']
    label = "positive" if sentiment > 0.2 else "negative" if sentiment < -0.2 else "neutral"

    memotion_data.append({
        "image_path": os.path.join("../data/Memotion_Dataset/images", image_name),
        "caption": caption,
        "text_sentiment": label,
        # Placeholder value; the CLIP step later in the notebook overwrites it.
        "image_sentiment": "neutral",
        "source": "memotion"
    })

df_memotion = pd.DataFrame(memotion_data)

## 💾 STEP 3: Merge + Save

In [8]:
# Stack the Flickr and Memotion rows into one frame with a fresh 0..n-1 index.
df_combined = pd.concat(objs=[df_flickr, df_memotion], ignore_index=True)

# Quick sanity check: overall size, then five randomly drawn rows.
print("Combined dataset shape:", df_combined.shape)
print(df_combined.sample(n=5))

# Persist the merged table; the CLIP section reloads it from this file.
df_combined.to_csv(path_or_buf="fusion_dataset.csv", index=False)

Combined dataset shape: (165902, 5)
                                    image_path  \
95567  ../data/flickr30k_images\4295355954.jpg   
45167  ../data/flickr30k_images\2836360729.jpg   
89416  ../data/flickr30k_images\3974197857.jpg   
90537  ../data/flickr30k_images\4032257125.jpg   
21593  ../data/flickr30k_images\2243345188.jpg   

                                                 caption text_sentiment  \
95567  An older woman paints with watercolors in a mo...       negative   
45167  The smiling woman at the beach is buried in se...       positive   
89416  Men dressed up in blue uniforms standing in fo...        neutral   
90537  A girl with love tattooed on the inside of her...       positive   
21593  2 people in long pants and long-sleeved shirts...        neutral   

      image_sentiment  source  
95567         neutral  flickr  
45167         neutral  flickr  
89416         neutral  flickr  
90537         neutral  flickr  
21593         neutral  flickr  


## 🔥 CLIP Vision Sentiment Injection 🔥

### Imports

In [9]:
import open_clip
from PIL import Image
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from tqdm.notebook import tqdm 

### Load model & preprocess

In [10]:
# Load the OpenAI-pretrained ViT-B/32 CLIP model together with an image
# preprocessing transform (the second return value is discarded).
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')
# open_clip hands back the model in training mode; this notebook only runs
# inference, so switch to eval mode explicitly.
model.eval()
# Tokenizer matching the model architecture, used to encode the text labels.
tokenizer = open_clip.get_tokenizer('ViT-B-32')



In [11]:
import torch
# Environment check. The values mentioned below are the ones in the saved
# output of this notebook; other machines may report different ones.
print(torch.version.cuda)  # CUDA version reported by torch (11.8 in the saved output)
print(torch.cuda.is_available())  # True when torch can use a CUDA device
# Name of GPU 0, or a fallback message when no CUDA device is available.
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


11.8
True
Quadro P520


In [12]:
# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the CLIP model to the chosen device and show which one was picked.
model = model.to(device)
print(device)

cuda


### Zero-shot candidate sentiment labels

In [13]:
# Candidate labels for the zero-shot CLIP comparison; each image is assigned
# the label whose text embedding is most similar to the image embedding.
# NOTE(review): these are emotion categories, whereas text_sentiment in this
# notebook uses positive/negative/neutral — confirm how the two label sets
# are reconciled downstream.
sentiment_labels = ["happy", "sad", "angry", "neutral", "fear", "surprise", "disgust"]

### Encode emotion labels

In [14]:
# Embed the candidate labels once up front; no gradients are needed.
with torch.no_grad():
    text_tokens = tokenizer(sentiment_labels)
    text_tokens = text_tokens.to(device)
    text_embeddings = model.encode_text(text_tokens)
    # Rescale every label embedding to unit L2 norm, in place.
    text_embeddings.div_(text_embeddings.norm(dim=-1, keepdim=True))

### Helper Function

In [15]:
def get_clip_image_sentiment(img_path):
    """
    Classify one image against the candidate sentiment labels with CLIP.

    The image is opened, converted to RGB, preprocessed and encoded. Its
    unit-normalised embedding is compared with the pre-computed
    ``text_embeddings`` and the best-matching entry of ``sentiment_labels``
    is selected.

    Args:
        img_path: Path of the image file to classify.

    Returns:
        A ``(label, score)`` tuple with the best label and its similarity to
        the image, or ``(None, None)`` when the image could not be loaded or
        scored. Failures are reported through ``warnings.warn`` instead of
        being dropped silently.
    """
    import warnings

    try:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied out by convert().
        with Image.open(img_path) as img:
            image = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
        with torch.no_grad():
            image_features = model.encode_image(image)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            # One similarity value per candidate label (single-image batch).
            similarity = image_features @ text_embeddings.T
            best_idx = similarity.argmax().item()
            return sentiment_labels[best_idx], similarity[0, best_idx].item()
    except Exception as exc:
        # Still best-effort: one bad image must not abort a long batch run.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # propagate, so the run can be stopped by the user.
        warnings.warn(f"CLIP sentiment failed for {img_path!r}: {exc!r}")
        return None, None

In [16]:
def inject_clip_sentiment(df, image_col="image_path"):
    """
    Adds CLIP-based sentiment predictions & similarity scores to DataFrame.

    Several rows can reference the same image (one row per caption), so each
    distinct path is scored once and the result is reused for every other row
    that points at it.

    Args:
        df: DataFrame holding one image path per row. It is modified in place.
        image_col: Name of the column that contains the image paths.

    Returns:
        The same ``df`` object, with ``image_sentiment`` overwritten by the
        CLIP label and a ``clip_similarity`` column holding its score. Both
        are ``None`` for images that could not be processed.
    """
    # path -> (label, score), so a repeated image is not encoded again.
    cache = {}
    sentiments = []
    scores = []
    for img_path in tqdm(df[image_col], desc="Processing images"):
        if img_path not in cache:
            cache[img_path] = get_clip_image_sentiment(img_path)
        label, score = cache[img_path]
        sentiments.append(label)
        scores.append(score)

    df["image_sentiment"] = sentiments
    df["clip_similarity"] = scores
    return df

In [17]:
# Reload the merged dataset from disk, attach the CLIP predictions, and write
# the enriched table to a separate file so the merged CSV stays untouched.
df = inject_clip_sentiment(pd.read_csv("fusion_dataset.csv"), image_col="image_path")
df.to_csv("fusion_dataset_with_clip.csv", index=False)

Processing images:   0%|          | 0/165902 [00:00<?, ?it/s]



In [18]:
# Preview the result: image_sentiment now holds the CLIP label and
# clip_similarity the matching similarity score.
df.head()

Unnamed: 0,image_path,caption,text_sentiment,image_sentiment,source,clip_similarity
0,../data/flickr30k_images\1000092795.jpg,Two young guys with shaggy hair look at their ...,neutral,surprise,flickr,0.216297
1,../data/flickr30k_images\1000092795.jpg,Two young White males are outside near many b...,neutral,surprise,flickr,0.216297
2,../data/flickr30k_images\1000092795.jpg,Two men in green shirts are standing in a yard .,neutral,surprise,flickr,0.216297
3,../data/flickr30k_images\1000092795.jpg,A man in a blue shirt standing in a garden .,neutral,surprise,flickr,0.216297
4,../data/flickr30k_images\1000092795.jpg,Two friends enjoy time spent together .,positive,surprise,flickr,0.216297
