# CLIP + LLM Cooperative Reasoning
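This notebook demonstrates multimodal reasoning by combining CLIP (a vision-language model) with a text-only LLM: CLIP encodes an image into an embedding, and the LLM is then prompted with a slice of that embedding and asked to describe what the image might contain.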

## Setup

In [None]:
# Install required packages
%pip install openai torch torchvision pillow git+https://github.com/openai/CLIP.git

In [None]:
import os, torch, clip, openai, json, requests
from PIL import Image

# Take the OpenAI credential from the environment rather than hard-coding it,
# and stop right away when it is missing or empty.
openai.api_key = os.environ.get('OPENAI_API_KEY')
assert openai.api_key, 'Set OPENAI_API_KEY in your environment!'

In [None]:
# Load the CLIP ViT-B/32 model, on the GPU when CUDA is available and on the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, preprocess = clip.load('ViT-B/32', device=device)

# Download the input image.
# The request is bounded by a timeout and HTTP errors are raised immediately, so a
# failed download surfaces as an HTTP error instead of an opaque PIL decoding failure.
img_url = 'https://images.unsplash.com/photo-1506744038136-46273834b3fb'
with requests.get(img_url, stream=True, timeout=30) as http_resp:
    http_resp.raise_for_status()
    # Ask urllib3 to undo any Content-Encoding (gzip/deflate) so PIL receives
    # the actual image bytes from the raw stream.
    http_resp.raw.decode_content = True
    # convert() reads and decodes the whole image, so `image` stays usable after
    # the connection is released at the end of the `with` block.
    image = Image.open(http_resp.raw).convert('RGB')

# Encode the image: preprocess, add a batch dimension, and move to the device.
image_input = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    # Stored as a nested Python list with one row per image (a single row here),
    # which is why later cells index img_features[0].
    img_features = model.encode_image(image_input).cpu().numpy().tolist()

In [None]:
# LLM reasoning over image embedding
# img_features holds one row per image, so select the row before slicing:
# slicing the outer list (img_features[:8]) would paste the entire embedding
# into the prompt instead of its first 8 components.
prompt = f"Image embedding: {img_features[0][:8]}...\nDescribe what this image might contain and its context."
messages = [{'role': 'user', 'content': prompt}]

# openai>=1.0 removed openai.ChatCompletion in favour of openai.chat.completions;
# call whichever interface the installed SDK provides. Both use the module-level
# openai.api_key set earlier in the notebook.
if hasattr(openai, 'chat'):
    resp = openai.chat.completions.create(model='gpt-3.5-turbo', messages=messages)
else:
    resp = openai.ChatCompletion.create(model='gpt-3.5-turbo', messages=messages)
desc = resp.choices[0].message.content

# Persist the description alongside the other notebook outputs, then show it.
os.makedirs('./outputs', exist_ok=True)
with open('./outputs/clip_llm_output.json', 'w') as f:
    json.dump({'desc': desc}, f, indent=2)
print(desc)

In [None]:
# Persist the input image together with a truncated copy of its embedding.
import json
import os

os.makedirs('./outputs', exist_ok=True)
image.save('./outputs/multimodal_input.jpg')

# Only the first 128 components of the first (and only) embedding row are kept.
embedding_payload = {'embedding': img_features[0][:128]}
with open('./outputs/clip_embedding.json', 'w') as out_file:
    out_file.write(json.dumps(embedding_payload, indent=2))
print('Saved image and embedding to outputs/.')

In [None]:
# Write a timestamped copy of the image plus a short summary of its embedding,
# so repeated runs do not overwrite each other's files.
import json
import os
import time

os.makedirs('./outputs', exist_ok=True)

# Whole-second Unix timestamp shared by both file names.
ts = int(time.time())

img_path = f'./outputs/multimodal_input_{ts}.jpg'
image.save(img_path)

emb_path = f'./outputs/clip_embedding_{ts}.json'
embedding_row = img_features[0]
# The summary records the embedding length and its first 16 components only.
embedding_summary = {
    'embedding_dim': len(embedding_row),
    'embedding_head': embedding_row[:16],
}
with open(emb_path, 'w') as out_file:
    out_file.write(json.dumps(embedding_summary, indent=2))

print('Saved', img_path, 'and', emb_path)