In [19]:
# Simple CLIP Image Tagger - Just put in an image path and get tags!

# Install required packages first:
# !pip install torch torchvision clip-by-openai pillow

import torch
import clip
from PIL import Image

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def tag_image(image_path):
    """
    Analyze full body images to detect topwear, bottomwear, and their colors
    Just provide the path to your image!
    """
    
    # Topwear categories
    topwear = [
        "t-shirt", "shirt", "blouse", "tank top", "polo shirt", "dress shirt",
        "sweater", "hoodie", "jacket", "blazer", "coat", "cardigan", 
        "vest", "crop top", "tube top", "halter top", "camisole"
    ]
    
    # Bottomwear categories  
    bottomwear = [
        "jeans", "pants", "trousers", "shorts", "skirt", "leggings",
        "sweatpants", "chinos", "cargo pants", "dress pants", "joggers",
        "capris", "culottes", "palazzo pants", "wide leg pants"
    ]
    
    # Dresses and full outfits
    full_outfits = [
        "dress", "gown", "sundress", "maxi dress", "mini dress", 
        "cocktail dress", "evening dress", "jumpsuit", "romper", "overall"
    ]
    
    # Colors to test against
    colors = [
        "red", "blue", "green", "yellow", "orange", "purple", "pink",
        "black", "white", "gray", "brown", "navy", "beige", "khaki",
        "maroon", "teal", "olive", "cream", "gold", "silver", "denim blue"
    ]

    skin_shades = [
    "fair", "brown", "ashy", "red"
]

    
    try:
        # Load and preprocess the image
        image = Image.open(image_path).convert('RGB')
        image_input = preprocess(image).unsqueeze(0).to(device)
        
        print(f"Analyzing full body image: {image_path}")
        print("=" * 60)

        # ==== SKIN COLOR DETECTION =====
        skin_prompt = [f"person skin color is {colors}" for colors in skin_shades]
        skin_inputs = clip.tokenize(skin_prompt).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image_input)
            skin_features = model.encode_text(skin_inputs)
            
            # Normalize features
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            skin_features = skin_features / skin_features.norm(dim=-1, keepdim=True)
        
            # Calculate similarities
            skin_similarities = (image_features @ skin_features.T).softmax(dim=-1)
            
            # Get top 3 topwear predictions
            skin_probs, skin_indices = skin_similarities[0].topk(3)
            
            print("SKIN SHADES:")
            skin_shade_results = []
            for i in range(3):
                item = skin_shades[skin_indices[i]]
                confidence = skin_probs[i].item()
                print(f"   {item}: {confidence:.3f} ({confidence*100:.1f}%)")
                skin_shade_results.append((item, confidence))
        
        # ===== TOPWEAR DETECTION =====
        top_prompts = [f"person wearing {item}" for item in topwear]
        top_inputs = clip.tokenize(top_prompts).to(device)
        
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            top_features = model.encode_text(top_inputs)
            
            # Normalize features
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            top_features = top_features / top_features.norm(dim=-1, keepdim=True)
            
            # Calculate similarities
            top_similarities = (image_features @ top_features.T).softmax(dim=-1)
            
            # Get top 3 topwear predictions
            top_probs, top_indices = top_similarities[0].topk(3)
            
            print("👕 TOPWEAR:")
            topwear_results = []
            for i in range(3):
                item = topwear[top_indices[i]]
                confidence = top_probs[i].item()
                print(f"   {item}: {confidence:.3f} ({confidence*100:.1f}%)")
                topwear_results.append((item, confidence))
        
        # ===== BOTTOMWEAR DETECTION =====
        bottom_prompts = [f"person wearing {item}" for item in bottomwear]
        bottom_inputs = clip.tokenize(bottom_prompts).to(device)
        
        with torch.no_grad():
            bottom_features = model.encode_text(bottom_inputs)
            bottom_features = bottom_features / bottom_features.norm(dim=-1, keepdim=True)
            
            # Calculate similarities
            bottom_similarities = (image_features @ bottom_features.T).softmax(dim=-1)
            
            # Get top 3 bottomwear predictions
            bottom_probs, bottom_indices = bottom_similarities[0].topk(3)
            
            print("\n👖 BOTTOMWEAR:")
            bottomwear_results = []
            for i in range(3):
                item = bottomwear[bottom_indices[i]]
                confidence = bottom_probs[i].item()
                print(f"   {item}: {confidence:.3f} ({confidence*100:.1f}%)")
                bottomwear_results.append((item, confidence))
        
        # ===== FULL OUTFIT CHECK =====
        outfit_prompts = [f"person wearing {item}" for item in full_outfits]
        outfit_inputs = clip.tokenize(outfit_prompts).to(device)
        
        with torch.no_grad():
            outfit_features = model.encode_text(outfit_inputs)
            outfit_features = outfit_features / outfit_features.norm(dim=-1, keepdim=True)
            
            # Calculate similarities
            outfit_similarities = (image_features @ outfit_features.T).softmax(dim=-1)
            
            # Get top outfit prediction
            outfit_prob, outfit_index = outfit_similarities[0].topk(1)
            
            best_outfit = full_outfits[outfit_index[0]]
            outfit_confidence = outfit_prob[0].item()
            
            print(f"\n👗 FULL OUTFIT CHECK:")
            print(f"   {best_outfit}: {outfit_confidence:.3f} ({outfit_confidence*100:.1f}%)")
        
        # ===== IMPROVED COLOR DETECTION =====
        print(f"\n🎨 COLORS:")
        
        # Method 1: Direct color detection for specific clothing items
        best_top = topwear_results[0][0]
        best_bottom = bottomwear_results[0][0]
        
        # Top color detection with specific clothing item
        top_color_prompts = [f"person wearing {color} {best_top}" for color in colors]
        top_color_inputs = clip.tokenize(top_color_prompts).to(device)
        
        # Bottom color detection with specific clothing item
        bottom_color_prompts = [f"person wearing {color} {best_bottom}" for color in colors]
        bottom_color_inputs = clip.tokenize(bottom_color_prompts).to(device)
        
        with torch.no_grad():
            top_color_features = model.encode_text(top_color_inputs)
            bottom_color_features = model.encode_text(bottom_color_inputs)
            
            top_color_features = top_color_features / top_color_features.norm(dim=-1, keepdim=True)
            bottom_color_features = bottom_color_features / bottom_color_features.norm(dim=-1, keepdim=True)
            
            # Calculate color similarities
            top_color_sims = (image_features @ top_color_features.T).softmax(dim=-1)
            bottom_color_sims = (image_features @ bottom_color_features.T).softmax(dim=-1)
            
            # Get top 3 colors for each
            top_color_probs, top_color_indices = top_color_sims[0].topk(3)
            bottom_color_probs, bottom_color_indices = bottom_color_sims[0].topk(3)
            
            print(f"   Topwear ({best_top}):")
            top_color_results = []
            for i in range(3):
                color = colors[top_color_indices[i]]
                confidence = top_color_probs[i].item()
                print(f"     {color}: {confidence:.3f} ({confidence*100:.1f}%)")
                top_color_results.append((color, confidence))
                
            print(f"   Bottomwear ({best_bottom}):")
            bottom_color_results = []
            for i in range(3):
                color = colors[bottom_color_indices[i]]
                confidence = bottom_color_probs[i].item()
                print(f"     {color}: {confidence:.3f} ({confidence*100:.1f}%)")
                bottom_color_results.append((color, confidence))
        
        # Method 2: Alternative detection with different prompts
        print(f"\n🔍 ALTERNATIVE COLOR CHECK:")
        
        # Try with "wearing" vs "has" prompts
        alt_top_prompts = [f"{color} {best_top}" for color in colors]
        alt_bottom_prompts = [f"{color} {best_bottom}" for color in colors]
        
        alt_top_inputs = clip.tokenize(alt_top_prompts).to(device)
        alt_bottom_inputs = clip.tokenize(alt_bottom_prompts).to(device)
        
        with torch.no_grad():
            alt_top_features = model.encode_text(alt_top_inputs)
            alt_bottom_features = model.encode_text(alt_bottom_inputs)
            
            alt_top_features = alt_top_features / alt_top_features.norm(dim=-1, keepdim=True)
            alt_bottom_features = alt_bottom_features / alt_bottom_features.norm(dim=-1, keepdim=True)
            
            alt_top_sims = (image_features @ alt_top_features.T).softmax(dim=-1)
            alt_bottom_sims = (image_features @ alt_bottom_features.T).softmax(dim=-1)
            
            alt_top_prob, alt_top_idx = alt_top_sims[0].topk(1)
            alt_bottom_prob, alt_bottom_idx = alt_bottom_sims[0].topk(1)
            
            alt_top_color = colors[alt_top_idx[0]]
            alt_bottom_color = colors[alt_bottom_idx[0]]
            
            print(f"   Alternative top color: {alt_top_color} ({alt_top_prob[0].item()*100:.1f}%)")
            print(f"   Alternative bottom color: {alt_bottom_color} ({alt_bottom_prob[0].item()*100:.1f}%)")
        
        # Choose best colors (highest confidence)
        final_top_color = top_color_results[0][0]
        final_bottom_color = bottom_color_results[0][0]
        
        # Use alternative if much more confident
        if alt_top_prob[0].item() > top_color_results[0][1] + 0.1:
            final_top_color = alt_top_color
        if alt_bottom_prob[0].item() > bottom_color_results[0][1] + 0.1:
            final_bottom_color = alt_bottom_color
        
        # ===== FINAL DESCRIPTION =====
        best_top = topwear_results[0][0]
        best_bottom = bottomwear_results[0][0]
        
        # Check if it's more likely a dress/full outfit
        if outfit_confidence > 0.3:  # If dress/outfit confidence is high
            # For dresses, use the top color detection
            dress_color = final_top_color
            description = f"{dress_color} {best_outfit}"
            outfit_type = "full_outfit"
        else:
            description = f"{final_top_color} {best_top} and {final_bottom_color} {best_bottom}"
            outfit_type = "separates"
        
        print(f"\n📝 FINAL OUTFIT DESCRIPTION:")
        print(f"   {description}")
        
        return {
            'topwear': topwear_results,
            'bottomwear': bottomwear_results,
            'full_outfit': (best_outfit, outfit_confidence),
            'top_color': final_top_color,
            'bottom_color': final_bottom_color,
            'top_color_options': top_color_results,
            'bottom_color_options': bottom_color_results,
            'description': description,
            'type': outfit_type
        }
            
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

# Example usage:
if __name__ == "__main__":
    # Replace with your image path
    image_path = "images/person5.jpg"  # Change this to your actual image path
    
    print("CLIP Image Tagger with Color Detection")
    print("=" * 50)
    
    # Tag the image
    results = tag_image(image_path)
    
    if results:
        print(f"\n✨ Final Result: {results['description']}")

# You can also create quick functions:
def quick_outfit_description(image_path):
    """One-liner to get full outfit description"""
    result = tag_image(image_path)
    return result['description'] if result else "Could not analyze image"

def get_outfit_breakdown(image_path):
    """Get detailed breakdown of the outfit"""
    result = tag_image(image_path)
    if result:
        if result['type'] == 'full_outfit':
            return f"Wearing a {result['description']}"
        else:
            return f"Top: {result['top_color']} {result['topwear'][0][0]}, Bottom: {result['bottom_color']} {result['bottomwear'][0][0]}"
    return "Could not analyze image"

# Usage examples:
# quick_outfit_description("my_outfit.jpg") 
# # Returns: "black t-shirt and blue jeans" or "red dress"

# get_outfit_breakdown("my_outfit.jpg")
# # Returns: "Top: black t-shirt, Bottom: blue jeans"

CLIP Image Tagger with Color Detection
Analyzing full body image: images/person5.jpg
SKIN SHADES:
   brown: 0.252 (25.2%)
   red: 0.251 (25.1%)
   fair: 0.251 (25.1%)
👕 TOPWEAR:
   dress shirt: 0.061 (6.1%)
   polo shirt: 0.060 (6.0%)
   vest: 0.060 (6.0%)

👖 BOTTOMWEAR:
   dress pants: 0.068 (6.8%)
   chinos: 0.068 (6.8%)
   trousers: 0.067 (6.7%)

👗 FULL OUTFIT CHECK:
   overall: 0.104 (10.4%)

🎨 COLORS:
   Topwear (dress shirt):
     navy: 0.048 (4.8%)
     khaki: 0.048 (4.8%)
     brown: 0.048 (4.8%)
   Bottomwear (dress pants):
     khaki: 0.049 (4.9%)
     brown: 0.048 (4.8%)
     navy: 0.048 (4.8%)

🔍 ALTERNATIVE COLOR CHECK:
   Alternative top color: khaki (4.9%)
   Alternative bottom color: khaki (4.9%)

📝 FINAL OUTFIT DESCRIPTION:
   navy dress shirt and khaki dress pants

✨ Final Result: navy dress shirt and khaki dress pants
