In [7]:
import torch
import clip
from PIL import Image

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def tag_image(image_path):
    """
    Analyze full body images to detect topwear, bottomwear, and their colors
    Just provide the path to your image!
    """
    
    # Topwear categories
    topwear = [
        "t-shirt", "shirt", "blouse", "tank top", "polo shirt", "dress shirt",
        "sweater", "hoodie", "jacket", "blazer", "coat", "cardigan", 
        "vest", "crop top", "tube top", "halter top", "camisole"
    ]
    
    # Bottomwear categories  
    bottomwear = [
        "jeans", "pants", "trousers", "shorts", "skirt", "leggings",
        "sweatpants", "chinos", "cargo pants", "dress pants", "joggers",
        "capris", "culottes", "palazzo pants", "wide leg pants"
    ]
    
    # Dresses and full outfits
    full_outfits = [
        "dress", "gown", "sundress", "maxi dress", "mini dress", 
        "cocktail dress", "evening dress", "jumpsuit", "romper", "overall"
    ]
    
    # Colors to test against
    colors = [
        "red", "crimson", "scarlet", "burgundy", "maroon",
        "blue", "navy", "sky blue", "denim blue", "royal blue", "baby blue",
        "green", "olive", "mint", "lime", "forest green", "teal",
        "yellow", "mustard", "gold", "lemon", "cream",
        "orange", "coral", "peach", "burnt orange", "amber",
        "purple", "lavender", "violet", "plum", "mauve",
        "pink", "hot pink", "rose", "blush", "fuchsia",
        "black", "charcoal", "jet black",
        "white", "ivory", "off-white", "cream",
        "gray", "silver", "slate", "ash gray",
        "brown", "tan", "beige", "khaki", "chocolate",
        "cyan", "turquoise", "aqua", "sea green",
        "gold", "silver", "bronze"
    ]

    
    try:
        # Load and preprocess the image
        image = Image.open(image_path).convert('RGB')
        image_input = preprocess(image).unsqueeze(0).to(device)
        
        print(f"Analyzing full body image: {image_path}")
        print("=" * 60)
        
        # ===== TOPWEAR DETECTION =====
        top_prompts = [f"person wearing {item}" for item in topwear]
        top_inputs = clip.tokenize(top_prompts).to(device)
        
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            top_features = model.encode_text(top_inputs)
            
            # Normalize features
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            top_features = top_features / top_features.norm(dim=-1, keepdim=True)
            
            # Calculate similarities
            top_similarities = (image_features @ top_features.T).softmax(dim=-1)
            
            # Get top 3 topwear predictions
            top_probs, top_indices = top_similarities[0].topk(3)
            
            print("👕 TOPWEAR:")
            topwear_results = []
            for i in range(3):
                item = topwear[top_indices[i]]
                confidence = top_probs[i].item()
                print(f"   {item}: {confidence:.3f} ({confidence*100:.1f}%)")
                topwear_results.append((item, confidence))
        
        # ===== BOTTOMWEAR DETECTION =====
        bottom_prompts = [f"person wearing {item}" for item in bottomwear]
        bottom_inputs = clip.tokenize(bottom_prompts).to(device)
        
        with torch.no_grad():
            bottom_features = model.encode_text(bottom_inputs)
            bottom_features = bottom_features / bottom_features.norm(dim=-1, keepdim=True)
            
            # Calculate similarities
            bottom_similarities = (image_features @ bottom_features.T).softmax(dim=-1)
            
            # Get top 3 bottomwear predictions
            bottom_probs, bottom_indices = bottom_similarities[0].topk(3)
            
            print("\n👖 BOTTOMWEAR:")
            bottomwear_results = []
            for i in range(3):
                item = bottomwear[bottom_indices[i]]
                confidence = bottom_probs[i].item()
                print(f"   {item}: {confidence:.3f} ({confidence*100:.1f}%)")
                bottomwear_results.append((item, confidence))
        
        # ===== FULL OUTFIT CHECK =====
        outfit_prompts = [f"person wearing {item}" for item in full_outfits]
        outfit_inputs = clip.tokenize(outfit_prompts).to(device)
        
        with torch.no_grad():
            outfit_features = model.encode_text(outfit_inputs)
            outfit_features = outfit_features / outfit_features.norm(dim=-1, keepdim=True)
            
            # Calculate similarities
            outfit_similarities = (image_features @ outfit_features.T).softmax(dim=-1)
            
            # Get top outfit prediction
            outfit_prob, outfit_index = outfit_similarities[0].topk(1)
            
            best_outfit = full_outfits[outfit_index[0]]
            outfit_confidence = outfit_prob[0].item()
            
            print(f"\n👗 FULL OUTFIT CHECK:")
            print(f"   {best_outfit}: {outfit_confidence:.3f} ({outfit_confidence*100:.1f}%)")
        
        # ===== COLOR DETECTION =====
        # Check colors for topwear
        top_color_prompts = [f"person wearing {color} top" for color in colors]
        top_color_inputs = clip.tokenize(top_color_prompts).to(device)
        
        # Check colors for bottomwear  
        bottom_color_prompts = [f"person wearing {color} bottom" for color in colors]
        bottom_color_inputs = clip.tokenize(bottom_color_prompts).to(device)
        
        with torch.no_grad():
            top_color_features = model.encode_text(top_color_inputs)
            bottom_color_features = model.encode_text(bottom_color_inputs)
            
            top_color_features = top_color_features / top_color_features.norm(dim=-1, keepdim=True)
            bottom_color_features = bottom_color_features / bottom_color_features.norm(dim=-1, keepdim=True)
            
            # Calculate color similarities
            top_color_sims = (image_features @ top_color_features.T).softmax(dim=-1)
            bottom_color_sims = (image_features @ bottom_color_features.T).softmax(dim=-1)
            
            # Get top colors
            top_color_prob, top_color_idx = top_color_sims[0].topk(1)
            bottom_color_prob, bottom_color_idx = bottom_color_sims[0].topk(1)
            
            top_color = colors[top_color_idx[0]]
            bottom_color = colors[bottom_color_idx[0]]
            
            print(f"\n🎨 COLORS:")
            print(f"   Topwear: {top_color} ({top_color_prob[0].item()*100:.1f}%)")
            print(f"   Bottomwear: {bottom_color} ({bottom_color_prob[0].item()*100:.1f}%)")
        
        # ===== FINAL DESCRIPTION =====
        best_top = topwear_results[0][0]
        best_bottom = bottomwear_results[0][0]
        
        # Check if it's more likely a dress/full outfit
        if outfit_confidence > 0.3:  # If dress/outfit confidence is high
            description = f"{colors[top_color_idx[0]]} {best_outfit}"
            outfit_type = "full_outfit"
        else:
            description = f"{top_color} {best_top} and {bottom_color} {best_bottom}"
            outfit_type = "separates"
        
        print(f"\n📝 OUTFIT DESCRIPTION:")
        print(f"   {description}")
        
        return {
            'topwear': topwear_results,
            'bottomwear': bottomwear_results,
            'full_outfit': (best_outfit, outfit_confidence),
            'top_color': top_color,
            'bottom_color': bottom_color,
            'description': description,
            'type': outfit_type
        }
            
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

# Example usage:
if __name__ == "__main__":
    # Replace with your image path
    image_path = "images/person3.webp"  # Change this to your actual image path
    
    print("CLIP Image Tagger with Color Detection")
    print("=" * 50)
    
    # Tag the image
    results = tag_image(image_path)
    
    if results:
        print(f"\n✨ Final Result: {results['description']}")

# To use this code:
# 1. Save a full body image in the same folder as this notebook
# 2. Change the image_path variable to your image filename
# 3. Run the code!

# Example:
# tag_image("person_outfit.jpg")
# tag_image("full_body_photo.png") 

# You can also create quick functions:
def quick_outfit_description(image_path):
    """One-liner to get full outfit description"""
    result = tag_image(image_path)
    return result['description'] if result else "Could not analyze image"

def get_outfit_breakdown(image_path):
    """Get detailed breakdown of the outfit"""
    result = tag_image(image_path)
    if result:
        if result['type'] == 'full_outfit':
            return f"Wearing a {result['description']}"
        else:
            return f"Top: {result['top_color']} {result['topwear'][0][0]}, Bottom: {result['bottom_color']} {result['bottomwear'][0][0]}"
    return "Could not analyze image"

# Usage examples:
# quick_outfit_description("my_outfit.jpg") 
# # Returns: "black t-shirt and blue jeans" or "red dress"

# get_outfit_breakdown("my_outfit.jpg")
# # Returns: "Top: black t-shirt, Bottom: blue jeans"

CLIP Image Tagger with Color Detection
Analyzing full body image: images/person2.jpg
👕 TOPWEAR:
   t-shirt: 0.061 (6.1%)
   shirt: 0.060 (6.0%)
   polo shirt: 0.060 (6.0%)

👖 BOTTOMWEAR:
   jeans: 0.069 (6.9%)
   joggers: 0.068 (6.8%)
   capris: 0.068 (6.8%)

👗 FULL OUTFIT CHECK:
   overall: 0.105 (10.5%)

🎨 COLORS:
   Topwear: denim blue (1.7%)
   Bottomwear: denim blue (1.7%)

📝 OUTFIT DESCRIPTION:
   denim blue t-shirt and denim blue jeans

✨ Final Result: denim blue t-shirt and denim blue jeans
