### FID SCORE IMPLEMENTATION

In [9]:
# small_dataset_evaluator.py

import torch
from pytorch_fid import fid_score
from PIL import Image
import numpy as np
from pathlib import Path
import os
from tqdm import tqdm
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns

def check_paths_and_files(real_path, generated_path):
    """Validate paths and check for image files."""
    # Check if directories exist
    if not os.path.exists(real_path):
        raise ValueError(f"Real images directory not found: {real_path}")
    if not os.path.exists(generated_path):
        raise ValueError(f"Generated images directory not found: {generated_path}")
    
    # List valid image extensions
    valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp'}
    
    # Check real images
    real_images = [f for f in os.listdir(real_path) 
                  if os.path.splitext(f.lower())[1] in valid_extensions]
    if not real_images:
        raise ValueError(f"No valid images found in real directory: {real_path}")
    
    # Check generated images
    gen_images = [f for f in os.listdir(generated_path) 
                 if os.path.splitext(f.lower())[1] in valid_extensions]
    if not gen_images:
        raise ValueError(f"No valid images found in generated directory: {generated_path}")
    
    print(f"\nFound {len(real_images)} real images:")
    for img in real_images:
        print(f"  - {img}")
    
    print(f"\nFound {len(gen_images)} generated images:")
    for img in gen_images:
        print(f"  - {img}")
    
    return real_images, gen_images

class SmallDatasetEvaluator:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Initialize model with newer syntax to avoid warnings
        weights = models.ResNet50_Weights.DEFAULT
        self.model = models.resnet50(weights=weights)
        self.model.fc = nn.Identity()
        self.model = self.model.to(self.device)
        self.model.eval()
        
        # Get the preprocessing from the weights
        self.transform = weights.transforms()

    def extract_features(self, image_path):
        """Extract features from an image using ResNet."""
        try:
            # Full path handling
            full_path = str(Path(image_path).absolute())
            if not os.path.exists(full_path):
                print(f"Image not found: {full_path}")
                return None
            
            # Load and verify image
            with Image.open(full_path) as img:
                try:
                    img.verify()  # Verify image is valid
                except Exception as e:
                    print(f"Invalid image {full_path}: {e}")
                    return None
            
            # Reload image after verify
            image = Image.open(full_path).convert('RGB')
            image_tensor = self.transform(image).unsqueeze(0).to(self.device)
            
            with torch.no_grad():
                features = self.model(image_tensor)
            return features.squeeze().cpu().numpy().reshape(1, -1)
            
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            return None

    def evaluate_small_dataset(self, real_path, generated_path):
        """Comprehensive evaluation for small datasets."""
        # First check paths and files
        real_images, gen_images = check_paths_and_files(real_path, generated_path)
        
        # Create full paths
        real_paths = [os.path.join(real_path, img) for img in real_images]
        gen_paths = [os.path.join(generated_path, img) for img in gen_images]
        
        # Extract features
        real_features = []
        print("\nExtracting features from real images...")
        for path in tqdm(real_paths):
            features = self.extract_features(path)
            if features is not None:
                real_features.append(features)
        
        gen_features = []
        print("\nExtracting features from generated images...")
        for path in tqdm(gen_paths):
            features = self.extract_features(path)
            if features is not None:
                gen_features.append(features)
        
        if not real_features or not gen_features:
            raise ValueError("Could not extract valid features from images")
        
        # Stack features
        real_features = np.vstack(real_features)
        gen_features = np.vstack(gen_features)
        
        print(f"\nFeature shapes:")
        print(f"Real features: {real_features.shape}")
        print(f"Generated features: {gen_features.shape}")
        
        # Calculate distances
        distances = cdist(real_features, gen_features, 'euclidean')
        
        # Calculate metrics
        results = {
            'nearest_neighbor_distance': np.min(distances, axis=0),
            'average_distance': np.mean(distances, axis=0),
            'worst_distance': np.max(distances, axis=0),
            'distance_std': np.std(distances, axis=0),
            'num_real': len(real_paths),
            'num_generated': len(gen_paths)
        }
        
        # Try to calculate FID
        try:
            fid_value = fid_score.calculate_fid_given_paths(
                [real_path, generated_path],
                batch_size=min(50, len(real_paths), len(gen_paths)),
                device=self.device,
                dims=2048
            )
            results['fid_score'] = fid_value
            results['fid_reliability'] = min(1.0, (min(len(real_paths), len(gen_paths)) / 50))
        except Exception as e:
            print(f"\nWarning: Could not calculate FID score: {e}")
        
        return results, distances

def main():
    # Get current working directory
    cwd = os.getcwd()
    print(f"\nCurrent working directory: {cwd}")
    
    # Define paths
    REAL_PATH = 'mbzuai_paper_cup_Images'
    GENERATED_PATH = 'output'
    
    # Print full paths
    real_full_path = os.path.abspath(REAL_PATH)
    gen_full_path = os.path.abspath(GENERATED_PATH)
    print(f"\nFull paths:")
    print(f"Real images: {real_full_path}")
    print(f"Generated images: {gen_full_path}")
    
    try:
        evaluator = SmallDatasetEvaluator()
        results, distances = evaluator.evaluate_small_dataset(REAL_PATH, GENERATED_PATH)
        
        print("\n=== Evaluation Results ===")
        if 'fid_score' in results:
            print(f"FID Score: {results['fid_score']:.4f}")
            print(f"FID Reliability: {results['fid_reliability']:.2%}")
        
        print("\nDistance Statistics:")
        print(f"Average Distance: {np.mean(results['average_distance']):.4f}")
        print(f"Min Distance: {np.min(results['nearest_neighbor_distance']):.4f}")
        print(f"Max Distance: {np.max(results['worst_distance']):.4f}")
        print(f"Standard Deviation: {np.mean(results['distance_std']):.4f}")
        
    except Exception as e:
        print(f"\nError during evaluation: {e}")
        
        # Print more debug information
        print("\nDebug Information:")
        print("Checking directory contents:")
        for path, name in [(REAL_PATH, "Real"), (GENERATED_PATH, "Generated")]:
            if os.path.exists(path):
                print(f"\n{name} directory contents:")
                for item in os.listdir(path):
                    print(f"  - {item}")
            else:
                print(f"\n{name} directory not found!")

if __name__ == "__main__":
    main()


Current working directory: /home/ishita.agarwal/Documents/ai_project_evaluation

Full paths:
Real images: /home/ishita.agarwal/Documents/ai_project_evaluation/mbzuai_paper_cup_Images
Generated images: /home/ishita.agarwal/Documents/ai_project_evaluation/output
Using device: cuda


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /home/ishita.agarwal/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 85.6MB/s]



Found 5 real images:
  - mbzuai_paper_cup_1.jpeg
  - mbzuai_paper_cup_5.jpeg
  - mbzuai_paper_cup_4.jpeg
  - mbzuai_paper_cup_2.jpeg
  - mbzuai_paper_cup_3.jpeg

Found 9 generated images:
  - generated_image_8.png
  - generated_image_10.png
  - generated_image_6.png
  - generated_image_7.png
  - generated_image_9.png
  - generated_image_3.png
  - generated_image_4.png
  - generated_image_5.png
  - generated_image_2.png

Extracting features from real images...


100%|██████████| 5/5 [00:00<00:00, 20.89it/s]



Extracting features from generated images...


100%|██████████| 9/9 [00:00<00:00, 143.00it/s]



Feature shapes:
Real features: (5, 2048)
Generated features: (9, 2048)


100%|██████████| 1/1 [00:00<00:00, 10.33it/s]
100%|██████████| 2/2 [00:00<00:00, 13.70it/s]



=== Evaluation Results ===
FID Score: 164.8489
FID Reliability: 10.00%

Distance Statistics:
Average Distance: 12.5460
Min Distance: 10.1288
Max Distance: 14.5523
Standard Deviation: 0.4566


### Precision and recall

In [1]:
# mbzuai_clip_evaluator.py

import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class MBZUAICLIPEvaluator:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        
        # Load CLIP model and processor
        print("Loading CLIP model...")
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
        # Define MBZUAI-specific concepts and prompts
        self.concepts = {
            'core_elements': [
                "white paper cup",
                "MBZUAI logo",
                "clear logo visibility",
                "proper logo placement"
            ],
            'quality_aspects': [
                "high quality image",
                "clear background",
                "good lighting",
                "proper contrast"
            ],
            'design_elements': [
                "professional design",
                "clean appearance",
                "proper proportions",
                "realistic cup shape"
            ]
        }
        
        # Define specific prompts for evaluation
        self.evaluation_prompts = [
            "a white paper cup with clear MBZUAI logo",
            "professional paper cup with MBZUAI branding",
            "high quality MBZUAI branded paper cup",
            "clean white cup with MBZUAI logo"
        ]

    def calculate_clip_scores(self, image, prompts):
        """Calculate CLIP scores for an image against multiple prompts."""
        try:
            # Process image
            image_inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
            image_features = self.clip_model.get_image_features(**image_inputs)
            
            # Calculate scores for each prompt
            scores = {}
            for prompt in prompts:
                text_inputs = self.clip_processor(text=[prompt], return_tensors="pt", padding=True).to(self.device)
                text_features = self.clip_model.get_text_features(**text_inputs)
                
                similarity = torch.cosine_similarity(text_features, image_features).item()
                scores[prompt] = similarity
            
            return scores
            
        except Exception as e:
            print(f"Error calculating scores: {e}")
            return None

    def evaluate_image(self, image_path):
        """Comprehensive evaluation of a single image."""
        try:
            image = Image.open(image_path).convert('RGB')
            
            results = {
                'filename': os.path.basename(image_path),
                'scores': {}
            }
            
            # Evaluate against all concept categories
            for category, concepts in self.concepts.items():
                category_scores = self.calculate_clip_scores(image, concepts)
                if category_scores:
                    results['scores'][category] = category_scores
            
            # Evaluate against standard prompts
            prompt_scores = self.calculate_clip_scores(image, self.evaluation_prompts)
            if prompt_scores:
                results['scores']['prompts'] = prompt_scores
            
            return results
            
        except Exception as e:
            print(f"Error evaluating {image_path}: {e}")
            return None

    def evaluate_directory(self, directory):
        """Evaluate all images in directory."""
        print(f"\nEvaluating images in {directory}")
        
        results = []
        image_files = [f for f in os.listdir(directory) 
                      if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        
        for filename in tqdm(image_files, desc="Evaluating images"):
            image_path = os.path.join(directory, filename)
            result = self.evaluate_image(image_path)
            if result:
                results.append(result)
        
        return results

    def analyze_results(self, results):
        """Analyze evaluation results."""
        analysis = {
            'overall_scores': {},
            'category_scores': {},
            'prompt_scores': {},
            'per_image_analysis': []
        }
        
        for result in results:
            image_scores = {}
            
            # Calculate average scores per category
            for category, scores in result['scores'].items():
                if category != 'prompts':
                    category_avg = np.mean(list(scores.values()))
                    image_scores[f'{category}_avg'] = category_avg
                    
                    if category not in analysis['category_scores']:
                        analysis['category_scores'][category] = []
                    analysis['category_scores'][category].append(category_avg)
            
            # Calculate prompt score
            if 'prompts' in result['scores']:
                prompt_avg = np.mean(list(result['scores']['prompts'].values()))
                image_scores['prompt_avg'] = prompt_avg
                analysis['prompt_scores'][result['filename']] = prompt_avg
            
            # Add to per-image analysis
            image_scores['filename'] = result['filename']
            image_scores['overall_score'] = np.mean(list(image_scores.values())[:-1])  # Exclude filename
            analysis['per_image_analysis'].append(image_scores)
        
        # Calculate overall averages
        analysis['overall_scores'] = {
            'overall_average': np.mean([img['overall_score'] 
                                      for img in analysis['per_image_analysis']]),
            'category_averages': {
                category: np.mean(scores)
                for category, scores in analysis['category_scores'].items()
            },
            'prompt_average': np.mean(list(analysis['prompt_scores'].values()))
        }
        
        return analysis

    def generate_report(self, analysis, output_dir='evaluation_results'):
        """Generate comprehensive evaluation report."""
        os.makedirs(output_dir, exist_ok=True)
        
        # Save detailed results
        df = pd.DataFrame(analysis['per_image_analysis'])
        df.to_csv(os.path.join(output_dir, 'detailed_scores.csv'), index=False)
        
        # Generate report text
        report = [
            "=== MBZUAI Paper Cup Generation Evaluation ===\n",
            "Overall Scores:",
            f"Average Score: {analysis['overall_scores']['overall_average']:.4f}",
            f"Average Prompt Score: {analysis['overall_scores']['prompt_average']:.4f}\n",
            "Category Averages:"
        ]
        
        for category, score in analysis['overall_scores']['category_averages'].items():
            report.append(f"{category}: {score:.4f}")
        
        # Add top performing images
        df_sorted = pd.DataFrame(analysis['per_image_analysis']).sort_values(
            'overall_score', ascending=False)
        
        report.extend([
            "\nTop 3 Performing Images:"
        ])
        
        for _, row in df_sorted.head(3).iterrows():
            report.append(f"{row['filename']}: {row['overall_score']:.4f}")
        
        # Save report
        with open(os.path.join(output_dir, 'evaluation_report.txt'), 'w') as f:
            f.write('\n'.join(report))
        
        # Create visualizations
        self.plot_results(analysis, output_dir)

    def plot_results(self, analysis, output_dir):
        """Create visualizations of results."""
        # Plot 1: Category Performance
        plt.figure(figsize=(12, 6))
        categories = list(analysis['overall_scores']['category_averages'].keys())
        scores = list(analysis['overall_scores']['category_averages'].values())
        
        plt.bar(categories, scores)
        plt.title('Average Performance by Category')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'category_performance.png'))
        plt.close()
        
        # Plot 2: Individual Image Performance
        df = pd.DataFrame(analysis['per_image_analysis'])
        plt.figure(figsize=(15, 6))
        sns.barplot(data=df, x='filename', y='overall_score')
        plt.title('Overall Score by Image')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'image_scores.png'))
        plt.close()

def main():
    # Configuration
    generated_dir = "output"
    
    try:
        # Initialize evaluator
        evaluator = MBZUAICLIPEvaluator()
        
        # Run evaluation
        results = evaluator.evaluate_directory(generated_dir)
        
        # Analyze results
        analysis = evaluator.analyze_results(results)
        
        # Generate report
        evaluator.generate_report(analysis)
        
        print("\nEvaluation completed successfully!")
        print("Results saved in evaluation_results/")
        
    except Exception as e:
        print(f"Error during evaluation: {e}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Loading CLIP model...

Evaluating images in output


Evaluating images: 100%|██████████| 9/9 [00:05<00:00,  1.55it/s]



Evaluation completed successfully!
Results saved in evaluation_results/


In [2]:
# direct_clip_evaluator.py

import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
from tqdm import tqdm
import numpy as np

class DirectCLIPEvaluator:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        
        # Load CLIP model and processor
        print("Loading CLIP model...")
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
        # Define evaluation aspects
        self.evaluation_aspects = {
            "Main Prompt": "a white paper cup with MBZUAI logo",
            "Logo Quality": "clear visible MBZUAI logo on a cup",
            "Cup Quality": "high quality white paper cup",
            "Overall Image": "professional product photography of a paper cup"
        }

    def calculate_similarity(self, image, text):
        """Calculate CLIP similarity score between image and text."""
        try:
            # Process image and text
            image_inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
            text_inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True).to(self.device)
            
            # Get features
            with torch.no_grad():
                image_features = self.clip_model.get_image_features(**image_inputs)
                text_features = self.clip_model.get_text_features(**text_inputs)
                
                # Calculate similarity
                similarity = torch.cosine_similarity(text_features, image_features).item()
            
            return similarity
            
        except Exception as e:
            print(f"Error calculating similarity: {e}")
            return None

    def evaluate_image(self, image_path):
        """Evaluate a single image against all aspects."""
        try:
            image = Image.open(image_path).convert('RGB')
            scores = {}
            
            for aspect, prompt in self.evaluation_aspects.items():
                score = self.calculate_similarity(image, prompt)
                if score is not None:
                    scores[aspect] = score
            
            return scores
            
        except Exception as e:
            print(f"Error evaluating {image_path}: {e}")
            return None

    def print_scores(self, filename, scores):
        """Print scores in a formatted way."""
        print(f"\nScores for {filename}:")
        print("-" * 50)
        for aspect, score in scores.items():
            print(f"{aspect:15}: {score:.4f}")
        print("-" * 50)

    def evaluate_directory(self, directory):
        """Evaluate all images in directory and show results."""
        print(f"\nEvaluating images in {directory}")
        
        # Get all image files
        image_files = [f for f in os.listdir(directory) 
                      if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        
        all_scores = []
        
        # Evaluate each image
        for filename in tqdm(image_files, desc="Evaluating images"):
            image_path = os.path.join(directory, filename)
            scores = self.evaluate_image(image_path)
            
            if scores:
                all_scores.append({
                    'filename': filename,
                    'scores': scores
                })
                self.print_scores(filename, scores)
        
        # Print summary statistics
        if all_scores:
            print("\n=== Summary Statistics ===")
            
            # Calculate averages for each aspect
            aspects = self.evaluation_aspects.keys()
            for aspect in aspects:
                scores = [item['scores'][aspect] for item in all_scores 
                         if aspect in item['scores']]
                if scores:
                    avg_score = np.mean(scores)
                    std_score = np.std(scores)
                    print(f"\n{aspect}:")
                    print(f"  Average: {avg_score:.4f}")
                    print(f"  Std Dev: {std_score:.4f}")
            
            # Find best performing image
            avg_scores = [(item['filename'], np.mean(list(item['scores'].values())))
                         for item in all_scores]
            best_img = max(avg_scores, key=lambda x: x[1])
            
            print(f"\nBest performing image: {best_img[0]} (Score: {best_img[1]:.4f})")

def main():
    # Directory containing generated images
    generated_dir = "output"
    
    try:
        # Initialize evaluator
        evaluator = DirectCLIPEvaluator()
        
        # Run evaluation
        evaluator.evaluate_directory(generated_dir)
        
    except Exception as e:
        print(f"Error during evaluation: {e}")

if __name__ == "__main__":
    main()

Using device: cuda
Loading CLIP model...

Evaluating images in output


Evaluating images:  22%|██▏       | 2/9 [00:00<00:00, 14.90it/s]


Scores for generated_image_8.png:
--------------------------------------------------
Main Prompt    : 0.3175
Logo Quality   : 0.2708
Cup Quality    : 0.2948
Overall Image  : 0.2987
--------------------------------------------------

Scores for generated_image_10.png:
--------------------------------------------------
Main Prompt    : 0.3243
Logo Quality   : 0.2801
Cup Quality    : 0.3005
Overall Image  : 0.3077
--------------------------------------------------

Scores for generated_image_6.png:
--------------------------------------------------
Main Prompt    : 0.3371
Logo Quality   : 0.2969
Cup Quality    : 0.3148
Overall Image  : 0.3245
--------------------------------------------------


Evaluating images:  44%|████▍     | 4/9 [00:00<00:00, 13.79it/s]


Scores for generated_image_7.png:
--------------------------------------------------
Main Prompt    : 0.3189
Logo Quality   : 0.2856
Cup Quality    : 0.2930
Overall Image  : 0.3167
--------------------------------------------------

Scores for generated_image_9.png:
--------------------------------------------------
Main Prompt    : 0.3354
Logo Quality   : 0.2929
Cup Quality    : 0.3087
Overall Image  : 0.3053
--------------------------------------------------


Evaluating images:  67%|██████▋   | 6/9 [00:00<00:00, 14.15it/s]


Scores for generated_image_3.png:
--------------------------------------------------
Main Prompt    : 0.2845
Logo Quality   : 0.2744
Cup Quality    : 0.2787
Overall Image  : 0.2720
--------------------------------------------------


Evaluating images:  89%|████████▉ | 8/9 [00:00<00:00, 14.16it/s]


Scores for generated_image_4.png:
--------------------------------------------------
Main Prompt    : 0.3129
Logo Quality   : 0.2794
Cup Quality    : 0.2955
Overall Image  : 0.2954
--------------------------------------------------

Scores for generated_image_5.png:
--------------------------------------------------
Main Prompt    : 0.3397
Logo Quality   : 0.3096
Cup Quality    : 0.3151
Overall Image  : 0.3236
--------------------------------------------------


Evaluating images: 100%|██████████| 9/9 [00:00<00:00, 14.20it/s]


Scores for generated_image_2.png:
--------------------------------------------------
Main Prompt    : 0.3306
Logo Quality   : 0.2813
Cup Quality    : 0.3040
Overall Image  : 0.3149
--------------------------------------------------

=== Summary Statistics ===

Main Prompt:
  Average: 0.3223
  Std Dev: 0.0160

Logo Quality:
  Average: 0.2857
  Std Dev: 0.0115

Cup Quality:
  Average: 0.3006
  Std Dev: 0.0110

Overall Image:
  Average: 0.3065
  Std Dev: 0.0155

Best performing image: generated_image_5.png (Score: 0.3220)





### Concept Accuracy

In [5]:
# combined_concept_accuracy.py

import torch
from torchvision import models, transforms
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os
import json
from tqdm import tqdm
import numpy as np

class ConceptAccuracyEvaluator:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        
        # Initialize ResNet
        print("Loading ResNet...")
        self.resnet = models.resnet50(pretrained=True).to(self.device)
        self.resnet.eval()
        
        # Initialize CLIP
        print("Loading CLIP...")
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
        # Preprocessing for ResNet
        self.preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                               std=[0.229, 0.224, 0.225]),
        ])
        
        # Load ImageNet classes
        try:
            with open('imagenet_class_index.json') as f:
                self.imagenet_classes = {int(key): value[1] 
                                       for key, value in json.load(f).items()}
        except:
            print("Warning: ImageNet class index file not found.")
            self.imagenet_classes = {}
        
        # Define concepts to check
        self.concepts = {
            "cup": ["cup", "paper cup", "coffee cup", "disposable cup"],
            "logo": ["logo", "brand logo", "company logo", "MBZUAI logo"],
            "quality": ["high quality", "clear image", "professional photo"]
        }

    def check_clip_concept(self, image, concept):
        """Check concept using CLIP."""
        try:
            image_inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
            text_inputs = self.clip_processor(text=[concept], return_tensors="pt", padding=True).to(self.device)
            
            with torch.no_grad():
                image_features = self.clip_model.get_image_features(**image_inputs)
                text_features = self.clip_model.get_text_features(**text_inputs)
                
                image_features = image_features / image_features.norm(dim=1, keepdim=True)
                text_features = text_features / text_features.norm(dim=1, keepdim=True)
                
                similarity = torch.cosine_similarity(image_features, text_features).item()
            
            return similarity
            
        except Exception as e:
            print(f"Error in CLIP check: {e}")
            return 0.0

    def check_resnet_concept(self, image):
        """Check concept using ResNet."""
        try:
            image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
            
            with torch.no_grad():
                outputs = self.resnet(image_tensor)
                _, predicted_idx = outputs.max(1)
                
                if self.imagenet_classes:
                    predicted_label = self.imagenet_classes[predicted_idx.item()]
                    return predicted_label.lower()
                return ""
                
        except Exception as e:
            print(f"Error in ResNet check: {e}")
            return ""

    def evaluate_directory(self, directory):
        """Evaluate all images in directory."""
        print(f"\nEvaluating images in {directory}")
        
        image_files = [f for f in os.listdir(directory) 
                      if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        
        if not image_files:
            print("No images found in directory!")
            return
        
        results = {concept: [] for concept in self.concepts.keys()}
        
        # Process each image
        for filename in tqdm(image_files, desc="Processing images"):
            try:
                image_path = os.path.join(directory, filename)
                image = Image.open(image_path).convert('RGB')
                
                # ResNet check
                resnet_label = self.check_resnet_concept(image)
                
                # CLIP check for each concept category
                for category, concept_list in self.concepts.items():
                    category_scores = []
                    for concept in concept_list:
                        clip_score = self.check_clip_concept(image, concept)
                        category_scores.append(clip_score)
                    
                    # Combine ResNet and CLIP results
                    if any(concept in resnet_label for concept in concept_list):
                        category_scores.append(1.0)
                        
                    results[category].append(np.mean(category_scores))
                    
            except Exception as e:
                print(f"Error processing {filename}: {e}")
        
        # Calculate and print average scores
        print("\n=== Concept Accuracy Scores ===")
        print(f"Number of images evaluated: {len(image_files)}")
        print("\nCategory Averages:")
        
        overall_scores = []
        for category, scores in results.items():
            avg_score = np.mean(scores)
            overall_scores.append(avg_score)
            print(f"{category:10}: {avg_score:.4f}")
        
        print(f"\nOverall Concept Accuracy: {np.mean(overall_scores):.4f}")

def main():
    generated_dir = "output"
    
    try:
        evaluator = ConceptAccuracyEvaluator()
        evaluator.evaluate_directory(generated_dir)
        
    except Exception as e:
        print(f"Error during evaluation: {e}")

if __name__ == "__main__":
    main()

Using device: cuda
Loading ResNet...




Loading CLIP...

Evaluating images in output


Processing images: 100%|██████████| 9/9 [00:02<00:00,  3.96it/s]


=== Concept Accuracy Scores ===
Number of images evaluated: 9

Category Averages:
cup       : 0.2831
logo      : 0.2155
quality   : 0.1938

Overall Concept Accuracy: 0.2308



