In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Define the dataset paths
# DATA_ROOT is the folder that holds the DatasetN sub-folders. It defaults to the
# original author's location and can be redirected on another machine by setting
# the FYP_DATA_ROOT environment variable before starting the kernel.
DATA_ROOT = os.environ.get('FYP_DATA_ROOT', r'D:\git\FYP\Python\TwoStage\data')

# Dataset name -> folder that is scanned for genre sub-folders.
# Insertion order is kept: it determines the sheet/column order of the reports.
dataset_paths = {
    'Dataset1': os.path.join(DATA_ROOT, 'Dataset1', 'Pandora7k_original'),
    'Dataset2': os.path.join(DATA_ROOT, 'Dataset2', 'Pandora18k_original'),
    'Dataset3': os.path.join(DATA_ROOT, 'Dataset3', 'Dataset3_Equal_70split', 'train'),
    'Dataset4': os.path.join(DATA_ROOT, 'Dataset4', 'BalancedWikiart'),
    'Dataset5': os.path.join(DATA_ROOT, 'Dataset5', 'complete_dataset5'),
}

# Create results folder
# The Excel workbook and the pie charts produced by this cell are written into
# this folder (relative to the current working directory). exist_ok=True keeps
# the call from failing when the folder already exists, so the cell can be re-run.
results_folder = 'results'
os.makedirs(results_folder, exist_ok=True)

# Function to count images per genre
def count_images_per_genre(dataset_path):
    """Count the files found in every leaf directory below ``dataset_path``.

    A leaf directory (one without sub-directories) is treated as a genre folder
    and its base name is used as the genre name. Every file in it is counted;
    no filtering by file extension is applied.

    Args:
        dataset_path: root folder of the dataset.

    Returns:
        dict mapping genre name -> number of files. When the same genre name
        occurs in several branches of the tree (for example ``train/Baroque``
        and ``test/Baroque``) the counts are added together. A path that does
        not exist yields an empty dict.
    """
    genre_counts = {}
    # The walk is materialised first so tqdm knows the total number of folders.
    walked = list(os.walk(dataset_path))
    # normpath drops a trailing separator, which would otherwise make basename
    # return '' for the progress label (and for the genre of a leaf root).
    label = os.path.basename(os.path.normpath(dataset_path))
    for root, dirs, files in tqdm(walked, desc=f"Scanning {label}"):
        if dirs:
            continue  # not a leaf directory
        genre_name = os.path.basename(os.path.normpath(root))
        # Accumulate instead of assigning so that a repeated genre folder name
        # does not silently overwrite an earlier count.
        genre_counts[genre_name] = genre_counts.get(genre_name, 0) + len(files)
    return genre_counts

# Function to plot a pie chart for each dataset and save it
def plot_pie_charts(genre_counts_dict):
    """Save one pie chart per dataset showing the share of images per genre.

    Args:
        genre_counts_dict: mapping of dataset name -> {genre name -> image count}.

    Each chart is written to ``<results_folder>/<dataset name>_pie_chart.png``.
    Nothing is returned.
    """
    for dataset_name, genre_counts in genre_counts_dict.items():
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(
            list(genre_counts.values()),
            labels=list(genre_counts.keys()),
            autopct='%1.1f%%',
            startangle=90,
        )
        ax.set_title(f'Image Distribution - {dataset_name}')
        ax.axis('equal')  # equal aspect ratio keeps the pie circular
        fig.tight_layout()

        # The chart goes into the shared results folder, one PNG per dataset
        pie_chart_path = os.path.join(results_folder, f'{dataset_name}_pie_chart.png')
        fig.savefig(pie_chart_path)
        plt.close(fig)  # release the figure so memory does not grow per dataset

# Per-dataset results: {dataset name -> {genre name -> image count}}
genre_counts_dict = {}

# The workbook is stored inside the results folder
excel_file_path = os.path.join(results_folder, 'dataset_comparison.xlsx')

# One sheet per dataset followed by a cross-dataset summary sheet.
# Leaving the with-block writes the workbook to disk and closes it.
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as excel_writer:
    for dataset_name, dataset_path in tqdm(dataset_paths.items(), desc="Processing Datasets"):
        genre_counts_dict[dataset_name] = count_images_per_genre(dataset_path)
        genre_counts = genre_counts_dict[dataset_name]

        # Two-column sheet: genre name and its image count
        df = pd.DataFrame({
            'Genre': list(genre_counts.keys()),
            'Image Count': list(genre_counts.values()),
        })
        df.to_excel(excel_writer, sheet_name=dataset_name, index=False)

    # Alphabetical union of every genre name seen in any dataset
    all_genres = sorted(set().union(*genre_counts_dict.values()))

    # One row per genre: [genre, count in first dataset, count in second, ...];
    # a genre that a dataset does not contain is reported as 0.
    summary_data = [
        [genre] + [genre_counts_dict[name].get(genre, 0) for name in dataset_paths]
        for genre in tqdm(all_genres, desc="Building Summary Table")
    ]

    summary_df = pd.DataFrame(summary_data, columns=['Genre', *dataset_paths])
    summary_df.to_excel(excel_writer, sheet_name='Summary Comparison', index=False)

# Draw the per-dataset pie charts from the collected counts
plot_pie_charts(genre_counts_dict)

print(f"✅ Script completed: Excel and pie charts saved in the '{results_folder}' folder.")


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

# Define the dataset paths
# DATA_ROOT is the folder that holds the DatasetN sub-folders. It defaults to the
# original author's location and can be redirected on another machine by setting
# the FYP_DATA_ROOT environment variable before starting the kernel.
DATA_ROOT = os.environ.get('FYP_DATA_ROOT', r'D:\git\FYP\Python\TwoStage\data')

# Dataset name -> folder that is scanned for genre sub-folders.
# Insertion order is kept: it determines the order of the sheets in the workbook.
dataset_paths = {
    'Dataset1': os.path.join(DATA_ROOT, 'Dataset1', 'Pandora7k_original'),
    'Dataset2': os.path.join(DATA_ROOT, 'Dataset2', 'Pandora18k_original'),
    'Dataset3': os.path.join(DATA_ROOT, 'Dataset3', 'Dataset3_Equal_70split', 'train'),
    'Dataset4': os.path.join(DATA_ROOT, 'Dataset4', 'BalancedWikiart'),
    'Dataset5': os.path.join(DATA_ROOT, 'Dataset5', 'complete_dataset5'),
}

# Function to count images per genre with progress bar
def count_images_per_genre(dataset_path):
    """Count the files in each genre folder directly below ``dataset_path``.

    Only first-level sub-folders are considered, and only those that contain no
    sub-directories of their own; other folders are skipped. Every regular file
    in a genre folder is counted (no filtering by extension).

    Args:
        dataset_path: root folder of the dataset.

    Returns:
        dict mapping genre folder name -> number of files in it.
    """
    print(f"Scanning directory: {dataset_path}")

    def has_no_subfolders(folder):
        # True when none of the folder's entries is itself a directory
        return all(not os.path.isdir(os.path.join(folder, entry)) for entry in os.listdir(folder))

    # Only the first item produced by os.walk (the top level) is needed;
    # a path that cannot be walked produces nothing and falls back to "no folders".
    top_root, top_dirs, _ = next(os.walk(dataset_path), (dataset_path, [], []))
    candidates = [(name, os.path.join(top_root, name)) for name in top_dirs]
    genre_folders = [(name, path) for name, path in candidates if has_no_subfolders(path)]

    # Count the regular files of each genre folder, showing a progress bar
    return {
        genre_name: sum(
            1 for entry in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, entry))
        )
        for genre_name, folder_path in tqdm(genre_folders, desc="Processing genres")
    }

# Function to generate pie charts
def plot_pie_chart(genre_counts, dataset_name):
    """Draw the genre distribution of one dataset as a pie chart and save it.

    Args:
        genre_counts: mapping of genre name -> image count.
        dataset_name: used in the chart title and in the output file name.

    The PNG is written to ``<dataset_name>_distribution.png`` in the current
    working directory. Nothing is returned.
    """
    print(f"Generating pie chart for {dataset_name}...")

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.pie(
        list(genre_counts.values()),
        labels=list(genre_counts.keys()),
        autopct='%1.1f%%',
        startangle=90,
    )
    ax.set_title(f'Image Distribution per Genre - {dataset_name}')
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

    # Write the chart next to the notebook, then release the figure
    chart_filename = f"{dataset_name}_distribution.png"
    fig.savefig(chart_filename)
    plt.close(fig)
    print(f"Chart saved as {chart_filename}")

# Create the Excel writer and fill it with one sheet per dataset.
# The writer is used as a context manager: the workbook is written and closed
# when the block exits, also if an error interrupts the loop. An explicit
# excel_writer.save() call is not used because that method is no longer
# available in pandas 2.x.
print("Starting dataset analysis...")
with pd.ExcelWriter('dataset_image_counts.xlsx', engine='openpyxl') as excel_writer:
    # Overall progress of datasets
    for dataset_name, dataset_path in tqdm(dataset_paths.items(), desc="Datasets progress"):
        print(f"\nAnalyzing {dataset_name}...")

        # Count images
        genre_counts = count_images_per_genre(dataset_path)

        # Save genre counts to a DataFrame and add it as a sheet named after the dataset
        df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Image Count'])
        df.to_excel(excel_writer, sheet_name=dataset_name, index=False)

        # Plot pie chart for the dataset
        plot_pie_chart(genre_counts, dataset_name)

        # Small delay to see the progress bar
        time.sleep(0.5)

    # The file itself is saved when the with-block is left, right after this message
    print("Saving Excel report...")

print("\nScript completed successfully!")
print("Excel file 'dataset_image_counts.xlsx' with counts generated.")
print("Pie charts saved as PNG files in the current directory.")

In [None]:
# Function to plot a pie chart for each dataset with fixed legend and save it
def _distinct_colors(n):
    """Return ``n`` wedge colours, without repeats for up to 60 wedges."""
    # tab20 alone covers up to 20 genres and keeps the look of the earlier charts.
    palette = list(plt.get_cmap('tab20').colors)
    if n > len(palette):
        # tab20b and tab20c contribute 20 further colours each.
        for cmap_name in ('tab20b', 'tab20c'):
            palette.extend(plt.get_cmap(cmap_name).colors)
    # With more wedges than palette entries the colours cycle instead of failing.
    return [palette[i % len(palette)] for i in range(n)]


def plot_pie_charts(genre_counts_dict):
    """Save one pie chart per dataset, with the genres listed in a side legend.

    Args:
        genre_counts_dict: mapping of dataset name -> {genre name -> image count}.

    Each chart is written to ``<results_folder>/<dataset name>_pie_chart.png``.
    Datasets without any counted image (no genres, or only zero counts) are
    skipped with a message, because the wedge shares are undefined when the
    total is zero. Nothing is returned.
    """
    for dataset_name, genre_counts in genre_counts_dict.items():
        labels = list(genre_counts.keys())
        sizes = list(genre_counts.values())

        if sum(sizes) <= 0:
            print(f"Skipped pie chart for {dataset_name}: no images were counted")
            continue

        fig, ax = plt.subplots(figsize=(10, 8))
        # Labels are not drawn on the wedges; they go into the legend below.
        wedges, _texts, _autotexts = ax.pie(
            sizes,
            colors=_distinct_colors(len(labels)),
            autopct='%1.1f%%',
            startangle=90,
            textprops=dict(color="black")
        )

        # Create a legend outside the pie, anchored to the right edge of the axes
        ax.legend(
            wedges, labels,
            title="Genres",
            loc="center left",
            bbox_to_anchor=(1, 0.5),
            fontsize='small'
        )

        ax.set_title(f'Image Distribution - {dataset_name}')
        ax.axis('equal')  # Draw as circle
        fig.tight_layout()

        # Save the pie chart; bbox_inches='tight' keeps the outside legend in the image
        pie_chart_path = os.path.join(results_folder, f'{dataset_name}_pie_chart.png')
        fig.savefig(pie_chart_path, bbox_inches='tight')
        plt.close(fig)

        print(f"✅ Saved pie chart for {dataset_name} at {pie_chart_path}")

# Redraw the charts with the legend-based layout, using the counts collected
# earlier in genre_counts_dict. The output paths are the same
# '<dataset name>_pie_chart.png' files in results_folder, so existing charts
# with those names are overwritten.
plot_pie_charts(genre_counts_dict)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Make sure the output folder is there before anything is written into it
results_folder = 'results'
os.makedirs(results_folder, exist_ok=True)

# Alphabetical union of every genre name that occurs in any dataset
all_genres = sorted({genre for counts in genre_counts_dict.values() for genre in counts})

# Comparison table: one row per genre, one column per dataset.
# A dataset that was not counted, or that lacks the genre, contributes 0.
summary_data = [
    [genre] + [genre_counts_dict.get(dataset_name, {}).get(genre, 0) for dataset_name in dataset_paths]
    for genre in all_genres
]
comparison_df = pd.DataFrame(summary_data, columns=['Genre', *dataset_paths])

# Keep a CSV copy of the table for reference
comparison_csv_path = os.path.join(results_folder, 'summary_comparison.csv')
comparison_df.to_csv(comparison_csv_path, index=False)

print(f"✅ Summary table saved to: {comparison_csv_path}")

# Optional heatmap of the same table (genres as rows, datasets as columns)
fig = plt.figure(figsize=(14, 10))
heatmap_data = comparison_df.set_index('Genre').fillna(0)

# fmt='d' prints the annotations as integers
ax = sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlGnBu')
ax.set_title('Genre Distribution Across Datasets')
fig.tight_layout()

# Store the heatmap image; the figure is left open afterwards
heatmap_img_path = os.path.join(results_folder, 'summary_heatmap.png')
fig.savefig(heatmap_img_path)

print(f"✅ Heatmap saved to: {heatmap_img_path}")
