# EDA: Dataset Exploration

This notebook explores the cattle breed dataset.


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import numpy as np


In [None]:
# Load dataset metadata
# The CSV is optional: when present, it is read into `df` and its first rows
# are printed; when absent, say so explicitly instead of skipping silently
# (`df` stays undefined in that case, exactly as before).
metadata_path = Path('../dataset/archive/bovine_breeds_metadata.csv')
if metadata_path.exists():
    df = pd.read_csv(metadata_path)
    print(df.head())
else:
    print(f"Metadata file not found: {metadata_path}")


In [None]:
# Count images per breed
dataset_path = Path('../dataset/archive/Indian_bovine_breeds/Indian_bovine_breeds')

# pathlib's glob() has no shell-style brace expansion: a pattern such as
# '*.{jpg,png}' is matched literally and finds nothing. Files are therefore
# selected by comparing their lower-cased suffix against this set.
IMAGE_SUFFIXES = {'.jpg', '.jpeg', '.png'}


def list_breed_images(breed_dir):
    """Return the image files directly inside ``breed_dir``, sorted by path.

    A regular file counts as an image when its suffix, compared
    case-insensitively, is one of ``IMAGE_SUFFIXES``. Sub-directories are
    ignored even if their name ends in an image suffix.
    """
    return sorted(
        entry
        for entry in Path(breed_dir).iterdir()
        if entry.is_file() and entry.suffix.lower() in IMAGE_SUFFIXES
    )


if dataset_path.is_dir():
    # Each sub-directory is one breed; sorting makes the order reproducible
    # (iterdir() yields entries in arbitrary order).
    breeds = sorted(d.name for d in dataset_path.iterdir() if d.is_dir())
else:
    print(f"Dataset directory not found: {dataset_path}")
    breeds = []
print(f"Total breeds: {len(breeds)}")

breed_counts = {
    breed: len(list_breed_images(dataset_path / breed)) for breed in breeds
}

# One row per breed, largest classes first.
df_counts = pd.DataFrame(list(breed_counts.items()), columns=['Breed', 'Count'])
df_counts = df_counts.sort_values('Count', ascending=False)
print(df_counts)


In [None]:
# Horizontal bar chart of the per-breed image counts held in df_counts
fig_dist, ax_dist = plt.subplots(figsize=(12, 8))
sns.barplot(data=df_counts, y='Breed', x='Count', ax=ax_dist)
ax_dist.set_title('Number of Images per Breed')
ax_dist.set_xlabel('Number of Images')
fig_dist.tight_layout()
plt.show()


In [None]:
# Sample images from different breeds
# Shows the first image (by sorted path) of up to six breeds in a 2x3 grid.
# pathlib's glob() does not expand '{a,b}' alternatives, so the original brace
# pattern matched nothing; select files by lower-cased suffix instead.
sample_suffixes = {'.jpg', '.jpeg', '.png'}

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# axes.flat walks the grid row by row, i.e. the same slot order as
# axes[idx // 3, idx % 3].
for ax, breed in zip(axes.flat, breeds[:6]):
    breed_dir = dataset_path / breed
    images = sorted(
        entry
        for entry in breed_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in sample_suffixes
    )
    if not images:
        continue
    # The context manager closes the file handle once the pixels are drawn.
    with Image.open(images[0]) as img:
        ax.imshow(img)
    ax.set_title(f"{breed}")

# Hide ticks and frames on every panel, including slots left without an image.
for ax in axes.flat:
    ax.axis('off')
plt.tight_layout()
plt.show()
