In [None]:
%pip install -q pandas matplotlib seaborn
import hashlib
import sys
from pathlib import Path

import matplotlib.pyplot as pl
import numpy as np
import pandas as pd
import seaborn

In [None]:
# Fingerprint the raw CSV so the exact input file can be verified later.
hasher = hashlib.sha256()
with open("../data/iris.csv", mode="rb") as f:
    file_bytes = f.read()
    hasher.update(file_bytes)

checksum = hasher.hexdigest()
print("SHA256", checksum)

In [None]:
# Load the Iris CSV and collect a few basic data-quality facts about it.
iris = pd.read_csv("../data/iris.csv")

# Names of all columns, in file order
column_names = list(iris.columns)

# Number of missing entries in each column
missing_values = iris.isna().sum()

# Number of rows belonging to each species
class_distribution = iris.loc[:, "species"].value_counts()

# Row-level duplicate flags (boolean Series, one entry per row):
#   duplicate_count - True for every repeat of an earlier row
#   mask            - True for every row that has at least one identical twin
duplicate_count = iris.duplicated(keep="first")
mask = iris.duplicated(keep=False)


# Duplicate audit: three complementary counts, collected into `summary`.
label_col = "species"

# 1. Exact duplicates: rows identical in every column (features and label)
#    to an earlier row. The first occurrence of each row is not counted.
exact_row_duplicates = iris.duplicated().sum()

# Group on the measurement columns only. The label must stay out of the
# grouping key: with "species" in the key every group contains exactly one
# species by construction, so conflicting labels could never be detected.
feature_cols = [col for col in column_names if col != label_col]

# One row per distinct feature combination, with the number of rows sharing
# it and the number of distinct species among those rows.
feature_only = (
    iris.groupby(feature_cols)[label_col]
    .agg(row_count="size", unique_species_count="nunique")
    .reset_index()
)

# 2. Feature-only duplicates: feature combinations that occur in more than
#    one row and always carry the same single species.
has_single_species = feature_only["unique_species_count"] == 1
is_repeated = feature_only["row_count"] > 1
feature_only_duplicates = (has_single_species & is_repeated).sum()

# 3. Conflicting label groups: same features -> multiple species
conflicting_label_groups = (feature_only["unique_species_count"] > 1).sum()

# Save to dict (cast NumPy integers to plain ints so the dict prints cleanly)
summary = {
    "exact_row_duplicates": int(exact_row_duplicates),
    "feature_only_duplicates": int(feature_only_duplicates),
    "conflicting_label_groups": int(conflicting_label_groups),
}

# Print the headline facts about the dataset.
# The null counts and class counts are multi-line pandas Series; they start
# on their own line so the first row lines up with the rest of the table
# instead of being pushed right by the label.

# Shape:
print(f"Shape: {iris.shape}")

# Nulls:
print(f"Nulls:\n{missing_values}")

# Class Counts:
print(f"Class Counts:\n{class_distribution}")

# Duplicate summary dict:
print(f"Duplicate Summary: {summary}")


# Pairwise scatter plots of every feature pair, coloured by species.
g = seaborn.pairplot(iris, hue="species")

# Create the output folder on first run: savefig does not create missing
# directories and would otherwise fail with FileNotFoundError.
figure_path = Path("../reports/figures/01_pairplot.png")
figure_path.parent.mkdir(parents=True, exist_ok=True)

# Save through the grid returned by pairplot so the file always contains
# this figure, independent of which figure pyplot considers "current".
g.savefig(figure_path, dpi=300, bbox_inches="tight")
pl.show()

### Source of Data
The dataset used is the Iris dataset, originally published by Ronald Fisher.
In this project, it is loaded from a local CSV file at `../data/iris.csv`.

### Shape of Data
The dataset contains 150 rows and 5 columns. Each row represents a flower with four feature measurements and one species label.

### Quick Validation Summary
- No missing values detected
- Data types are numeric for features, categorical for species.
- 150 rows in total, of which 3 are exact duplicates of an earlier row (147 unique rows)

### Cleaning
The dataset required minimal cleaning. No missing values were found. Three exact duplicate rows are present and were left in place. The only adjustment was dropping the artificial Id column, which is redundant with the row index.

## Data Integrity
The cleaned dataset is stored at `data/iris.csv`.  
SHA256 checksum: `20f7ef9ad6e85c0...eb1df2cda`
 
## Observations:
Found 3 exact duplicate rows; 0 feature-only duplicates; 0 conflicting labels.