In [None]:
# import packages
import kagglehub
import pandas as pd
import os

# Fetch the latest version of the "uciml/iris" dataset from Kaggle
path_download = kagglehub.dataset_download("uciml/iris")

# The download step yields a directory, so build the path to the CSV file inside it
data_path = os.path.join(path_download, "Iris.csv")

# Read that CSV into the DataFrame the following cells work with
df = pd.read_csv(data_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/iris?dataset_version_number=2...


100%|██████████| 3.60k/3.60k [00:00<00:00, 1.09MB/s]

Extracting files...





In [None]:
# Preview the leading rows of the data, then list the dtype of every column.
# Each call prints the heading and the object on separate lines.
print("First 5 rows of DF", df.head(), sep="\n")

print("Data Types", df.dtypes, sep="\n")

First 5 rows of DF
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
Data Types
Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object


In [None]:
# Dataset Info:
# 150 rows in total: first 50 Iris-setosa, then 50 Iris-versicolor, and lastly 50 Iris-virginica (50*3 samples of 3 species)
# Sepal Length and Width [in centimeters] (2 columns)
# Petal Length and Width [in centimeters] (2 columns)
# Id is an identifier column (1-150); the DataFrame's own index is the default one starting at 0

# Data Cleaning
# Count the missing values in each column of the dataset
missing_values = df.isnull().sum()
print(missing_values)

# Drop the PetalWidthCm column (a column, not rows); nothing in this cell fills in missing values.
# errors='ignore' keeps the cell re-runnable: without it, running the cell a second time
# raises KeyError because the column has already been removed from df.
df = df.drop(columns=['PetalWidthCm'], errors='ignore')
print(df.head())

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm      Species
0   1            5.1           3.5            1.4  Iris-setosa
1   2            4.9           3.0            1.4  Iris-setosa
2   3            4.7           3.2            1.3  Iris-setosa
3   4            4.6           3.1            1.5  Iris-setosa
4   5            5.0           3.6            1.4  Iris-setosa


In [None]:
# Filtering
# 1st method: keep only the rows whose species is Iris-setosa
setosa_df = df[df['Species'] == 'Iris-setosa']
print(setosa_df.head())

# 2nd method: keep only the rows where the Sepal Length (SL) is greater than 6.0 cm
large_SL_df = df[df['SepalLengthCm'] > 6.0]
print(large_SL_df.head())

# 3rd method: combine two conditions with & to extract the Iris-virginica rows
# whose Sepal Width (SW) is greater than 3.5 cm
large_virginicaSW_df = df[(df['Species'] == 'Iris-virginica') & (df['SepalWidthCm'] > 3.5)]
print(large_virginicaSW_df.head())

# 4th method: keep flowers whose Petal Length (PL) lies between 0 cm and 1.0 cm, both ends included.
# The previous comment said 5.0-6.0 cm, which never matched the bounds used in the code;
# the bounds themselves are unchanged. Series.between is inclusive on both ends by default,
# so it is equivalent to (PL >= 0) & (PL <= 1.0).
# NOTE: the name `medium_PL_df` is kept as-is even though a 0-1.0 cm range is not a "medium" one.
medium_PL_df = df[df['PetalLengthCm'].between(0, 1.0)]
print(medium_PL_df.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm      Species
0   1            5.1           3.5            1.4  Iris-setosa
1   2            4.9           3.0            1.4  Iris-setosa
2   3            4.7           3.2            1.3  Iris-setosa
3   4            4.6           3.1            1.5  Iris-setosa
4   5            5.0           3.6            1.4  Iris-setosa
    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm          Species
50  51            7.0           3.2            4.7  Iris-versicolor
51  52            6.4           3.2            4.5  Iris-versicolor
52  53            6.9           3.1            4.9  Iris-versicolor
54  55            6.5           2.8            4.6  Iris-versicolor
56  57            6.3           3.3            4.7  Iris-versicolor
      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm         Species
109  110            7.2           3.6            6.1  Iris-virginica
117  118            7.7           3.8            6.7  Iris-virginica
131  13

In [None]:
# Grouping and Aggregation
# Every summary below is computed per species, so build the grouped view once and reuse it
by_species = df.groupby('Species')
sepal_width_groups = by_species['SepalWidthCm']
petal_length_groups = by_species['PetalLengthCm']

# 1st method: average Sepal Length (SL) of each of the three flower species
mean_SL_species = by_species['SepalLengthCm'].mean()
print(f"Average Sepal Length for each species: \n{mean_SL_species}")

# 2nd method: several aggregation functions applied at once to Sepal Width, per species
sepal_width_stats = sepal_width_groups.agg(['mean', 'std', 'sum', 'count'])
print(f"Grouped flower species based on multi aggr func for Sepal Width: \n{sepal_width_stats}")

# 3rd method: variance of Sepal Width (SW), per species
var_SW_species = sepal_width_groups.var()
print(f"Grouped flower species based on variance of Sepal Width: \n{var_SW_species}")

# 4th method: range (max - min) of Petal Length (PL), per species
range_PL_species = petal_length_groups.max() - petal_length_groups.min()
print(f"Grouped flower species based on aggregation of the range of Petal Length: \n{range_PL_species}")

Average Sepal Length for each species: 
Species
Iris-setosa        5.006
Iris-versicolor    5.936
Iris-virginica     6.588
Name: SepalLengthCm, dtype: float64
Grouped flower species based on multi aggr func for Sepal Width: 
                  mean       std    sum  count
Species                                       
Iris-setosa      3.418  0.381024  170.9     50
Iris-versicolor  2.770  0.313798  138.5     50
Iris-virginica   2.974  0.322497  148.7     50
Grouped flower species based on variance of Sepal Width: 
Species
Iris-setosa        0.145180
Iris-versicolor    0.098469
Iris-virginica     0.104004
Name: SepalWidthCm, dtype: float64
Grouped flower species based on aggregation of the range of Petal Length: 
Species
Iris-setosa        0.9
Iris-versicolor    2.1
Iris-virginica     2.4
Name: PetalLengthCm, dtype: float64


# Output Findings (Filtering)
**1st filtering method** returned only the Iris-setosa rows and revealed that this species' sepal and petal measurements differ from one another. Specifically, in the rows displayed, the values follow a descending order: Sepal Length > Sepal Width > Petal Length. **2nd filtering method** returned the flowers, of any species, whose Sepal Length is greater than 6.0 cm. The first five rows displayed were all Iris-versicolor; because `head()` shows only the first five matches in dataset order, the full filtered result should be counted per species before concluding which species makes up the majority. This filter is meaningful for differentiating species based on larger sepal measurements. **3rd filtering method** identified three Iris-virginica flowers with a Sepal Width greater than 3.5 cm; their Sepal Length and Petal Length values are roughly twice their Sepal Width. **4th filtering method** extracted flowers with a Petal Length between 0 and 1.0 cm (inclusive); it returned only one flower from the whole dataset, an Iris-setosa with a Petal Length equal to 1.0 cm.

# Output Findings (Grouping & Aggregation)
**1st grouping method** reveals that Iris-setosa has the shortest average sepal length (5.006 cm), while Iris-virginica has the longest (6.588 cm), with Iris-versicolor in between (5.936 cm). This suggests differences in the overall size of the species. **2nd grouping method** reveals the mean sepal width is largest in Iris-setosa (3.418 cm) and smallest in Iris-versicolor (2.770 cm). The standard deviation is also highest in Iris-setosa (0.381), indicating higher variability in this species' sepal width. Despite the differences in sepal width, all species have the same count of 50 samples. **3rd grouping method** reveals that Iris-setosa has the highest variance (0.145), while Iris-versicolor has the lowest (0.098), suggesting more consistency in sepal width measurements for Iris-versicolor compared to the others. **4th grouping method** reveals that the range of petal length is smallest in Iris-setosa (0.9 cm), while Iris-virginica has the largest range (2.4 cm), showing more differences in the petal lengths of the Iris-virginica species compared to the others.






