In [1]:
import pandas as pd
import os

# Download the iris dataset once and cache it locally.
# The download is skipped when data/raw/iris.csv already exists, so
# Restart & Run All only needs the network on the very first run.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
local_path = "data/raw/iris.csv"
os.makedirs("data/raw", exist_ok=True)
if not os.path.exists(local_path):
    # index=False: do not write the row numbers as an extra column
    pd.read_csv(url).to_csv(local_path, index=False)
    print(" \u2705 DOWNLOADED data/raw/iris.csv")

# Always load from the local copy so the message below is accurate.
df = pd.read_csv(local_path)

# Guard against a stale or damaged cache: the cells below rely on these columns.
expected_columns = {"sepal_length", "sepal_width", "petal_length", "petal_width", "species"}
missing_columns = expected_columns - set(df.columns)
assert not missing_columns, "iris.csv is missing columns: " + ", ".join(sorted(missing_columns))

# Quick look at the data: size, first rows and numeric summary.
print("LOCAL Iris Loaded! Shape:", df.shape)
print("first five rows:")
print(df.head())
print("Summary Statistics:")
print(df.describe())

# Number of rows per species.
species_count = df["species"].value_counts()
species_count.name = "Unique Species"
print(species_count)

# Distinct species labels; left as the last expression so the notebook displays it.
species = df["species"]
species.unique()




 ✅ DOWNLOADED data/raw/iris.csv
LOCAL Iris Loaded! Shape: (150, 5)
first five rows:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
Summary Statistics:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.4000

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [2]:
# Selection and filtering

# Select one column (the result is a Series)
sepal_len = df["sepal_length"]
print(sepal_len.head())

# Select several columns at once (the result is a DataFrame)
sepal_columns = ["sepal_length", "sepal_width"]
sepal = df[sepal_columns]
print(sepal.head())

# Filter rows on a single condition
is_long = df["sepal_length"] > 6.0
long_sepal = df[is_long]
print(long_sepal.head())

# Filter rows on several conditions combined with & (all must hold)
is_wide = df["sepal_width"] > 3.0
is_virginica = df["species"] == "virginica"
ideal_sepal = df[is_long & is_wide & is_virginica]
print(ideal_sepal.head(10))


0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal_length, dtype: float64
   sepal_length  sepal_width
0           5.1          3.5
1           4.9          3.0
2           4.7          3.2
3           4.6          3.1
4           5.0          3.6
    sepal_length  sepal_width  petal_length  petal_width     species
50           7.0          3.2           4.7          1.4  versicolor
51           6.4          3.2           4.5          1.5  versicolor
52           6.9          3.1           4.9          1.5  versicolor
54           6.5          2.8           4.6          1.5  versicolor
56           6.3          3.3           4.7          1.6  versicolor
     sepal_length  sepal_width  petal_length  petal_width    species
100           6.3          3.3           6.0          2.5  virginica
109           7.2          3.6           6.1          2.5  virginica
110           6.5          3.2           5.1          2.0  virginica
115           6.4          3.2           5.3          2.

In [3]:
# Add a new feature column: petal area = petal length x petal width
df["petal_area"] = df["petal_length"].mul(df["petal_width"])

# Show the two inputs next to the derived column
petal_columns = ["petal_length", "petal_width", "petal_area"]
print(df[petal_columns].head(10))

# The frame now has one more column than before
print("New Shape:", df.shape)
print(df.head(10))

   petal_length  petal_width  petal_area
0           1.4          0.2        0.28
1           1.4          0.2        0.28
2           1.3          0.2        0.26
3           1.5          0.2        0.30
4           1.4          0.2        0.28
5           1.7          0.4        0.68
6           1.4          0.3        0.42
7           1.5          0.2        0.30
8           1.4          0.2        0.28
9           1.5          0.1        0.15
New Shape: (150, 6)
   sepal_length  sepal_width  petal_length  petal_width species  petal_area
0           5.1          3.5           1.4          0.2  setosa        0.28
1           4.9          3.0           1.4          0.2  setosa        0.28
2           4.7          3.2           1.3          0.2  setosa        0.26
3           4.6          3.1           1.5          0.2  setosa        0.30
4           5.0          3.6           1.4          0.2  setosa        0.28
5           5.4          3.9           1.7          0.4  setosa        0.

In [4]:
# Group by species and average the petal area within each group
mean_area = (
    df.groupby("species")["petal_area"]
    .mean()
    .rename("petal_mean_area")
)
print("mean petal area of each species:")
print(mean_area)

mean petal area of each species:
species
setosa         0.3656
versicolor     5.7204
virginica     11.2962
Name: petal_mean_area, dtype: float64
