# Iris

### Introduction:

This exercise may seem a little bit strange, but keep doing it.

### Step 1. Import the necessary libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Step 2. Import the dataset from this [address](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data). 

### Step 3. Assign it to a variable called iris

In [2]:
iris_df= pd.read_csv(r'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')
iris_df.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


### Step 4. Create columns for the dataset

In [57]:
# 1. sepal_length (in cm)
# 2. sepal_width (in cm)
# 3. petal_length (in cm)
# 4. petal_width (in cm)
# 5. class

In [3]:
column_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
iris_df = pd.read_csv( r'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', names=column_names)

iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Step 5.  Is there any missing value in the dataframe?

In [4]:
print(iris_df.isnull().sum())

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64


### Step 6.  Lets set the values of the rows 10 to 29 of the column 'petal_length' to NaN

In [5]:
iris_df.loc[10:29, 'petal_length'] = np.nan

print(iris_df.iloc[8:31])

    sepal_length  sepal_width  petal_length  petal_width        class
8            4.4          2.9           1.4          0.2  Iris-setosa
9            4.9          3.1           1.5          0.1  Iris-setosa
10           5.4          3.7           NaN          0.2  Iris-setosa
11           4.8          3.4           NaN          0.2  Iris-setosa
12           4.8          3.0           NaN          0.1  Iris-setosa
13           4.3          3.0           NaN          0.1  Iris-setosa
14           5.8          4.0           NaN          0.2  Iris-setosa
15           5.7          4.4           NaN          0.4  Iris-setosa
16           5.4          3.9           NaN          0.4  Iris-setosa
17           5.1          3.5           NaN          0.3  Iris-setosa
18           5.7          3.8           NaN          0.3  Iris-setosa
19           5.1          3.8           NaN          0.3  Iris-setosa
20           5.4          3.4           NaN          0.2  Iris-setosa
21           5.1    

### Step 7. Good, now lets substitute the NaN values to 1.0

In [6]:
iris_df['petal_length'].fillna(1.0, inplace=True)

print(iris_df.iloc[8:31])

    sepal_length  sepal_width  petal_length  petal_width        class
8            4.4          2.9           1.4          0.2  Iris-setosa
9            4.9          3.1           1.5          0.1  Iris-setosa
10           5.4          3.7           1.0          0.2  Iris-setosa
11           4.8          3.4           1.0          0.2  Iris-setosa
12           4.8          3.0           1.0          0.1  Iris-setosa
13           4.3          3.0           1.0          0.1  Iris-setosa
14           5.8          4.0           1.0          0.2  Iris-setosa
15           5.7          4.4           1.0          0.4  Iris-setosa
16           5.4          3.9           1.0          0.4  Iris-setosa
17           5.1          3.5           1.0          0.3  Iris-setosa
18           5.7          3.8           1.0          0.3  Iris-setosa
19           5.1          3.8           1.0          0.3  Iris-setosa
20           5.4          3.4           1.0          0.2  Iris-setosa
21           5.1    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  iris_df['petal_length'].fillna(1.0, inplace=True)


### Step 8. Now let's delete the column class

In [7]:
iris_df = iris_df.drop('class', axis=1)

iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


### Step 9.  Set the first 3 rows as NaN

In [8]:
iris_df.iloc[0:3] = np.nan

iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,,,,
1,,,,
2,,,,
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


### Step 10.  Delete the rows that have NaN

In [9]:
iris_df_cleaned = iris_df.dropna()

print("DataFrame after dropping NaN rows:")
print(iris_df_cleaned.head())
print("\nNumber of rows before and after dropping NaN:")
print(f"Before: {len(iris_df)}, After: {len(iris_df_cleaned)}")

DataFrame after dropping NaN rows:
   sepal_length  sepal_width  petal_length  petal_width
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
5           5.4          3.9           1.7          0.4
6           4.6          3.4           1.4          0.3
7           5.0          3.4           1.5          0.2

Number of rows before and after dropping NaN:
Before: 150, After: 147


### Step 11. Reset the index so it begins with 0 again

In [10]:
iris_df_cleaned.reset_index(drop=True, inplace=True)

print("DataFrame after resetting index:")
iris_df_cleaned.head()

DataFrame after resetting index:


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,4.6,3.1,1.5,0.2
1,5.0,3.6,1.4,0.2
2,5.4,3.9,1.7,0.4
3,4.6,3.4,1.4,0.3
4,5.0,3.4,1.5,0.2


### BONUS: Create your own question and answer it.

In [11]:
iris_df_cleaned.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,147.0,147.0,147.0,147.0
mean,5.862585,3.05034,3.741497,1.219048
std,0.824971,0.436263,1.84052,0.757278
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.5,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [12]:
iris_df_cleaned.corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.102636,0.856441,0.812784
sepal_width,-0.102636,1.0,-0.43527,-0.353088
petal_length,0.856441,-0.43527,1.0,0.958432
petal_width,0.812784,-0.353088,0.958432,1.0
