In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [3]:
# Load the iris dataset through scikit-learn and assemble a single frame:
# one column per entry of iris.feature_names, plus the integer target codes
# stored under `species`.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(species=iris.target)
print("Initial Dataset:", df.head(), sep="\n")

Initial Dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  


In [4]:
# Report the frame's dimensions as a (rows, columns) tuple.
print("\nDataset Shape:", df.shape, sep="\n")


Dataset Shape:
(150, 5)


In [5]:
# Count null entries per column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:", missing_per_column, sep="\n")



Missing Values:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64


In [6]:
# Count rows that exactly repeat an earlier row (first occurrences are not counted).
duplicate_count = df.duplicated().sum()
print("\nDuplicate Records:", duplicate_count, sep="\n")


Duplicate Records:
1


In [7]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Rebinding the result instead of using inplace=True avoids mutating the frame
# object that earlier cells displayed, and keeps the step chainable.
# The original index labels are kept on purpose (no reset_index), so row labels
# seen in later cells are unchanged.
df = df.drop_duplicates()

In [8]:
# Show the storage dtype of every column.
print("\nData Types:", df.dtypes, sep="\n")


Data Types:
sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
species                int64
dtype: object


In [9]:
# Translate the integer codes in `species` into readable species names.
#
# Series.map() converts every value that is missing from the mapping into NaN,
# so re-running this cell after the column already holds names would blank the
# whole column. The dtype guard makes the cell safe to execute more than once:
# the mapping is applied only while the column is still numeric.
species_names = {
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica'
}
if pd.api.types.is_numeric_dtype(df['species']):
    df['species'] = df['species'].map(species_names)

print("\nDataset after encoding species:")
print(df.head())



Dataset after encoding species:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


In [10]:
# IQR-based outlier filter, applied to the numeric columns only
# (select_dtypes leaves out any non-numeric column).
numeric_part = df.select_dtypes(include=[np.number])

# Per-column quartiles and interquartile range.
Q1 = numeric_part.quantile(0.25)
Q3 = numeric_part.quantile(0.75)
IQR = Q3 - Q1

# A value is flagged when it lies more than 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
too_low = numeric_part < lower_fence
too_high = numeric_part > upper_fence

# Drop every row that has at least one flagged value in any numeric column.
has_outlier = (too_low | too_high).any(axis=1)
df_cleaned = df[~has_outlier]

print("\nShape after removing outliers:", df_cleaned.shape, sep="\n")


Shape after removing outliers:
(145, 5)


In [11]:
# Preview the first rows of the outlier-filtered frame.
print("\nFinal Cleaned Dataset:", df_cleaned.head(), sep="\n")


Final Cleaned Dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  
