 Step 1: Import Required Python Libraries

In [3]:
import pandas as pd
import numpy as np

Step 2: Load the Dataset into Pandas DataFrame

In [4]:
# Read the iris CSV straight from the seaborn-data GitHub repository (needs network access).
# NOTE(review): the URL points at the `master` branch, so the file contents are not pinned —
# confirm that is acceptable for reproducibility.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
df = pd.read_csv(url)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


Step 3: Data Preprocessing

In [5]:
# Missing-value count for every column.
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(150, 5)

Step 4: Data Formatting & Normalization (dtype conversion here; Min-Max scaling is applied in the final cell)

In [8]:
# Inspect the dtype pandas inferred for each column before converting anything.
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [9]:
# Store the species labels as a pandas categorical; the `.cat` accessor used by the
# label-encoding cells further down relies on this dtype.
df['species'] = df['species'].astype('category')
# Re-check the dtypes to confirm the conversion.
df.dtypes

sepal_length     float64
sepal_width      float64
petal_length     float64
petal_width      float64
species         category
dtype: object

Step 5: Convert Categorical Variables to Quantitative

In [10]:
# Label encoding: add `species_encoded`, holding the integer category code of each species.
# This adds the column to `df` in place.
df['species_encoded'] = df['species'].cat.codes
# Show the label next to its code to check the mapping.
df[['species', 'species_encoded']].head()

Unnamed: 0,species,species_encoded
0,setosa,0
1,setosa,0
2,setosa,0
3,setosa,0
4,setosa,0


In [11]:
# One-hot encoding: expand `species` into one indicator column per category.
# The result goes into a separate frame, so `df` itself is left unchanged here.
df_encoded = pd.get_dummies(data=df, columns=['species'], prefix='species')
df_encoded.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_encoded,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,0,True,False,False
1,4.9,3.0,1.4,0.2,0,True,False,False
2,4.7,3.2,1.3,0.2,0,True,False,False
3,4.6,3.1,1.5,0.2,0,True,False,False
4,5.0,3.6,1.4,0.2,0,True,False,False


In [12]:
# Column names, non-null counts, dtypes and memory footprint of `df` at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   sepal_length     150 non-null    float64 
 1   sepal_width      150 non-null    float64 
 2   petal_length     150 non-null    float64 
 3   petal_width      150 non-null    float64 
 4   species          150 non-null    category
 5   species_encoded  150 non-null    int8    
dtypes: category(1), float64(4), int8(1)
memory usage: 5.2 KB


In [14]:
# List the column labels currently present in `df`.
print(df.columns)

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species',
       'species_encoded'],
      dtype='object')


In [15]:
# Label encoding: store each species' integer category code in `species_encoded`.
# NOTE(review): this repeats the assignment already made in the earlier Step 5 cell, and it
# reads `df['species']`, so it must run before the one-hot cell that follows replaces that column.
df['species_encoded'] = df['species'].cat.codes

In [16]:
# One-hot encode `species` directly on `df`: the column is replaced by one
# `species_<category>` indicator column per category.
# The guard keeps the cell safe to re-run: after the first run `species` no longer
# exists, and calling get_dummies on it again would raise a KeyError.
if 'species' in df.columns:
    df = pd.get_dummies(df, columns=['species'], prefix='species')

In [17]:
# NOTE(review): consider moving this import up to the import cell at the top of the notebook.
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: rescale each measurement column to MinMaxScaler's default
# feature range of [0, 1]. Only the four measurement columns are touched.
# The column list is named once so the selection and the assignment cannot drift apart.
numeric_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])