In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw iris measurements into a DataFrame.
# `path` is relative, so it is resolved against the kernel's working directory.

path = './iris-data.csv'
iris_data = pd.read_csv(filepath_or_buffer=path)

In [3]:
# First EDA

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
iris_data.info()

# Distinct labels, shown as the cell result, to spot misspelled class names.
iris_data['class'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sepal_length_cm  150 non-null    float64
 1   sepal_width_cm   150 non-null    float64
 2   petal_length_cm  150 non-null    float64
 3   petal_width_cm   145 non-null    float64
 4   class            150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',
       'Iris-virginica'], dtype=object)

In [4]:
# Fix class names

# Map each misspelled label to its canonical spelling. A dict passed to
# Series.replace matches whole values only, so labels that are already correct
# (e.g. 'Iris-versicolor', which merely contains 'versicolor') are left as-is.
class_name_fixes = {
    'Iris-setossa': 'Iris-setosa',
    'versicolor': 'Iris-versicolor',
}
iris_data['class'] = iris_data['class'].replace(class_name_fixes)

In [5]:
# Drop missing values

# Remove every row that has a NaN in any column, then renumber the index from
# 0 so it is contiguous again (the old index is discarded, not kept as a column).
iris_data = (
    iris_data
    .dropna(axis='index', how='any')
    .reset_index(drop=True)
)

In [6]:
# Feature engineering

# Append a length-to-width ratio for the sepals and for the petals; the new
# columns land after the existing ones, sepal ratio first.
iris_data = iris_data.assign(
    sepal_length_width_ratio=iris_data['sepal_length_cm'].div(iris_data['sepal_width_cm']),
    petal_length_width_ratio=iris_data['petal_length_cm'].div(iris_data['petal_width_cm']),
)
iris_data

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class,sepal_length_width_ratio,petal_length_width_ratio
0,5.1,3.5,1.4,0.2,Iris-setosa,1.457143,7.000000
1,4.9,3.0,1.4,0.2,Iris-setosa,1.633333,7.000000
2,4.7,3.2,1.3,0.2,Iris-setosa,1.468750,6.500000
3,4.6,3.1,1.5,0.2,Iris-setosa,1.483871,7.500000
4,5.0,3.6,1.4,0.2,Iris-setosa,1.388889,7.000000
...,...,...,...,...,...,...,...
140,6.7,3.0,5.2,2.3,Iris-virginica,2.233333,2.260870
141,6.3,2.5,5.0,2.3,Iris-virginica,2.520000,2.173913
142,6.5,3.0,5.2,2.0,Iris-virginica,2.166667,2.600000
143,6.2,3.4,5.4,2.3,Iris-virginica,1.823529,2.347826


In [7]:
# Second EDA

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns after cleaning and feature engineering.
# NOTE(review): the saved output reports min sepal_length_cm = 0.055 against a
# 25% quartile of 5.1 - possibly values recorded in a different unit. Confirm
# and clean before scaling, since such outliers skew the mean/std used later.
iris_data.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,sepal_length_width_ratio,petal_length_width_ratio
count,145.0,145.0,145.0,145.0,145.0,145.0
mean,5.670303,3.046207,3.836552,1.236552,1.893118,4.204257
std,1.32654,0.435096,1.742955,0.755058,0.523034,2.48981
min,0.055,2.0,1.0,0.1,0.02069,2.125
25%,5.1,2.8,1.6,0.4,1.545455,2.789474
50%,5.8,3.0,4.4,1.3,2.026316,3.266667
75%,6.4,3.3,5.1,1.8,2.2,4.333333
max,7.9,4.4,6.9,2.5,2.961538,15.0


In [8]:
# Encoding

# One-hot encode the species label into one 'class_<label>' indicator column
# per distinct class; the original 'class' column is replaced by them.
# dtype=int pins the indicators to 0/1: newer pandas releases default to bool,
# which would change both the displayed frame and the values written to CSV.
iris_encoded_data = pd.get_dummies(iris_data, columns=['class'], dtype=int)
iris_encoded_data

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,sepal_length_width_ratio,petal_length_width_ratio,class_Iris-setosa,class_Iris-versicolor,class_Iris-virginica
0,5.1,3.5,1.4,0.2,1.457143,7.000000,1,0,0
1,4.9,3.0,1.4,0.2,1.633333,7.000000,1,0,0
2,4.7,3.2,1.3,0.2,1.468750,6.500000,1,0,0
3,4.6,3.1,1.5,0.2,1.483871,7.500000,1,0,0
4,5.0,3.6,1.4,0.2,1.388889,7.000000,1,0,0
...,...,...,...,...,...,...,...,...,...
140,6.7,3.0,5.2,2.3,2.233333,2.260870,0,0,1
141,6.3,2.5,5.0,2.3,2.520000,2.173913,0,0,1
142,6.5,3.0,5.2,2.0,2.166667,2.600000,0,0,1
143,6.2,3.4,5.4,2.3,1.823529,2.347826,0,0,1


In [9]:
# Scaling

# Using scikit-learn .StandardScaler()

# Every column except the one-hot 'class_*' indicators is a numeric feature.
# Taking the names from the frame keeps the labels and the scaled values in
# the same order without repeating the column list by hand.
feature_columns = [
    column for column in iris_encoded_data.columns
    if not column.startswith('class_')
]

# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(iris_encoded_data[feature_columns])

# Reuse the source index so a later join with the class columns aligns rows
# by label even if the index is not a fresh 0..n-1 range.
scaled_df = pd.DataFrame(
    scaled_data,
    columns=feature_columns,
    index=iris_encoded_data.index,
)
scaled_df

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,sepal_length_width_ratio,petal_length_width_ratio
0,-0.431408,1.046588,-1.402789,-1.377569,-0.836439,1.126766
1,-0.582699,-0.106567,-1.402789,-1.377569,-0.498409,1.126766
2,-0.733990,0.354695,-1.460362,-1.377569,-0.814170,0.925252
3,-0.809635,0.124064,-1.345216,-1.377569,-0.785160,1.328281
4,-0.507054,1.277219,-1.402789,-1.377569,-0.967387,1.126766
...,...,...,...,...,...,...
140,0.778918,-0.106567,0.784974,1.413314,0.652719,-0.783242
141,0.476336,-1.259723,0.669829,1.413314,1.202703,-0.818288
142,0.627627,-0.106567,0.784974,1.014616,0.524816,-0.646563
143,0.400691,0.815957,0.900120,1.413314,-0.133509,-0.748196


In [10]:
# Re-attach the one-hot class indicators (columns 6-8 of the encoded frame)
# to the scaled features; join aligns the two frames on their index.
iris_final = scaled_df.join(iris_encoded_data.iloc[:, 6:9])

# to_csv does not create missing directories, so make sure the target folder
# exists before writing; exist_ok keeps re-runs from failing.
output_path = Path('./output/iris_final.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
iris_final.to_csv(output_path, index=False)
iris_final

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,sepal_length_width_ratio,petal_length_width_ratio,class_Iris-setosa,class_Iris-versicolor,class_Iris-virginica
0,-0.431408,1.046588,-1.402789,-1.377569,-0.836439,1.126766,1,0,0
1,-0.582699,-0.106567,-1.402789,-1.377569,-0.498409,1.126766,1,0,0
2,-0.733990,0.354695,-1.460362,-1.377569,-0.814170,0.925252,1,0,0
3,-0.809635,0.124064,-1.345216,-1.377569,-0.785160,1.328281,1,0,0
4,-0.507054,1.277219,-1.402789,-1.377569,-0.967387,1.126766,1,0,0
...,...,...,...,...,...,...,...,...,...
140,0.778918,-0.106567,0.784974,1.413314,0.652719,-0.783242,0,0,1
141,0.476336,-1.259723,0.669829,1.413314,1.202703,-0.818288,0,0,1
142,0.627627,-0.106567,0.784974,1.014616,0.524816,-0.646563,0,0,1
143,0.400691,0.815957,0.900120,1.413314,-0.133509,-0.748196,0,0,1
