In [4]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from collections import Counter
from sklearn.datasets import make_classification

In [14]:
# Load the raw iris measurements from the CSV that sits next to the notebook,
# then echo the frame as the cell output.
iris_df = pd.read_csv(filepath_or_buffer='./iris-data.csv')
iris_df

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,2.3,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [8]:
# Print column dtypes and non-null counts. In the recorded output every column
# has 150 non-null values except petal_width_cm, which has 145.
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sepal_length_cm  150 non-null    float64
 1   sepal_width_cm   150 non-null    float64
 2   petal_length_cm  150 non-null    float64
 3   petal_width_cm   145 non-null    float64
 4   class            150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [19]:
# Summary statistics for the numeric columns.
# NOTE(review): in the recorded output sepal_length_cm has a minimum of 0.055
# against a 25% quartile of 5.1 - this looks like a few rows may be mistyped or
# recorded in a different unit; confirm before scaling.
iris_df.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
count,150.0,150.0,150.0,145.0
mean,5.644627,3.054667,3.758667,1.236552
std,1.312781,0.433123,1.76442,0.755058
min,0.055,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.4
50%,5.7,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [20]:
# Distinct species labels as they appear in the raw data, in order of first
# appearance - used to spot misspelled or inconsistent labels.
pd.unique(iris_df['class'])

array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',
       'Iris-virginica'], dtype=object)

In [21]:
# Normalise the inconsistent species labels seen in the raw data:
#   'Iris-setossa' is a misspelling of 'Iris-setosa'
#   'versicolor'   is missing the 'Iris-' prefix
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that form is an in-place
# call on an intermediate object, which pandas 2.2 warns about and which leaves
# iris_df unchanged once Copy-on-Write is active.
iris_df['class'] = iris_df['class'].replace({
    'Iris-setossa': 'Iris-setosa',
    'versicolor': 'Iris-versicolor',
})

# Show the remaining distinct labels to confirm the clean-up.
iris_df['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [22]:
# Count missing values per column.
iris_df.isna().sum()

sepal_length_cm    0
sepal_width_cm     0
petal_length_cm    0
petal_width_cm     5
class              0
dtype: int64

In [23]:
# Median petal width; missing entries are skipped by default.
iris_df.loc[:, 'petal_width_cm'].median()

1.3

In [25]:
# Work on an independent (deep) copy so that the imputation applied to
# iris_input does not modify iris_df.
iris_input = iris_df.copy(deep=True)
iris_input

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,2.3,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [26]:
# Distinct species labels present in the working copy.
pd.unique(iris_input['class'])

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [29]:
# Impute the missing petal widths with the mean of the observed values
# (the mean is computed before the gaps are filled).
petal_width = iris_input['petal_width_cm']
iris_input['petal_width_cm'] = petal_width.fillna(value=petal_width.mean())

In [30]:
# Re-count missing values per column to confirm the imputation left no gaps.
iris_input.isna().sum()

sepal_length_cm    0
sepal_width_cm     0
petal_length_cm    0
petal_width_cm     0
class              0
dtype: int64

In [36]:
# One-hot encode the species label.
# drop_first=True drops the indicator for the first category, so that category
# is represented by zeros in all remaining class_* columns.
# dtype=int pins the indicators to 0/1 integers: the get_dummies default dtype
# is bool from pandas 2.0 on, which would otherwise yield True/False columns
# depending on the installed pandas version.
iris_one_hot_encoding = pd.get_dummies(
    iris_input,
    columns=['class'],
    drop_first=True,
    dtype=int,
)
iris_one_hot_encoding

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class_Iris-versicolor,class_Iris-virginica
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,1
146,6.3,2.5,5.0,2.3,0,1
147,6.5,3.0,5.2,2.0,0,1
148,6.2,3.4,5.4,2.3,0,1


In [37]:
# Keep only the four continuous measurement columns (the one-hot class
# indicators are left out) and summarise them.
sample_data = iris_one_hot_encoding.loc[
    :,
    [
        'sepal_length_cm',
        'sepal_width_cm',
        'petal_length_cm',
        'petal_width_cm',
    ],
]
sample_data.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
count,150.0,150.0,150.0,150.0
mean,5.644627,3.054667,3.758667,1.236552
std,1.312781,0.433123,1.76442,0.742281
min,0.055,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.4
50%,5.7,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [38]:
# Standardise each measurement column: learn the per-column statistics from
# sample_data, then apply the transformation to the same data.
scaler = StandardScaler()
scaler.fit(sample_data)
scaled_data = scaler.transform(sample_data)
scaled_data

array([[-4.16254538e-01,  1.03163593e+00, -1.34127240e+00,
        -1.40111854e+00],
       [-5.69113207e-01, -1.26637944e-01, -1.34127240e+00,
        -1.40111854e+00],
       [-7.21971877e-01,  3.36671607e-01, -1.39813811e+00,
        -1.40111854e+00],
       [-7.98401212e-01,  1.05016832e-01, -1.28440670e+00,
        -1.40111854e+00],
       [-4.92683873e-01,  1.26329071e+00, -1.34127240e+00,
        -1.40111854e+00],
       [-1.86966534e-01,  1.95825503e+00, -1.17067529e+00,
        -1.13077631e+00],
       [-7.98401212e-01,  7.99981158e-01, -1.34127240e+00,
        -1.26594742e+00],
       [-4.92683873e-01,  7.99981158e-01, -1.28440670e+00,
         6.00280343e-16],
       [-9.51259881e-01, -3.58292719e-01, -1.34127240e+00,
         6.00280343e-16],
       [-5.69113207e-01,  1.05016832e-01, -1.28440670e+00,
         6.00280343e-16],
       [-1.86966534e-01,  1.49494548e+00, -1.28440670e+00,
         6.00280343e-16],
       [-6.45542542e-01,  7.99981158e-01, -1.22754100e+00,
      

In [42]:
# Wrap the scaled array back into a DataFrame.
# Column labels and row index are taken from sample_data (the frame that was
# scaled) instead of repeating a hard-coded column list, so the labels cannot
# drift from the scaler input and the rows stay aligned with the source frame
# for later joins.
scaled_df = pd.DataFrame(
    scaled_data,
    columns=sample_data.columns,
    index=sample_data.index,
)
scaled_df

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
0,-0.416255,1.031636,-1.341272,-1.401119
1,-0.569113,-0.126638,-1.341272,-1.401119
2,-0.721972,0.336672,-1.398138,-1.401119
3,-0.798401,0.105017,-1.284407,-1.401119
4,-0.492684,1.263291,-1.341272,-1.401119
...,...,...,...,...
145,0.806615,-0.126638,0.819624,1.437475
146,0.500897,-1.284912,0.705893,1.437475
147,0.653756,-0.126638,0.819624,1.031962
148,0.424468,0.799981,0.933356,1.437475
