## **Data Preprocessing**

In [2]:
# importing necessary libraries
import pandas as pd
from scipy import stats
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler

# imputer library
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
# importing our dataset from scikit-learn
from sklearn.datasets import load_iris

In [4]:
# load the iris dataset object; its .data, .feature_names and .target
# attributes are read in the next cell
iris_flower = load_iris()

In [5]:
# feature table: one labelled column per measurement in the dataset
features = pd.DataFrame(
    data=iris_flower.data,
    columns=iris_flower.feature_names,
)

# target values, taken as-is from the dataset object
target = iris_flower.target

In [6]:
# peek at the top of the feature table (first five rows)
features.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


##### **1. Handling Missing Values**

In [7]:
# count the missing (NaN) entries in each feature column
features.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [8]:
# taking a snapshot of our dataset: index range, column names,
# non-null counts, dtypes and memory usage
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


This dataset does not have any missing values, so we will introduce some artificially in order to practise handling them. We can do this by indexing and slicing with `.iloc[]` or `.loc[]`; the next cell uses `.loc[]`.

In [9]:
# (row labels, column) pairs whose values are blanked out to simulate missing data.
# .loc slices are label-based and include both endpoints, so 50..54 covers
# five rows and 100..102 covers three.
cells_to_blank = (
    (10, "sepal length (cm)"),
    (slice(50, 54), "sepal width (cm)"),
    (slice(100, 102), "petal length (cm)"),
)
for row_labels, column in cells_to_blank:
    features.loc[row_labels, column] = None

In [10]:
# re-count the missing entries per column now that gaps have been injected
features.isna().sum()

sepal length (cm)    1
sepal width (cm)     5
petal length (cm)    3
petal width (cm)     0
dtype: int64

In [11]:
# imputer instance configured with strategy="mean": missing entries are
# replaced by the mean of the column it is fitted on
impute_mean = SimpleImputer(strategy="mean")

In [12]:
# imputer instance configured with strategy="median": missing entries are
# replaced by the median of the column it is fitted on
impute_median = SimpleImputer(strategy="median")

In [13]:
# Fill the gaps column by column: the two sepal columns use the mean imputer,
# petal length uses the median imputer. Each imputer is re-fitted on the single
# column it is applied to, in the order listed.
imputation_plan = (
    ("sepal length (cm)", impute_mean),
    ("sepal width (cm)", impute_mean),
    ("petal length (cm)", impute_median),
)
for column, imputer in imputation_plan:
    features[[column]] = imputer.fit_transform(features[[column]])

In [14]:
# confirm that imputation left no missing values in any column
features.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

##### **2. Handling Outliers**

In [19]:
# calculating the z-score for the whole dataframe:
# (value - column mean) / column standard deviation, one column at a time.
# `stats` is already imported in the import cell at the top of the notebook,
# so it is not re-imported here. scipy's defaults apply (axis=0, ddof=0).
z_scores = stats.zscore(features)

# printing the z_score
print(z_scores)


[[-9.05163302e-01  1.02053680e+00 -1.34014407e+00 -1.31544430e+00]
 [-1.14773404e+00 -1.44643012e-01 -1.34014407e+00 -1.31544430e+00]
 [-1.39030478e+00  3.21428915e-01 -1.39764453e+00 -1.31544430e+00]
 [-1.51159016e+00  8.83929516e-02 -1.28264361e+00 -1.31544430e+00]
 [-1.02644867e+00  1.25357277e+00 -1.34014407e+00 -1.31544430e+00]
 [-5.41307191e-01  1.95268066e+00 -1.16764268e+00 -1.05217993e+00]
 [-1.51159016e+00  7.87500841e-01 -1.34014407e+00 -1.18381211e+00]
 [-1.02644867e+00  7.87500841e-01 -1.28264361e+00 -1.31544430e+00]
 [-1.75416090e+00 -3.77678975e-01 -1.34014407e+00 -1.31544430e+00]
 [-1.14773404e+00  8.83929516e-02 -1.28264361e+00 -1.44707648e+00]
 [ 0.00000000e+00  1.48660873e+00 -1.28264361e+00 -1.31544430e+00]
 [-1.26901941e+00  7.87500841e-01 -1.22514315e+00 -1.31544430e+00]
 [-1.26901941e+00 -1.44643012e-01 -1.34014407e+00 -1.44707648e+00]
 [-1.87544627e+00 -1.44643012e-01 -1.51264545e+00 -1.44707648e+00]
 [-5.61657085e-02  2.18571662e+00 -1.45514499e+00 -1.31544430e