# Load the dataset

In [41]:
import pandas as pd 
from sklearn.impute import SimpleImputer
import numpy as np

# Read the raw dataset from disk and print it (pandas abbreviates long frames).
raw_path = 'sample_dataset.csv'
df = pd.read_csv(raw_path)
print(df)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0            NaN         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0              NaN   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34             NaN        NaN              NaN   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565          NaN         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30        NaN          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76           NaN           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [42]:
# Count how often each label occurs in 'area error' (missing values are not counted).
# The labels are strings, so this is the single categorical column;
# every other feature is numerical.
df['area error'].value_counts()

area error
A    489
B      4
C      1
Name: count, dtype: int64

In [43]:
# Show the dtype of every column to tell numerical features from categorical ones.
df.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                  object
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [44]:
# dtypes that this notebook treats as non-numerical
non_numeric_dtypes = ['object', 'category', 'bool']

# categorical columns: those stored with one of the dtypes above
categorical_variables = df.select_dtypes(include=non_numeric_dtypes).columns

# numerical columns: everything that is left
numerical_variables = df.select_dtypes(exclude=non_numeric_dtypes).columns


In [45]:
# Print the categorical group first, then the numerical group.
# Only 'area error' is categorical; cross-check the result against df.dtypes.
for column_group in (categorical_variables, numerical_variables):
    print(column_group)


Index(['area error'], dtype='object')
Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'smoothness error',
       'compactness error', 'concavity error', 'concave points error',
       'symmetry error', 'fractal dimension error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension', 'target'],
      dtype='object')


# Replacing the missing values in mean radius, mean texture and mean perimeter with their column means

In [46]:
# Keep every row, but only the first three columns.
X = df.iloc[:, :3]
print(X)

     mean radius  mean texture  mean perimeter
0            NaN         10.38          122.80
1          20.57         17.77          132.90
2          19.69         21.25          130.00
3          11.42         20.38           77.58
4          20.29         14.34             NaN
..           ...           ...             ...
564        21.56         22.39          142.00
565          NaN         28.25          131.20
566        16.60         28.08          108.30
567        20.60         29.33          140.10
568         7.76           NaN           47.92

[569 rows x 3 columns]


In [47]:
# Option 1: compute each column's mean by hand and assign it to the gaps.
# Only the last expression of a cell is displayed, so the means are stored
# in variables instead of being evaluated and discarded.
radius_mean = X['mean radius'].mean()
texture_mean = X['mean texture'].mean()
perimeter_mean = X['mean perimeter'].mean()

# fillna returns a new frame, so X keeps its gaps for the imputers used later.
X_manual = X.fillna({
    'mean radius': radius_mean,
    'mean texture': texture_mean,
    'mean perimeter': perimeter_mean,
})

perimeter_mean  # displayed value: mean of 'mean perimeter'




np.float64(92.03902534113061)

In [48]:
# Option 2: let scikit-learn's SimpleImputer compute and apply the column means.

# fit() only learns the per-column means and returns the imputer; X itself is not modified.
cleaner = SimpleImputer(strategy='mean').fit(X)

# transform() returns a copy of the data with every gap replaced by its column mean.
X_imputed = cleaner.transform(X)
print(X_imputed)  # the result is a NumPy array, not a DataFrame

[[ 14.05954772  10.38       122.8       ]
 [ 20.57        17.77       132.9       ]
 [ 19.69        21.25       130.        ]
 ...
 [ 16.6         28.08       108.3       ]
 [ 20.6         29.33       140.1       ]
 [  7.76        19.31182927  47.92      ]]


# Replacing missing values with the column median



In [49]:
# Same workflow as the mean imputer, but each gap receives its column's median.
cleaner_med = SimpleImputer(strategy='median')
X_imputer_med = cleaner_med.fit(X).transform(X)
print(X_imputer_med)  # gaps now hold the median of their column

[[ 13.28  10.38 122.8 ]
 [ 20.57  17.77 132.9 ]
 [ 19.69  21.25 130.  ]
 ...
 [ 16.6   28.08 108.3 ]
 [ 20.6   29.33 140.1 ]
 [  7.76  18.86  47.92]]


# Replacing missing values with a fixed value

In [50]:
# Constant strategy: every gap gets the same fixed value (0 here), whatever the column.
cleaner_fix = SimpleImputer(fill_value=0, strategy='constant')
X_imputer_fix = cleaner_fix.fit(X).transform(X)
print(X_imputer_fix)  # formerly missing entries are now 0

[[  0.    10.38 122.8 ]
 [ 20.57  17.77 132.9 ]
 [ 19.69  21.25 130.  ]
 ...
 [ 16.6   28.08 108.3 ]
 [ 20.6   29.33 140.1 ]
 [  7.76   0.    47.92]]


# Cleaning Categorical column

In [53]:
# Start again from the raw file and pick out the categorical column(s).
df = pd.read_csv('sample_dataset.csv')
categorical_frame = df.select_dtypes(include=['object', 'category', 'bool'])
categorical_variables = categorical_frame.columns
print(categorical_variables)

Index(['area error'], dtype='object')


In [56]:
# How many entries of 'area error' are missing?
df['area error'].isna().sum()

np.int64(75)

In [None]:
# One-column DataFrame (not a Series) that holds only the categorical feature.
X = df.loc[:, ['area error']]
print(X)

    area error
0            A
1            A
2            A
3            A
4          NaN
..         ...
564          A
565          A
566          A
567          A
568          A

[569 rows x 1 columns]


# Filling the missing values with the most frequent value in the column

In [None]:
# Learn the most common label of the column, then write it into every gap.
cleaner_categorical = SimpleImputer(strategy="most_frequent").fit(X)
Categorical_X = cleaner_categorical.transform(X)
print(Categorical_X)  # 'A' is the most frequent label, so the former gaps now show 'A'

[['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['B']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']

# Filling the categorical column with a constant placeholder category

In [59]:
# Fill the gaps with a dedicated placeholder category instead of an existing label
# ('Undefined' would work just as well as 'New_one').
cleaner_categorical = SimpleImputer(strategy="constant", fill_value='New_one')
cleaner_categorical.fit(X)
Categorical_X = cleaner_categorical.transform(X)
print(Categorical_X)  # previously empty rows now read 'New_one'; existing labels such as 'A' and 'B' are untouched

[['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['New_one']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['New_one']
 ['New_one']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['New_one']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['New_one']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['New_one']
 ['A']
 ['A']
 ['A']
 ['A']
 ['A']
 ['B']
 ['A']


# Filling gaps in the numerical columns with KNN imputation

In [64]:
from sklearn.impute import KNNImputer
# Work on the first three (numerical) columns again, all rows.
X = df.iloc[:, :3]

# For each missing entry, KNNImputer looks up the 5 nearest rows and fills
# the gap with the average of their values for that column.
cleaner_KNN = KNNImputer(n_neighbors=5)
cleaner_KNN.fit_transform(X)


array([[ 13.9636,  10.38  , 122.8   ],
       [ 20.57  ,  17.77  , 132.9   ],
       [ 19.69  ,  21.25  , 130.    ],
       ...,
       [ 16.6   ,  28.08  , 108.3   ],
       [ 20.6   ,  29.33  , 140.1   ],
       [  7.76  ,  20.014 ,  47.92  ]], shape=(569, 3))

# KNN imputer with weights

In [None]:
# weights="distance": each of the 10 neighbours contributes in proportion to
# the inverse of its distance, so closer rows count more.
# The default is weights='uniform'; distances come from the imputer's metric
# (a NaN-aware Euclidean distance unless configured otherwise).
cleaner_weight = KNNImputer(weights="distance", n_neighbors=10)
cleaner_weight.fit_transform(X)

array([[ 15.51148032,  10.38      , 122.8       ],
       [ 20.57      ,  17.77      , 132.9       ],
       [ 19.69      ,  21.25      , 130.        ],
       ...,
       [ 16.6       ,  28.08      , 108.3       ],
       [ 20.6       ,  29.33      , 140.1       ],
       [  7.76      ,  24.25892267,  47.92      ]], shape=(569, 3))

# Using ColumnTransformer

In [67]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Reload the raw data so this section starts from the untouched file.
df = pd.read_csv('sample_dataset.csv')

# Split the columns into two groups by dtype.
non_numeric_dtypes = ['object', 'category', 'bool']

# categorical columns
categorical_variables = df.select_dtypes(include=non_numeric_dtypes).columns

# numerical columns (everything else)
numerical_variables = df.select_dtypes(exclude=non_numeric_dtypes).columns

In [68]:
# One imputer per column group: column means for the numerical columns,
# the most frequent label for the categorical ones.
# NOTE(review): numerical_variables also contains 'target', so the label column
# is mean-imputed together with the features -- confirm this is intended.
numeric_step = ('numerical_transformer', SimpleImputer(strategy='mean'), numerical_variables)
categorical_step = ('categorical_transformer', SimpleImputer(strategy='most_frequent'), categorical_variables)

cleaner = ColumnTransformer(transformers=[numeric_step, categorical_step])
cleaner.fit_transform(df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08758, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      shape=(569, 31), dtype=object)

In [None]:
# Same idea, but the mean imputer is limited to three numerical columns chosen
# by position; the categorical imputer still receives the single categorical column.
first_three_columns = [0, 1, 2]
cleaner = ColumnTransformer(transformers=[
    ('numerical_transformer', SimpleImputer(strategy='mean'), first_three_columns),
    ('categorical_transformer', SimpleImputer(strategy='most_frequent'), categorical_variables),
])
cleaner.fit_transform(df)

array([[14.059547717842323, 10.38, 122.8, 'A'],
       [20.57, 17.77, 132.9, 'A'],
       [19.69, 21.25, 130.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, 'A'],
       [20.6, 29.33, 140.1, 'A'],
       [7.76, 19.311829268292684, 47.92, 'A']],
      shape=(569, 4), dtype=object)

# make_column_selector

In [None]:
from sklearn.compose import make_column_selector
# Let make_column_selector pick the columns by dtype when the transformer is
# fitted, instead of passing precomputed column lists.
# The dtype list matches the one used for categorical_variables and
# numerical_variables, so 'category' and 'bool' columns would also be routed
# to the most-frequent imputer rather than to the mean imputer.
non_numeric_dtypes = ['object', 'category', 'bool']
cleaner = ColumnTransformer([
    # numerical columns: fill gaps with the column mean
    ('numerical_transformer', SimpleImputer(strategy='mean'),
     make_column_selector(dtype_exclude=non_numeric_dtypes)),
    # categorical columns: fill gaps with the most frequent label
    ('categorical_transformer', SimpleImputer(strategy='most_frequent'),
     make_column_selector(dtype_include=non_numeric_dtypes)),
])

cleaner.fit_transform(df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08758, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      shape=(569, 31), dtype=object)