In [84]:
import numpy as np
import pandas as pd

In [85]:
#Sample Dataset

data = {
    'A':[1,2,np.nan,4,5],
    'B':[5,np.nan,np.nan,8,10],
    'C':['cat','dog','cat',np.nan,'dog']
}

df = pd.DataFrame(data)
print(df)

     A     B    C
0  1.0   5.0  cat
1  2.0   NaN  dog
2  NaN   NaN  cat
3  4.0   8.0  NaN
4  5.0  10.0  dog


In [86]:
#Removing row with missing values

df_drop = df.dropna()
print(df_drop)

     A     B    C
0  1.0   5.0  cat
4  5.0  10.0  dog


In [87]:
#Removing Columns with missing values
df_drop_col = df.dropna(axis=1)
print(df_drop_col)

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [88]:
# fill with a constant value

df_fill = df.fillna("Fill..")
print(df_fill)

        A       B       C
0     1.0     5.0     cat
1     2.0  Fill..     dog
2  Fill..  Fill..     cat
3     4.0     8.0  Fill..
4     5.0    10.0     dog


In [89]:
# Imputing with mean for numerical columns
df['A']=df['A'].fillna(df['A'].mean())
df['B']=df['B'].fillna(df['B'].mean())
print(df)

     A          B    C
0  1.0   5.000000  cat
1  2.0   7.666667  dog
2  3.0   7.666667  cat
3  4.0   8.000000  NaN
4  5.0  10.000000  dog


In [90]:
# Imputing with mode for categorical columns
df['C']=df['C'].fillna(df['C'].mode()[0])
print(df)

     A          B    C
0  1.0   5.000000  cat
1  2.0   7.666667  dog
2  3.0   7.666667  cat
3  4.0   8.000000  cat
4  5.0  10.000000  dog


In [91]:
from sklearn.impute import SimpleImputer
#Imputing numerical columns with mean 
numerical_imputer = SimpleImputer (strategy='mean') 
df[['A', 'B']] = numerical_imputer.fit_transform(df[['A', 'B']])

print(df)

# Imputing categorical columns with most frequent (mode)
categorical_imputer = SimpleImputer(strategy='most_frequent')

df[['C']] = categorical_imputer.fit_transform(df[['C']])
print(df)

     A          B    C
0  1.0   5.000000  cat
1  2.0   7.666667  dog
2  3.0   7.666667  cat
3  4.0   8.000000  cat
4  5.0  10.000000  dog
     A          B    C
0  1.0   5.000000  cat
1  2.0   7.666667  dog
2  3.0   7.666667  cat
3  4.0   8.000000  cat
4  5.0  10.000000  dog


In [92]:
# Iterative Imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
iterative_imputer = IterativeImputer()
df[['A','B']] = iterative_imputer.fit_transform(df[['A','B']])
df[['A','B']]

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,7.666667
2,3.0,7.666667
3,4.0,8.0
4,5.0,10.0


In [93]:
from sklearn.preprocessing import MinMaxScaler
data = {'Feature1' : [10,20,30,40,50], 'Feature2' : [100,200,300,400,500]}
df = pd.DataFrame(data)

#Normalization

scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

print(df_normalized)

   Feature1  Feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00


Standardization

In [94]:
from sklearn.preprocessing import StandardScaler

# Sample Dataset
data = {'Feature1' : [10,20,30,40,50], 'Feature2' : [100,200,300,400,500]}
df = pd.DataFrame(data)

#Standardization
scaler = StandardScaler()
df_Standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

print(df_Standardized)

   Feature1  Feature2
0 -1.414214 -1.414214
1 -0.707107 -0.707107
2  0.000000  0.000000
3  0.707107  0.707107
4  1.414214  1.414214


Data Splitting 

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Sample Dataset
data = {
    'Feature1' : [10,20,30,40,50], 
    'Feature2' : [100,200,300,400,500],
    'Target' : [1,0,1,0,1]}

df = pd.DataFrame(data)
print("Original Dataset")
print(df)

Original Dataset
   Feature1  Feature2  Target
0        10       100       1
1        20       200       0
2        30       300       1
3        40       400       0
4        50       500       1


In [96]:
#Splitting the dataset 
X=df.drop('Target', axis=1) #features 
y=df['Target'] #Target variable

X_train,X_test,y_train,y_test= train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Features:")
print(X_train)
print("Test Features:")
print(X_test)

Training Features:
   Feature1  Feature2
4        50       500
2        30       300
0        10       100
3        40       400
Test Features:
   Feature1  Feature2
1        20       200
