In [1]:
import pandas as pd

In [2]:
data = pd.DataFrame({
    'Name':['Aayush','Bhanu','Bishal','Ashmita'],
    'Age':[22,None,24,21],
    'City':['Lahan','Dharan',None,'Jhapa'],
    'Gender':['Male','Male','Male','Female']
})

In [3]:
#drop rows with missing values
data.dropna()

Unnamed: 0,Name,Age,City,Gender
0,Aayush,22.0,Lahan,Male
3,Ashmita,21.0,Jhapa,Female


In [4]:
#drop columns with missing values
data.dropna(axis=1)

Unnamed: 0,Name,Gender
0,Aayush,Male
1,Bhanu,Male
2,Bishal,Male
3,Ashmita,Female


In [5]:
#fill missing age with mean
data['Age'].fillna(data['Age'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].mean(),inplace=True)


In [6]:
#fill missing city with most frequent value
data['City'].fillna(data['City'].mode()[0],inplace=True)

In [8]:
#for ML pipelines ,use simpleimputer from sklearn.impute
from sklearn.impute import SimpleImputer

# For numerical
num_imputer = SimpleImputer(strategy='mean')
data[['Age']] = num_imputer.fit_transform(data[['Age']])

# For categorical
cat_imputer = SimpleImputer(strategy='most_frequent')
data[['City']] = cat_imputer.fit_transform(data[['City']])


In [9]:
#label encoding
from sklearn.preprocessing import LabelEncoder
data['City']=LabelEncoder().fit_transform(data['City'])

In [10]:
data.head()

Unnamed: 0,Name,Age,City,Gender
0,Aayush,22.0,2,Male
1,Bhanu,22.333333,0,Male
2,Bishal,24.0,0,Male
3,Ashmita,21.0,1,Female


In [None]:
#one hot encoding


# df = pd.get_dummies(df, columns=['City'])


In [13]:
#ordinal encoding
data['Size'] = ['S', 'M', 'L','XL']
size_mapping = {'S': 0, 'M': 1, 'L': 2,'XL':3}
data['Size'] = data['Size'].map(size_mapping)


In [14]:
#min-max scaling (normalization)
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()
data[['Age']] = scalar.fit_transform(data[['Age']])

In [15]:
data.head()

Unnamed: 0,Name,Age,City,Gender,Size
0,Aayush,0.333333,2,Male,0
1,Bhanu,0.444444,0,Male,1
2,Bishal,1.0,0,Male,2
3,Ashmita,0.0,1,Female,3


In [16]:
#standardization
from sklearn.preprocessing import StandardScaler
scalar =StandardScaler()
data[['Age']] =scalar.fit_transform(data[['Age']])

In [17]:
data.head()

Unnamed: 0,Name,Age,City,Gender,Size
0,Aayush,-0.3086067,2,Male,0
1,Bhanu,-1.23344e-15,0,Male,1
2,Bishal,1.543033,0,Male,2
3,Ashmita,-1.234427,1,Female,3


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_features = ['Age']
cat_features = ['City']

# Pipeline for numerical
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Pipeline for categorical
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

# Combine using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

processed = preprocessor.fit_transform(data)


In [20]:
data.head()

Unnamed: 0,Name,Age,City,Gender,Size
0,Aayush,-0.3086067,2,Male,0
1,Bhanu,-1.23344e-15,0,Male,1
2,Bishal,1.543033,0,Male,2
3,Ashmita,-1.234427,1,Female,3
