In [5]:
import pandas as pd
import numpy as np

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, np.nan, 30, np.nan, 45],
    'Salary': [50000, 60000, np.nan, 80000, np.nan]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)


mean = df['Age'].mean()
print(f"Mean Age: {mean}")
df['Age'].fillna(mean, inplace=True)



medain = df['Salary'].median()
print(f"Median Salary: {medain}") 
df['Salary'].fillna(medain, inplace=True)



print(df)


Original DataFrame:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  60000.0
2  Charlie  30.0      NaN
3    David   NaN  80000.0
4      Eva  45.0      NaN
Mean Age: 33.333333333333336
Median Salary: 60000.0
      Name        Age   Salary
0    Alice  25.000000  50000.0
1      Bob  33.333333  60000.0
2  Charlie  30.000000  60000.0
3    David  33.333333  80000.0
4      Eva  45.000000  60000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(medain, inplace=True)


Label Encoding: Best for ordinal data or when the machine learning algorithm can handle integer-encoded categories without misinterpretation.​

One-Hot Encoding: Ideal for nominal data to prevent the model from assuming any ordinal relationship between categories.

🔢 Label Encoding with LabelEncoder
Label Encoding transforms categorical labels into integer values. This is particularly useful for encoding target variables or ordinal features.​
scikit-learn


In [1]:
from sklearn.preprocessing import LabelEncoder

# Sample data
labels = ['red', 'green', 'blue', 'green', 'red']

# Initialize the encoder
le = LabelEncoder()

# Fit and transform the labels
encoded_labels = le.fit_transform(labels)

print("Encoded Labels:", encoded_labels)
print("Classes:", le.classes_)

# To inverse transform
original_labels = le.inverse_transform(encoded_labels)
print("Original Labels:", original_labels)


Encoded Labels: [2 1 0 1 2]
Classes: ['blue' 'green' 'red']
Original Labels: ['red' 'green' 'blue' 'green' 'red']


One Hot Encoder

In [3]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample data
features = np.array([['red'], ['green'], ['blue'], ['green'], ['red']])

# Initialize the encoder
ohe = OneHotEncoder(sparse_output=False)

# Fit and transform the features
encoded_features = ohe.fit_transform(features)

print("Encoded Features:\n", encoded_features)
print("Categories:", ohe.categories_)


Encoded Features:
 [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
Categories: [array(['blue', 'green', 'red'], dtype='<U5')]
