In [None]:
import pandas as pd
import numpy as np

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, np.nan, 30, np.nan, 45],
    'Salary': [50000, 60000, np.nan, 80000, np.nan]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)


mean = df['Age'].mean()
print(f"Mean Age: {mean}")
df['Age'].fillna(mean, inplace=True)
                        


medain = df['Salary'].median()
print(f"Median Salary: {medain}") 
df['Salary'].fillna(medain, inplace=True)



print(df)


Original DataFrame:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  60000.0
2  Charlie  30.0      NaN
3    David   NaN  80000.0
4      Eva  45.0      NaN
Mean Age: 33.333333333333336
Median Salary: 60000.0
      Name        Age   Salary
0    Alice  25.000000  50000.0
1      Bob  33.333333  60000.0
2  Charlie  30.000000  60000.0
3    David  33.333333  80000.0
4      Eva  45.000000  60000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(medain, inplace=True)


Label Encoding: Best for ordinal data or when the machine learning algorithm can handle integer-encoded categories without misinterpretation.​

One-Hot Encoding: Ideal for nominal data to prevent the model from assuming any ordinal relationship between categories.

🔢 Label Encoding with LabelEncoder
Label Encoding transforms categorical labels into integer values. This is particularly useful for encoding target variables or ordinal features.​
scikit-learn


In [1]:
from sklearn.preprocessing import LabelEncoder

# Sample data
labels = ['red', 'green', 'blue', 'green', 'red']

# Initialize the encoder
le = LabelEncoder()

# Fit and transform the labels
encoded_labels = le.fit_transform(labels)

print("Encoded Labels:", encoded_labels)
print("Classes:", le.classes_)

# To inverse transform
original_labels = le.inverse_transform(encoded_labels)
print("Original Labels:", original_labels)


Encoded Labels: [2 1 0 1 2]
Classes: ['blue' 'green' 'red']
Original Labels: ['red' 'green' 'blue' 'green' 'red']


One Hot Encoder

In [3]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample data
features = np.array([['red'], ['green'], ['blue'], ['green'], ['red']])

# Initialize the encoder
ohe = OneHotEncoder(sparse_output=False)

# Fit and transform the features
encoded_features = ohe.fit_transform(features)

print("Encoded Features:\n", encoded_features)
print("Categories:", ohe.categories_)


Encoded Features:
 [[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
Categories: [array(['blue', 'green', 'red'], dtype='<U5')]


✅ Label Encoding
When to Use:

Ordinal Data: Use Label Encoding when the categorical variable has a natural order or ranking. For example, educational levels like 'High School', 'Bachelor', 'Master', 'PhD' have a clear hierarchy.​

Target Variables: Label Encoding is often used for target variables in classification tasks, especially when the categories are ordinal.​

Why Use:

Memory Efficiency: Label Encoding is more memory-efficient as it assigns a unique integer to each category without increasing the dimensionality of the dataset.​

Model Compatibility: Some algorithms, like decision trees and random forests, can handle label-encoded data effectively without misinterpreting the numeric relationships between categories.​

✅ One-Hot Encoding
When to Use:

Nominal Data: Use One-Hot Encoding when the categorical variable does not have any intrinsic order. For example, colors like 'Red', 'Green', 'Blue' are nominal and should be one-hot encoded.​

Linear Models and Neural Networks: These models often perform better with one-hot encoded data, especially when the categorical variable has no inherent order.​

Why Use:

Avoiding Implicit Ordering: One-Hot Encoding prevents the model from assuming any ordinal relationship between categories, which could lead to incorrect interpretations.​

Model Performance: For algorithms that assume numerical relationships between features, One-Hot Encoding ensures that each category is treated equally, improving model performance.

## Normalization vs Standardization

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Sample Data
data = pd.DataFrame({
    'Feature1': [10, 20, 30, 40, 50],
    'Feature2': [100, 200, 300, 400, 500]
})
minmax_scaler = MinMaxScaler()
noramalize_data = minmax_scaler.fit_transform(data)
noramalize_data = minmax_scaler.fit_transform(data)
normalized_df = pd.DataFrame(noramalize_data , columns=data.columns)
print("🔹 Normalized Data (MinMaxScaler):\n", normalized_df)


# StandardScaler: Standardization
standard_scaler = StandardScaler()
standardized_data = standard_scaler.fit_transform(data)
standardized_df = pd.DataFrame(standardized_data, columns=data.columns)
print("\n🔹 Standardized Data (StandardScaler):\n", standardized_df)

🔹 Normalized Data (MinMaxScaler):
    Feature1  Feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00

🔹 Standardized Data (StandardScaler):
    Feature1  Feature2
0 -1.414214 -1.414214
1 -0.707107 -0.707107
2  0.000000  0.000000
3  0.707107  0.707107
4  1.414214  1.414214


Detect and remove outliers using IQR and Z-score methods.

In [1]:
import pandas as pd

data = {'values': [10, 12, 14, 15, 13, 16, 12, 11, 150, 14, 13, 10]}
df = pd.DataFrame(data)
print(df)


    values
0       10
1       12
2       14
3       15
4       13
5       16
6       12
7       11
8      150
9       14
10      13
11      10


In [4]:
Q1 = df['values'].quantile(0.25)
Q3 = df['values'].quantile(0.75)

IQR = Q3 - Q1
print(IQR)

2.5


In [8]:
# Step 2: Define lower and upper bounds
lowerbound = Q1 - 1.5 * IQR
upperbound = Q3 + 1.5 * IQR

# Step 3: Filter out the outliers
df_iqr_filter = df[(df['values']>= lowerbound)& (df['values'] <= upperbound)]

# print("IQR Outliers Removed:")
print(df_iqr_filter)

    values
0       10
1       12
2       14
3       15
4       13
5       16
6       12
7       11
9       14
10      13
11      10


In [12]:
from scipy import stats
import numpy as np

# Step 1: Calculate Z-scores
z_scores = np.abs(stats.zscore(df['values']))

# Step 2: Define a threshold (commonly 3)
threshold = 2

# Step 3: Filter out values with Z-score > threshold
df_z_filtered = df[(z_scores < threshold)]

print("Z-score Outliers Removed:")
print(df_z_filtered)


Z-score Outliers Removed:
    values
0       10
1       12
2       14
3       15
4       13
5       16
6       12
7       11
9       14
10      13
11      10


In [1]:
import pandas as pd

# Sample DataFrame
data = {
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Age': [23, 25, 31, 22, 29],
    'Salary': [50000, 60000, 75000, 52000, 68000]
}

df = pd.DataFrame(data)
print(df)


   Gender  Age  Salary
0    Male   23   50000
1  Female   25   60000
2  Female   31   75000
3    Male   22   52000
4  Female   29   68000


In [7]:
mean_df = df.groupby('Gender').mean(numeric_only=True)
print(mean_df)


              Age        Salary
Gender                         
Female  28.333333  67666.666667
Male    22.500000  51000.000000


In [None]:
median_df = df.groupby('Gender').median(numeric_only=True)
print(median_df)

         Age   Salary
Gender               
Female  29.0  68000.0
Male    22.5  51000.0


## Write code to impute missing values using KNN imputer.

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# Sample data banayi hai jisme kuch values missing hain
data = {
    'age': [25, 27, np.nan, 35, 40],
    'salary': [50000, 54000, 58000, np.nan, 62000],
    'experience': [2, 3, 4, 5, np.nan]
}
df = pd.DataFrame(data)
print("Original DataFrame with missing values:\n", df)


Original DataFrame with missing values:
     age   salary  experience
0  25.0  50000.0         2.0
1  27.0  54000.0         3.0
2   NaN  58000.0         4.0
3  35.0      NaN         5.0
4  40.0  62000.0         NaN


In [4]:
imputer = KNNImputer(n_neighbors=2)
#👉 n_neighbors=2 ka matlab: jab koi value missing milegi, to usko bharne ke liye 2 sabse similar rows dekhi jaayengi.

In [6]:
dfimput = imputer.fit_transform(df)
dfimput = pd.DataFrame(dfimput,columns=df.columns)
print("\nDataFrame after KNN Imputation:\n", dfimput)


DataFrame after KNN Imputation:
     age   salary  experience
0  25.0  50000.0         2.0
1  27.0  54000.0         3.0
2  31.0  58000.0         4.0
3  35.0  60000.0         5.0
4  40.0  62000.0         4.5
