In [1]:
# Ques_1.ipynb – Handling Missing Data

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, np.nan, 35, 45, np.nan],
    'Gender': ['F', 'M', np.nan, 'M', 'F'],
    'Salary': [50000, 60000, np.nan, 80000, 90000],
    'JoinDate': pd.to_datetime(['2020-01-01', '2020-06-15', '2021-03-20', '2021-07-30', '2022-01-10'])
}

df = pd.DataFrame(data)

# 1. Introduction to Missing Data in a DataFrame
print("Missing Values in Each Column:\n", df.isnull().sum())

# 2. Dropping Rows with Missing Values
df_dropped_rows = df.dropna()
print("\nDataFrame after Dropping Rows with Missing Values:\n", df_dropped_rows)

# 3. Dropping Columns with Missing Values
df_dropped_columns = df.dropna(axis=1)
print("\nDataFrame after Dropping Columns with Missing Values:\n", df_dropped_columns)

# 4. Mean Imputation for Numerical Data (Age)
df_mean_imputed = df.copy()
df_mean_imputed['Age'].fillna(df_mean_imputed['Age'].mean(), inplace=True)
print("\nDataFrame after Mean Imputation for 'Age':\n", df_mean_imputed)

# 5. Mode Imputation for Categorical Data (Gender)
df_mode_imputed = df.copy()
df_mode_imputed['Gender'].fillna(df_mode_imputed['Gender'].mode()[0], inplace=True)
print("\nDataFrame after Mode Imputation for 'Gender':\n", df_mode_imputed)

# 6. Median Imputation for Skewed Data (Salary)
df_median_imputed = df.copy()
df_median_imputed['Salary'].fillna(df_median_imputed['Salary'].median(), inplace=True)
print("\nDataFrame after Median Imputation for 'Salary':\n", df_median_imputed)

# 7. KNN Imputation
df_knn = df.copy()
# Selecting numerical columns for KNN Imputer
numerical_cols = ['Age', 'Salary']
imputer = KNNImputer(n_neighbors=2)
df_knn[numerical_cols] = imputer.fit_transform(df_knn[numerical_cols])
print("\nDataFrame after KNN Imputation:\n", df_knn)

# 8. Detecting and Handling Missing Categorical Data (Gender)
df_categorical = df.copy()
# Calculate the mode
gender_mode = df_categorical['Gender'].mode()
# If multiple modes exist, select the second most frequent
if len(gender_mode) > 1:
    gender_counts = df_categorical['Gender'].value_counts()
    second_mode = gender_counts.index[1]
    df_categorical['Gender'].fillna(second_mode, inplace=True)
else:
    df_categorical['Gender'].fillna(gender_mode[0], inplace=True)
print("\nDataFrame after Handling Missing Categorical Data:\n", df_categorical)

# 9. Predictive Modeling for Imputation (Predicting Age)
df_predictive = df.copy()
# Separate rows with and without missing Age
df_known = df_predictive[df_predictive['Age'].notnull()]
df_unknown = df_predictive[df_predictive['Age'].isnull()]
# Features for prediction
features = ['Salary']
# Ensure no missing values in features
df_known = df_known.dropna(subset=features)
df_unknown = df_unknown.dropna(subset=features)
# Train Linear Regression model
model = LinearRegression()
model.fit(df_known[features], df_known['Age'])
# Predict missing Age values
predicted_ages = model.predict(df_unknown[features])
df_predictive.loc[df_predictive['Age'].isnull(), 'Age'] = predicted_ages
print("\nDataFrame after Predictive Modeling for 'Age':\n", df_predictive)

# 10. Handling Time Series Data with Forward and Backward Fill
df_time_series = df.copy()
# Introduce missing values in 'Salary' for demonstration
df_time_series.loc[2, 'Salary'] = np.nan
df_time_series.sort_values('JoinDate', inplace=True)
# Forward fill
df_time_series_ffill = df_time_series.fillna(method='ffill')
print("\nDataFrame after Forward Fill:\n", df_time_series_ffill)
# Backward fill
df_time_series_bfill = df_time_series.fillna(method='bfill')
print("\nDataFrame after Backward Fill:\n", df_time_series_bfill)

Missing Values in Each Column:
 Name        0
Age         2
Gender      1
Salary      1
JoinDate    0
dtype: int64

DataFrame after Dropping Rows with Missing Values:
     Name   Age Gender   Salary   JoinDate
0  Alice  25.0      F  50000.0 2020-01-01
3  David  45.0      M  80000.0 2021-07-30

DataFrame after Dropping Columns with Missing Values:
       Name   JoinDate
0    Alice 2020-01-01
1      Bob 2020-06-15
2  Charlie 2021-03-20
3    David 2021-07-30
4      Eve 2022-01-10

DataFrame after Mean Imputation for 'Age':
       Name   Age Gender   Salary   JoinDate
0    Alice  25.0      F  50000.0 2020-01-01
1      Bob  35.0      M  60000.0 2020-06-15
2  Charlie  35.0    NaN      NaN 2021-03-20
3    David  45.0      M  80000.0 2021-07-30
4      Eve  35.0      F  90000.0 2022-01-10

DataFrame after Mode Imputation for 'Gender':
       Name   Age Gender   Salary   JoinDate
0    Alice  25.0      F  50000.0 2020-01-01
1      Bob   NaN      M  60000.0 2020-06-15
2  Charlie  35.0      F      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mean_imputed['Age'].fillna(df_mean_imputed['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mode_imputed['Gender'].fillna(df_mode_imputed['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never wo