In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the attendance averages from the CSV file; the path is resolved
# relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='attendance_avg.csv')

In [3]:
# Handling Missing Values

# Option 1: keep only the rows in which every column has a value
# (a row is dropped as soon as any one of its cells is missing).
df_cleaned = df.dropna(axis='index', how='any')

In [4]:
# Option 2: replace missing values with the column mean.
# The means are computed for numeric columns only, so fillna() has no
# replacement value for the other columns and leaves them as they are.
df_filled = df.fillna(
    value=df.select_dtypes(include='number').mean()
)

In [5]:
# Option 3: replace missing values with the column median.
# The medians are computed for numeric columns only, so fillna() has no
# replacement value for the other columns and leaves them as they are.
df_filled_median = df.fillna(
    value=df.select_dtypes(include='number').median()
)

In [6]:
# Option 4: Fill missing values with the mode (for categorical columns)
def _fill_with_mode(column):
    """Return ``column`` with missing values replaced by its most frequent value.

    Only object-dtype columns are imputed; every other column is returned
    unchanged. An object column that has no non-missing values has an empty
    mode, so it is returned unchanged as well instead of failing on the
    lookup of the first mode.
    """
    if column.dtype != 'O':
        return column
    modes = column.mode()
    if modes.empty:
        # Nothing to impute with: the column holds only missing values.
        return column
    # Several values may tie for the mode; use the first one, which is what
    # the previous `[0]` lookup selected.
    return column.fillna(modes.iloc[0])


df_filled_mode = df.apply(_fill_with_mode)

In [7]:
# Handling Outliers

# Interquartile Range (IQR) method: take the 25th and 75th percentiles of
# every numeric column, then their difference.
Q1, Q3 = (
    df.select_dtypes(include=np.number).quantile(level)
    for level in (0.25, 0.75)
)
IQR = Q3.sub(Q1)

In [8]:
# Outlier fences: values more than 1.5 * IQR below Q1 or above Q3 are
# treated as outliers. Both bounds are per-column Series.
lower_bound = Q1.sub(IQR.mul(1.5))
upper_bound = Q3.add(IQR.mul(1.5))

In [9]:
# Flag outliers: a boolean frame over the numeric columns that is True
# wherever a value lies below its column's lower bound or above its
# column's upper bound.
outliers = df.select_dtypes(include=np.number).pipe(
    lambda values: (values < lower_bound) | (values > upper_bound)
)

In [10]:
# Option 1: Remove outliers
# Keep a row only when none of its numeric values is below the lower bound
# and none is above the upper bound. The negated comparisons (rather than
# >= / <=) keep rows with missing values, which are not flagged as outliers.
df_no_outliers = df.loc[
    df.select_dtypes(include=np.number)
    .pipe(lambda values: ~(values < lower_bound) & ~(values > upper_bound))
    .all(axis=1)
]

In [11]:
# Option 2: Cap outliers to the lower and upper bounds
df_capped = df.copy()
numeric_df = df.select_dtypes(include=np.number) # Select numeric columns for comparison

# Clip every numeric column to its own [lower_bound, upper_bound] range and
# replace the column as a whole. The bounds are floats, so writing them into
# an integer column through a masked `.loc` assignment is an incompatible-dtype
# write; replacing the whole column lets pandas pick a suitable dtype.
# Missing values are left untouched by clip().
for col in numeric_df.columns:
    df_capped[col] = numeric_df[col].clip(
        lower=lower_bound[col],
        upper=upper_bound[col],
    )


In [12]:
# Option 3: Impute outliers with mean/median
# Work on the numeric columns only: `outliers` is a boolean frame over exactly
# those columns, so masking the numeric sub-frame avoids a partial-column
# boolean assignment on the full mixed-dtype frame.
numeric_df = df.select_dtypes(include=np.number)
numeric_masked = numeric_df.mask(outliers)  # outliers become NaN

# Compute the replacement mean AFTER masking, so the outliers being replaced
# do not distort the value that replaces them.
replacement_means = numeric_masked.mean()

# Fill both the masked outliers and any originally missing numeric values.
# Non-numeric columns are copied over unchanged.
df_imputed_outliers = df.copy()
for col in numeric_df.columns:
    df_imputed_outliers[col] = numeric_masked[col].fillna(replacement_means[col])

In [13]:
# Show the untouched input next to the frame whose numeric gaps and
# outliers were replaced by column means.
print("Original DataFrame:\n", df, sep=" ")
print(
    "DataFrame after handling missing values and outliers:\n",
    df_imputed_outliers,
    sep=" ",
)

Original DataFrame:
      SAP ID     ROLL ID            NAME   Avg
0  50002354  R211235456   Pranav Sharma  83.8
1  50003546  R215665563    Ranjan Kumar  63.4
2  50007894  R218999654  Riya Srivastav  78.0
3  50003156  R212256653     Aditi Tamta  83.4
4  50007235  R218656723   Mehak Kandpal  94.4
5  50002324  R212544582     Sajal Suyal  69.2
6  50008943  R218946821  Lokesh Kapkoti  75.7
7  50003013  R219756894   Diwakar Bisht  69.0
8  50002364  R218656595  Devank Rathore  81.2
DataFrame after handling missing values and outliers:
      SAP ID     ROLL ID            NAME   Avg
0  50002354  R211235456   Pranav Sharma  83.8
1  50003546  R215665563    Ranjan Kumar  63.4
2  50007894  R218999654  Riya Srivastav  78.0
3  50003156  R212256653     Aditi Tamta  83.4
4  50007235  R218656723   Mehak Kandpal  94.4
5  50002324  R212544582     Sajal Suyal  69.2
6  50008943  R218946821  Lokesh Kapkoti  75.7
7  50003013  R219756894   Diwakar Bisht  69.0
8  50002364  R218656595  Devank Rathore  81.2
