In [16]:
import pandas as pd #
import numpy as np  # Random() function used later

In [17]:
np.random.seed(41)
n=1000
df=pd.DataFrame({
    "Student_ID": np.arange(101, 101+n),
    "Age": np.random.randint(18, 24,size=n),
    "Marks": np.random.randint(50, 100,size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AR"], size=n)
})

In [18]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME
...,...,...,...,...
995,1096,21,88,AR
996,1097,21,80,ECE
997,1098,20,51,CSE
998,1099,21,53,AR


In [19]:
# Introduce Missing Values in Age, Marks, Department
for col in["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan  # Introduce 5% missing values
df 

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [20]:
#Introdece Duplicate records/Rows
pd.concat([df, df.sample(20)], ignore_index=True) # Duplicate 20 random rows

#Introduce invalid ages
df.loc[df.sample(10).index, "Age"] = np.random.choice([-5, 240], size=10)  
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [21]:
#Introduce Outliers
df.loc[df.sample(10).index, "Marks"] = np.random.choice([150, 300], size=10)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [22]:
# Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].sample(10).str.lower()
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [23]:
df=df.copy()  # To avoid SettingWithCopyWarning

In [24]:
#column name standarizedization
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    )

In [25]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [26]:
# Enforcing correct Data Types
df["age"] = pd.to_numeric(df["age"], errors='coerce')
df["marks"] = pd.to_numeric(df["marks"], errors='coerce')   
df["department"] = df["department"].astype(str)

In [27]:
# Handling Missing Values
#df['age'] = df['age'].fillna(df['age'].median())
#df['marks'] = df['marks'].fillna(df['marks'].median())      
#df["age"].fillna(df["age"].median(), inplace=True)
#df.method({col: value}, inplace=True)
#df["marks"].fillna(df["marks"].median(), inplace=True)
#df["department"].fillna("Unknown", inplace=True)    

In [28]:
# Handle Duplicates
df.drop_duplicates(inplace=True)

In [29]:
valid_departments = ["AIML", "ECE", "ME", "CSE", "AR"]
df["department"].isin(valid_departments)

0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: department, Length: 1000, dtype: bool

In [31]:
valid_departments = ["AIML", "ECE", "ME", "CSE", "AR"]
df = df[df["department"].isin(valid_departments)]

In [33]:
df

Unnamed: 0,student_id,age,marks,department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [35]:
df[df["age"].between(18, 23)]

Unnamed: 0,student_id,age,marks,department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
994,1095,19.0,55.0,ME
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
998,1099,21.0,53.0,AR


In [37]:
df[df["marks"].between(0,100)]

Unnamed: 0,student_id,age,marks,department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR
