In [48]:
import pandas as pd #
import numpy as np  # Random() function used later
n=1000
df=pd.DataFrame({
    "Student_ID": np.arange(101, 101+n),
    "Age": np.random.randint(18, 24,size=n),
    "Marks": np.random.randint(50, 100,size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AR"], size=n)
})

In [43]:
# Introduce Missing Values in Age, Marks, Department
for col in["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan  # Introduce 5% missing values
df  


Unnamed: 0,Student_ID,Age,Marks,Department


In [46]:
#Introdece Duplicate records/Rows
pd.concat([df, df.sample(20)], ignore_index=True) # Duplicate 20 random rows

#Introduce invalid ages
df.loc[df.sample(10).index, "Age"] = np.random.choice([-5, 240], size=10)  
df



ValueError: a must be greater than 0 unless no samples are taken

In [16]:
# Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].sample(10).str.lower()
df


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,22.0,58.0,
1,102,22.0,94.0,
2,103,22.0,83.0,ME
3,104,20.0,52.0,ECE
4,105,22.0,79.0,
...,...,...,...,...
995,1096,20.0,62.0,ECE
996,1097,22.0,75.0,AR
997,1098,18.0,64.0,
998,1099,21.0,84.0,AIML


In [19]:
df=df.copy()
df.columns.str.strip()#Making Standarised Column
df.columns.str.lower()

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [18]:
df.columns.str.replace(" " ,"_")

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [24]:
#Enforcing Correct Data type
df["Age"] = pd.to_numeric(df["Age"], errors ="coerce")
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,22.0,58.0,
1,102,22.0,94.0,
2,103,22.0,83.0,ME
3,104,20.0,52.0,ECE
4,105,22.0,79.0,
...,...,...,...,...
995,1096,20.0,62.0,ECE
996,1097,22.0,75.0,AR
997,1098,18.0,64.0,
998,1099,21.0,84.0,AIML


In [26]:
#Handling Missing Values
df["Age"].fillna(df["Age"].median(), inplace=True)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,22.0,58.0,
1,102,22.0,94.0,
2,103,22.0,83.0,ME
3,104,20.0,52.0,ECE
4,105,22.0,79.0,
...,...,...,...,...
995,1096,20.0,62.0,ECE
996,1097,22.0,75.0,AR
997,1098,18.0,64.0,
998,1099,21.0,84.0,AIML


In [27]:
#Handling Duplicates
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,22.0,58.0,
1,102,22.0,94.0,
2,103,22.0,83.0,ME
3,104,20.0,52.0,ECE
4,105,22.0,79.0,
...,...,...,...,...
995,1096,20.0,62.0,ECE
996,1097,22.0,75.0,AR
997,1098,18.0,64.0,
998,1099,21.0,84.0,AIML


In [38]:
#validation apply on column 
valid_Department = ["CSE","AIML","IT","ECE","ME"]


df=df[df["Department"].isin(valid_Department)]
df

Unnamed: 0,Student_ID,Age,Marks,Department
2,103,22.0,83.0,ME
3,104,20.0,52.0,ECE
6,107,18.0,64.0,ME
8,109,20.0,80.0,ME
9,110,19.0,64.0,AIML
...,...,...,...,...
991,1092,21.0,,ECE
993,1094,19.0,57.0,ME
995,1096,20.0,62.0,ECE
998,1099,21.0,84.0,AIML


In [45]:
df=df[df["Marks"].between(0,100)]
df

Unnamed: 0,Student_ID,Age,Marks,Department


In [49]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,56,ECE
1,102,22,94,ECE
2,103,21,99,ECE
3,104,22,58,CSE
4,105,19,67,ECE
...,...,...,...,...
995,1096,18,54,CSE
996,1097,22,61,ME
997,1098,19,57,AIML
998,1099,23,77,CSE


In [54]:
df=df[df["Age"].between(0,19)]
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,56,ECE
12,113,18,72,CSE
20,121,18,50,ECE
28,129,18,53,CSE
29,130,18,58,CSE
...,...,...,...,...
972,1073,18,70,ME
974,1075,18,83,AR
979,1080,18,67,AR
990,1091,18,67,AIML
