In [22]:
import pandas as pd
import numpy as np

In [23]:
# Build a small, clean synthetic student table.
# The global NumPy seed is fixed first so every rerun draws the same values.
np.random.seed(41)
n = 1000

# Start from the sequential IDs, then attach the random columns. Keyword
# arguments are evaluated left to right, so the draws happen in the order
# Age -> Marks -> Department.
df = pd.DataFrame({"Student_ID": np.arange(101, 101 + n)}).assign(
    Age=np.random.randint(18, 25, size=n),      # integers 18..24
    Marks=np.random.randint(50, 100, size=n),   # integers 50..99
    Department=np.random.choice(
        ["AIML", "CSE", "ECE", "MECH", "AUTOMATION"], size=n
    ),
)


In [24]:
# Inject the kinds of dirt that the cleaning cells further down deal with.
# The steps consume random draws one after another, so reordering them
# changes which rows end up affected.

# Age and Marks start out as integer columns. Cast them to float up front so
# that writing NaN into them is an explicit dtype choice instead of a silent
# upcast (pandas deprecated that implicit upcast in setitem operations).
df = df.astype({"Age": "float64", "Marks": "float64"})

# Introduce missing values in Age, Marks and Department (half the rows each)
for col in ["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.5).index, col] = np.nan

# Duplicate records/rows: append 20 randomly chosen existing rows
df = pd.concat([df, df.sample(20)], ignore_index=True)

# Introduce invalid (negative) ages
df.loc[df.sample(10).index, "Age"] = np.random.choice([-5, -10, -15], size=10)

# Introduce outliers in Marks
df.loc[df.sample(10).index, "Marks"] = np.random.choice([0, 150, 200], size=10)

# Inconsistent text formatting: lowercase Department on a 10% sample of rows.
# `.str.lower()` passes NaN through unchanged, so the column is deliberately
# NOT cast with astype(str) first: that cast would turn every missing
# department into the literal string "nan" and hide it from isna()/fillna().
dept_idx = df.sample(frac=0.1).index
df.loc[dept_idx, "Department"] = df.loc[dept_idx, "Department"].str.lower()


In [25]:
# Show the dirtied frame; pandas truncates the display of a long frame to its
# first and last rows.
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,82.0,AUTOMATION
1,102,21.0,,MECH
2,103,,,CSE
3,104,,68.0,
4,105,,85.0,
...,...,...,...,...
1015,428,,,ECE
1016,618,24.0,86.0,
1017,480,18.0,,
1018,684,,79.0,


In [26]:
# Rebind df to a deep copy of itself (deep=True is the pandas default).
df = df.copy(deep=True)

In [27]:
# Standardize column names, one transformation per step.
trimmed_names = df.columns.str.strip()               # drop leading/trailing spaces
lowered_names = trimmed_names.str.lower()            # lowercase everything
df.columns = lowered_names.str.replace(' ', '_')     # inner spaces -> underscores

In [28]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [29]:
import re

# Enforcing correct data types.
# Step 1 - scrub the headers: keep only letters, digits and underscores,
# squeeze runs of underscores into one, trim underscores from both ends,
# then lowercase.
headers = df.columns.astype(str)
headers = headers.str.replace(r'[^0-9a-zA-Z_]', '', regex=True)
headers = headers.str.replace(r'_+', '_', regex=True)
headers = headers.str.strip('_')
df.columns = headers.str.lower()

# Step 2 - map fuzzy header names onto canonical ones. Fragments are tried in
# the order listed and the first fragment found inside the letters-only form
# of a header decides its canonical name; headers matching none are left alone.
canonical_by_fragment = (
    ('age', 'age'),
    ('mark', 'marks'),
    ('student', 'student_id'),
    ('depart', 'department'),
)
col_map = {}
for col_name in df.columns:
    letters_only = re.sub(r'[^a-z]', '', col_name.lower())
    for fragment, canonical in canonical_by_fragment:
        if fragment in letters_only:
            col_map[col_name] = canonical
            break
if col_map:
    df = df.rename(columns=col_map)

# Step 3 - coerce the numeric columns; unparseable values become NaN.
df["age"] = pd.to_numeric(df["age"], errors='coerce')
df["marks"] = pd.to_numeric(df["marks"], errors='coerce')


In [30]:
# Handling Missing Values
# Fill missing ages with the column median. The filled Series returned by
# fillna() is assigned back directly; combining the assignment with
# inplace=True would store the None that an in-place fillna() returns and
# wipe out the whole column.
# NOTE(review): the negative ages injected earlier are still present at this
# point and take part in the median - consider cleaning them first.
df["age"] = df["age"].fillna(df["age"].median())

In [31]:
# Handling Duplicates
# Keep the first occurrence of each fully identical row. drop_duplicates()
# returns the de-duplicated frame, so it is assigned back as-is; passing
# inplace=True would make the call return None and rebind df to None.
df = df.drop_duplicates()

In [32]:
# Department codes accepted by is_valid_department().
# NOTE(review): the synthetic data in this notebook is drawn from
# AIML / CSE / ECE / MECH / AUTOMATION, so only CSE and ECE overlap with this
# list - confirm that the mismatch is intended.
valid_departments = [
    "CSE",
    "ECE",
    "ME",
    "CE",
    "EE",
]


def is_valid_department(dept_code):
    """Tell whether ``dept_code`` is one of the codes in ``valid_departments``.

    The check is a plain membership test, so it is exact and case-sensitive:
    ``"CSE"`` is accepted while ``"cse"`` is not.

    Args:
        dept_code: The department code to check.

    Returns:
        bool: True if ``dept_code`` is in ``valid_departments``, else False.
    """
    is_known = dept_code in valid_departments
    return is_known