In [13]:
import pandas as pd 
import numpy as np # to use random


In [14]:
np.random.seed(41)
n=1000

df=pd.DataFrame( {
    "Student_ID": np.arange(101,101+n),
    "Age": np.random.randint(18,25,size=n),
    "Marks": np.random.randint(40,100, size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AUTOMATION"],size= n)
})


In [15]:
# Introduce Missing Values in Age, Marks and Department

for col in ["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan


# Duplicate records/rows

pd.concat([df, df.sample(20)], ignore_index=True)


#Introduce invalid Ages

df.loc[df.sample(n=10).index, "Age"] = np.random.choice([-5,240], size=10)


#Introduce Outliers 

df.loc[df.sample(n=10).index, "Marks"] = np.random.choice([150,240], size=10)


# Incosistent Text Formatting

df["Department"]= df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].str.lower()


In [16]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,72.0,AUTOMATION
1,102,21.0,83.0,AUTOMATION
2,103,22.0,67.0,ME
3,104,20.0,96.0,AUTOMATION
4,105,18.0,58.0,ECE
...,...,...,...,...
995,1096,24.0,54.0,ECE
996,1097,19.0,92.0,AIML
997,1098,23.0,98.0,ECE
998,1099,21.0,89.0,me


In [17]:
df=df.copy()

In [27]:
# Standardizing Column Names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ","_")
    )

In [28]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [29]:
# Enforcing Correct Data Types  
df["age"] = pd.to_numeric(df["age"], errors='coerce')

In [30]:
# handling missing values

df['age'].fillna(df['age'].median(), inplace=True)

In [31]:
# Handle Duplicate 
df.drop_duplicates(inplace=True)