In [42]:
import pandas as pd 
import numpy as np # to use random


In [43]:
np.random.seed(41)
n=1000

df=pd.DataFrame( {
    "Student_ID": np.arange(101,101+n),
    "Age": np.random.randint(18,25,size=n),
    "Marks": np.random.randint(40,100, size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AUTOMATION"],size= n)
})


In [44]:
# Introduce Missing Values in Age, Marks and Department

for col in ["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan


# Duplicate records/rows

pd.concat([df, df.sample(20)], ignore_index=True)


#Introduce invalid Ages

df.loc[df.sample(n=10).index, "Age"] = np.random.choice([-5,240], size=10)


#Introduce Outliers 

df.loc[df.sample(n=10).index, "Marks"] = np.random.choice([150,240], size=10)


# Incosistent Text Formatting

df["Department"]= df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].str.lower()


In [45]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,72.0,AUTOMATION
1,102,21.0,83.0,AUTOMATION
2,103,22.0,67.0,ME
3,104,20.0,96.0,AUTOMATION
4,105,18.0,58.0,ECE
...,...,...,...,...
995,1096,24.0,54.0,ECE
996,1097,19.0,92.0,AIML
997,1098,23.0,98.0,ECE
998,1099,21.0,89.0,me


In [46]:
df=df.copy()

In [47]:
# Standardizing Column Names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ","_")
    )

In [48]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [49]:
# Enforcing Correct Data Types  
df["age"] = pd.to_numeric(df["age"], errors='coerce')

In [50]:
# handling missing values

df['age'].fillna(df['age'].median(), inplace=True)

In [51]:
# Handle Duplicate 
df.drop_duplicates(inplace=True)

In [52]:
valid_department = ["AIML", "ECE", "ME", "CSE", "IT"]
df['department'].isin(valid_department)

0      False
1      False
2       True
3      False
4       True
       ...  
995     True
996     True
997     True
998    False
999     True
Name: department, Length: 1000, dtype: bool

In [53]:
valid_department = ["AIML", "ECE", "ME", "CSE", "IT"]
df = df[df['department'].isin(valid_department)]

In [55]:
df.department

2        ME
4       ECE
6        ME
7       ECE
8       ECE
       ... 
992    AIML
995     ECE
996    AIML
997     ECE
999     CSE
Name: department, Length: 678, dtype: object

In [59]:
df=df[df["age"].between(18,24)]

In [61]:
df.age

2      22.0
4      18.0
6      21.0
7      19.0
8      23.0
       ... 
992    22.0
995    24.0
996    19.0
997    23.0
999    20.0
Name: age, Length: 671, dtype: float64

In [63]:
df=df[df["marks"].between(0,100)]

In [66]:
df.marks

2      67.0
4      58.0
7      79.0
8      46.0
9      48.0
       ... 
992    88.0
995    54.0
996    92.0
997    98.0
999    45.0
Name: marks, Length: 627, dtype: float64