In [3]:
import pandas as pd 
#create a simple, clean synthetic dataset
import numpy as np 

In [34]:
np.random.seed(41)
n = 1000
df = pd.DataFrame ({
    'student_ID': np.arange(101, 101+n),
    'age': np.random.randint(18, 30,size= n),
    'marks': np.random.randint(40, 100, size=n),
    'grade': np.random.choice(['A', 'B', 'C', 'D'], size=n),
    'Department': np.random.choice(["Physics", "Chemistry", "Mathematics", "Biology", "Computer Science"], size=n)

})

In [5]:
#introduce missing values in age and marks and department  columns
for col in ['age', 'marks', 'Department']:
    df.loc[df.sample(frac=0.05).index, col] = np.nan
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18.0,81.0,A,Physics
1,102,21.0,56.0,A,Computer Science
2,103,20.0,99.0,C,Biology
3,104,18.0,59.0,B,Biology
4,105,19.0,67.0,C,Physics
...,...,...,...,...,...
995,1096,23.0,76.0,B,Computer Science
996,1097,24.0,42.0,A,Biology
997,1098,25.0,79.0,D,Chemistry
998,1099,26.0,52.0,A,Physics


In [6]:
#duplicate records/rows
pd.concat([df, df.sample(10)], ignore_index=True)
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18.0,81.0,A,Physics
1,102,21.0,56.0,A,Computer Science
2,103,20.0,99.0,C,Biology
3,104,18.0,59.0,B,Biology
4,105,19.0,67.0,C,Physics
...,...,...,...,...,...
995,1096,23.0,76.0,B,Computer Science
996,1097,24.0,42.0,A,Biology
997,1098,25.0,79.0,D,Chemistry
998,1099,26.0,52.0,A,Physics


In [7]:
#introduce invalid ages
df.loc[df.sample(10).index,"age"] = np.random.choice([-5, 150, 240], size=10)
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18.0,81.0,A,Physics
1,102,21.0,56.0,A,Computer Science
2,103,20.0,99.0,C,Biology
3,104,18.0,59.0,B,Biology
4,105,19.0,67.0,C,Physics
...,...,...,...,...,...
995,1096,23.0,76.0,B,Computer Science
996,1097,24.0,42.0,A,Biology
997,1098,25.0,79.0,D,Chemistry
998,1099,26.0,52.0,A,Physics


In [8]:
#introduce OUTLIERS 
df.loc[df.sample(10).index,"marks"] = np.random.choice([-5, 150, 240], size=10)
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18.0,81.0,A,Physics
1,102,21.0,56.0,A,Computer Science
2,103,20.0,99.0,C,Biology
3,104,18.0,59.0,B,Biology
4,105,19.0,67.0,C,Physics
...,...,...,...,...,...
995,1096,23.0,76.0,B,Computer Science
996,1097,24.0,42.0,A,Biology
997,1098,25.0,79.0,D,Chemistry
998,1099,26.0,52.0,A,Physics


In [9]:
#inconsistent text formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].str.lower()
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18.0,81.0,A,Physics
1,102,21.0,56.0,A,Computer Science
2,103,20.0,99.0,C,Biology
3,104,18.0,59.0,B,biology
4,105,19.0,67.0,C,Physics
...,...,...,...,...,...
995,1096,23.0,76.0,B,Computer Science
996,1097,24.0,42.0,A,biology
997,1098,25.0,79.0,D,Chemistry
998,1099,26.0,52.0,A,Physics


In [10]:
df=df.copy()

In [None]:
df.columns=(
    df.columns
    .str.strip()
    .str.lower()
    .str.replace("", "_"))

In [18]:
df.columns

Index(['studentid', 'age', 'marks', 'grade', 'department'], dtype='object')

In [13]:
#Enforcing Correct Data Type
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["marks"] = pd.to_numeric(df["marks"], errors="coerce")  
df["department"] = df["department"].astype("category")
df

Unnamed: 0,studentid,age,marks,grade,department
0,101,18.0,81.0,A,Physics
1,102,21.0,56.0,A,Computer Science
2,103,20.0,99.0,C,Biology
3,104,18.0,59.0,B,biology
4,105,19.0,67.0,C,Physics
...,...,...,...,...,...
995,1096,23.0,76.0,B,Computer Science
996,1097,24.0,42.0,A,biology
997,1098,25.0,79.0,D,Chemistry
998,1099,26.0,52.0,A,Physics


In [14]:
#Enforcing Correct Data Type
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df

Unnamed: 0,studentid,age,marks,grade,department
0,101,18.0,81.0,A,Physics
1,102,21.0,56.0,A,Computer Science
2,103,20.0,99.0,C,Biology
3,104,18.0,59.0,B,biology
4,105,19.0,67.0,C,Physics
...,...,...,...,...,...
995,1096,23.0,76.0,B,Computer Science
996,1097,24.0,42.0,A,biology
997,1098,25.0,79.0,D,Chemistry
998,1099,26.0,52.0,A,Physics


In [41]:
#handling missing values
df["age"].fillna(df["age"].median(), inplace=True)
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18,True,A,Physics
1,102,21,True,A,Computer Science
2,103,20,True,C,Biology
3,104,18,True,B,Biology
4,105,19,True,C,Physics
...,...,...,...,...,...
995,1096,23,True,B,Computer Science
996,1097,24,True,A,Biology
997,1098,25,True,D,Chemistry
998,1099,26,True,A,Physics


In [40]:
#handling dulicate records
df.drop_duplicates(inplace=True)
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18,True,A,Physics
1,102,21,True,A,Computer Science
2,103,20,True,C,Biology
3,104,18,True,B,Biology
4,105,19,True,C,Physics
...,...,...,...,...,...
995,1096,23,True,B,Computer Science
996,1097,24,True,A,Biology
997,1098,25,True,D,Chemistry
998,1099,26,True,A,Physics


In [None]:

#validating and correcting ranges
valid_departments = ["Computer Science","Physics","Chemistry","biology",]
df[df["Department"].isin(valid_departments)]
df
df


Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18,True,A,Physics
1,102,21,True,A,Computer Science
2,103,20,True,C,Biology
3,104,18,True,B,Biology
4,105,19,True,C,Physics
...,...,...,...,...,...
995,1096,23,True,B,Computer Science
996,1097,24,True,A,Biology
997,1098,25,True,D,Chemistry
998,1099,26,True,A,Physics


In [44]:
#validating and correcting ranges
valid_departments = ["computer science","physics","chemistry","biology",]
df = df[df["Department"].isin(valid_departments)] 
df

Unnamed: 0,student_ID,age,marks,grade,Department


In [37]:
df["age"].fillna(df["age"].median(), inplace=True)
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18,True,A,Physics
1,102,21,True,A,Computer Science
2,103,20,True,C,Biology
3,104,18,True,B,Biology
4,105,19,True,C,Physics
...,...,...,...,...,...
995,1096,23,True,B,Computer Science
996,1097,24,True,A,Biology
997,1098,25,True,D,Chemistry
998,1099,26,True,A,Physics


In [36]:
df["marks"] = df["marks"].between(0,100)
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18,True,A,Physics
1,102,21,True,A,Computer Science
2,103,20,True,C,Biology
3,104,18,True,B,Biology
4,105,19,True,C,Physics
...,...,...,...,...,...
995,1096,23,True,B,Computer Science
996,1097,24,True,A,Biology
997,1098,25,True,D,Chemistry
998,1099,26,True,A,Physics


In [35]:
df

Unnamed: 0,student_ID,age,marks,grade,Department
0,101,18,81,A,Physics
1,102,21,56,A,Computer Science
2,103,20,99,C,Biology
3,104,18,59,B,Biology
4,105,19,67,C,Physics
...,...,...,...,...,...
995,1096,23,76,B,Computer Science
996,1097,24,42,A,Biology
997,1098,25,79,D,Chemistry
998,1099,26,52,A,Physics
