In [37]:
import pandas as pd # for data manipulation
import numpy as np  # for numerical operations(random data generation)
np.random.seed(41)
n=1000
df = pd.DataFrame({    
    "Student_ID":np.arange(101,101+n),
    "Age": np.random.randint(18,30,size=n),
    "Marks":np.random.randint(40,100,size=n),
    "Departrment": np.random.choice(['Physics', 'Chemistry', 'Mathematics', 'Biology', 'Computer Science'], size=n)
})
df 


Unnamed: 0,Student_ID,Age,Marks,Departrment
0,101,18,81,Computer Science
1,102,21,56,Computer Science
2,103,20,99,Mathematics
3,104,18,59,Mathematics
4,105,19,67,Biology
...,...,...,...,...
995,1096,23,76,Physics
996,1097,24,42,Physics
997,1098,25,79,Chemistry
998,1099,26,52,Chemistry


In [38]:
#introduce missing values in age, Marks and Department columns
for col in ['Age', 'Marks', 'Departrment']:
    df.loc[df.sample(frac=0.1).index, col] = np. nan

#Adding Duplicate rows 
df = pd.concat([df, df.sample(20)], ignore_index=True)

#Rename misspelled column to correct name if necessary
if 'Departrment' in df.columns and 'Department' not in df.columns:
	df = df.rename(columns={'Departrment': 'Department'})

#Introduce invalid age values
df.loc[df.sample(20).index, 'Age'] = np.random.choice([-5, 240], size=20)

#outliers in Marks
df.loc[df.sample(20).index, 'Marks'] = np.random.choice([-10, 150], size=20)

#Inconsistent text formatting
df['Department'] = df['Department'].astype(str)
df.loc[df.sample(20).index, 'Department'] = df['Department'].str.lower()

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,-5.0,81.0,Computer Science
1,102,21.0,56.0,Computer Science
2,103,20.0,99.0,Mathematics
3,104,,59.0,Mathematics
4,105,19.0,67.0,Biology
...,...,...,...,...
1015,915,29.0,94.0,Biology
1016,598,28.0,65.0,Chemistry
1017,1033,29.0,85.0,Computer Science
1018,252,18.0,70.0,Physics


In [26]:
df=df.copy()

In [27]:
#standaerized the column names
df.columns = (df.columns.str.strip().str.lower().str.replace(' ', '_'))

In [28]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [29]:
#Enforcing correct data types
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['marks'] = pd.to_numeric(df['marks'], errors='coerce')
df['student_id'] = pd.to_numeric(df['student_id'], errors='coerce')
df['department'] = df['department'].astype(str)

In [None]:
#handling missing values
df['age'] =df['age'].fillna(df['age'].median())

# df['marks'].fillna(df['marks'].median(), inplace=True)
# df['department'].fillna('Unknown', inplace=True)


In [31]:
#handling duplicates
df.drop_duplicates(inplace=True)

In [32]:
#applying validation on values
valid_departments = ['Physics', 'Chemistry', 'Mathematics', 'Biology', 'Computer Science']
df = df['department'].isin(valid_departments)
df

0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: department, Length: 1000, dtype: bool

In [45]:
valid_departments = ['Physics', 'Chemistry', 'Mathematics', 'Biology', 'Computer Science']
df = df[df['Department'].isin(valid_departments)]
df

Unnamed: 0,Student_ID,Age,Marks,Department
1,102,21.0,56.0,Computer Science
2,103,20.0,99.0,Mathematics
4,105,19.0,67.0,Biology
8,109,23.0,73.0,Physics
11,112,21.0,71.0,Physics
...,...,...,...,...
1008,271,20.0,58.0,Mathematics
1009,1052,24.0,93.0,Physics
1013,391,22.0,79.0,Mathematics
1014,713,20.0,81.0,Physics


In [35]:
df

0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: department, Length: 1000, dtype: bool

In [39]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,-5.0,81.0,Computer Science
1,102,21.0,56.0,Computer Science
2,103,20.0,99.0,Mathematics
3,104,,59.0,Mathematics
4,105,19.0,67.0,Biology
...,...,...,...,...
1015,915,29.0,94.0,Biology
1016,598,28.0,65.0,Chemistry
1017,1033,29.0,85.0,Computer Science
1018,252,18.0,70.0,Physics


In [44]:
df = df[df['Age'].between(18,24)]
df


Unnamed: 0,Student_ID,Age,Marks,Department
1,102,21.0,56.0,Computer Science
2,103,20.0,99.0,Mathematics
4,105,19.0,67.0,Biology
5,106,21.0,87.0,
8,109,23.0,73.0,Physics
...,...,...,...,...
1008,271,20.0,58.0,Mathematics
1009,1052,24.0,93.0,Physics
1013,391,22.0,79.0,Mathematics
1014,713,20.0,81.0,Physics


In [43]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,-5.0,81.0,Computer Science
1,102,21.0,56.0,Computer Science
2,103,20.0,99.0,Mathematics
3,104,,59.0,Mathematics
4,105,19.0,67.0,Biology
...,...,...,...,...
1015,915,29.0,94.0,Biology
1016,598,28.0,65.0,Chemistry
1017,1033,29.0,85.0,Computer Science
1018,252,18.0,70.0,Physics


In [46]:
df = df[df['Marks'].between(24,56)]
df

Unnamed: 0,Student_ID,Age,Marks,Department
1,102,21.0,56.0,Computer Science
12,113,21.0,43.0,Mathematics
16,117,22.0,56.0,Chemistry
28,129,20.0,50.0,Mathematics
30,131,23.0,49.0,Mathematics
...,...,...,...,...
950,1051,20.0,48.0,Mathematics
974,1075,24.0,49.0,Computer Science
988,1089,19.0,46.0,Computer Science
994,1095,21.0,43.0,Mathematics
