In [57]:
import pandas as pd
import numpy as np
np.random.seed(41)
n=1000
data = ({
    "Student_ID": np.arange(101,101+n),
    "Age": np.random.randint(18,24,size=n),
    "Marks": np.random.randint(40,100,size=n),
    "Department" :np.random.choice(["CSE","ECE","A&R","ME","AIML"],size=n)
})
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [58]:
from numpy.random.mtrand import sample
# Introduce Missing values in Age, Marks and Department
for col in ["Age","Marks","Department"]:
    df.loc[df.sample(frac=0.5).index,col] = np.nan

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [59]:
# Duplicate Records/Rows
pd.concat([df,df.sample(50)], ignore_index=True)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [60]:
df.loc[df.sample(10).index,"Age"]= np.nan

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [61]:
#Duplicate invalid ages
df.loc[df.sample(10).index,"Age"]=np.random.choice([-5, 240], size=10)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [62]:
df.loc[df.sample(60).index,"Marks"]=np.nan
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,,ECE


In [63]:
df.loc[df.sample(10).index,"Marks"] = np.random.choice([150, 240], size=10)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,,ECE


In [64]:
df = df.copy()
#Creating a copy of df so that original data doesnt get disturbed

In [65]:
df.columns =(
    df.columns
    .str.strip()   #Removing spacing from columns
    .str.lower()             #converting column hedings to lowercase
    .str.replace("_","")  #Removing spacing by.
)
df.columns


Index(['studentid', 'age', 'marks', 'department'], dtype='object')

In [66]:
#Enforcing Correct datatypes
df["age"]=pd.to_numeric(df["age"],errors="coerce")
df

Unnamed: 0,studentid,age,marks,department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,,ECE


In [67]:
# Handling Missing Values using Mean, Median, Categorical

df["age"].fillna(df["age"].median(),inplace=True)
df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(),inplace=True)


Unnamed: 0,studentid,age,marks,department
0,101,20.0,92.0,CSE
1,102,21.0,,A&R
2,103,20.0,,A&R
3,104,20.0,46.0,
4,105,20.0,80.0,
...,...,...,...,...
995,1096,20.0,48.0,
996,1097,21.0,,A&R
997,1098,20.0,42.0,CSE
998,1099,20.0,,ECE


In [68]:
#Handle Duplicates
df.drop_duplicates(inplace=True)
df

Unnamed: 0,studentid,age,marks,department
0,101,20.0,92.0,CSE
1,102,21.0,,A&R
2,103,20.0,,A&R
3,104,20.0,46.0,
4,105,20.0,80.0,
...,...,...,...,...
995,1096,20.0,48.0,
996,1097,21.0,,A&R
997,1098,20.0,42.0,CSE
998,1099,20.0,,ECE


**Data Validations**

In [69]:
valid_department = ["CSE","AIML","A&R","ECE"]
df=df["department"].isin(valid_department)

In [70]:
df

Unnamed: 0,department
0,True
1,True
2,True
3,False
4,False
...,...
995,False
996,True
997,True
998,True


In [90]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [92]:

df=df[df["Age"].between(18,24)]
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [93]:
df=df[df["Marks"].between(0,100)]
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE
