In [2]:
import pandas as pd #
import numpy as np  # Random() function used later

In [12]:
np.random.seed(41)
n=1000
df=pd.DataFrame({
    "Student_ID": np.arange(101, 101+n),
    "Age": np.random.randint(18, 24,size=n),
    "Marks": np.random.randint(50, 100,size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AR"], size=n)
})

In [7]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,21,88,ME
1,102,18,60,ME
2,103,21,84,AIML
3,104,22,93,AIML
4,105,19,86,CSE
...,...,...,...,...
995,1096,22,70,AR
996,1097,22,92,CSE
997,1098,23,88,AR
998,1099,22,70,ME


In [9]:
# Introduce Missing Values in Age, Marks, Department
for col in["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan  # Introduce 5% missing values
df 

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,88.0,ME
1,102,18.0,60.0,ME
2,103,21.0,84.0,AIML
3,104,22.0,93.0,AIML
4,105,19.0,86.0,CSE
...,...,...,...,...
995,1096,,70.0,AR
996,1097,22.0,,CSE
997,1098,23.0,88.0,AR
998,1099,22.0,70.0,ME


In [13]:
#Introdece Duplicate records/Rows
pd.concat([df, df.sample(20)], ignore_index=True) # Duplicate 20 random rows

#Introduce invalid ages
df.loc[df.sample(10).index, "Age"] = np.random.choice([-5, 240], size=10)  
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME
...,...,...,...,...
995,1096,21,88,AR
996,1097,21,80,ECE
997,1098,20,51,CSE
998,1099,21,53,AR


In [15]:
#Introduce Outliers
df.loc[df.sample(10).index, "Marks"] = np.random.choice([150, 300], size=10)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME
...,...,...,...,...
995,1096,21,88,AR
996,1097,21,80,ECE
997,1098,20,51,CSE
998,1099,21,53,AR


In [17]:
# Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].sample(10).str.lower()
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME
...,...,...,...,...
995,1096,21,88,
996,1097,21,80,
997,1098,20,51,CSE
998,1099,21,53,AR


In [19]:
df=df.copy()  # To avoid SettingWithCopyWarning

In [22]:
#column name standarizedization
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    )

In [24]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [31]:
# Enforcing correct Data Types
df["age"] = pd.to_numeric(df["age"], errors='coerce')
df["marks"] = pd.to_numeric(df["marks"], errors='coerce')   
df["department"] = df["department"].astype(str)

In [None]:
# Handling Missing Values
df["age"].fillna(df["age"].median(), inplace=True)
df.method({col: value}, inplace=True)
df["marks"].fillna(df["marks"].median(), inplace=True)
df["department"].fillna("Unknown", inplace=True)    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["marks"].fillna(df["marks"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [30]:
# Handle Duplicates
df.drop_duplicates(inplace=True)