In [17]:
import pandas as pd
import numpy as np


In [18]:
np.random.seed(41)
n = 1000
data = {"Student_ID": np.arange(101,101+n), "Age": np.random.randint(18,24,size=n), "Marks": np.random.randint(40,100,size=n), "Department": np.random.choice(["CSE","ECE","ME","ECE"], size=n)}
df = pd.DataFrame(data)

In [19]:
for col in ["Age","Marks","Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan

# add a small sample of duplicate rows
df = pd.concat([df, df.sample(20)], ignore_index=True)

# set some Age values to null
df.loc[df.sample(10).index, "Age"] = np.nan

In [20]:
df.loc[df.sample(10).index, "Marks"] = np.nan

In [21]:
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].str.lower()

In [22]:
df=df.copy()
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,cse
...,...,...,...,...
1015,995,,63.0,ECE
1016,152,22.0,90.0,CSE
1017,919,21.0,91.0,ece
1018,630,,91.0,ECE


In [23]:
df.columns = (df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)



In [24]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [25]:
df["age"] = pd.to_numeric(df["age"], errors='coerce')
df["marks"] = pd.to_numeric(df["marks"], errors='coerce')   
df["student_id"] = pd.to_numeric(df["student_id"], errors='coerce')
df["department"] = df["department"].str.strip().str.upper()
df

Unnamed: 0,student_id,age,marks,department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,CSE
...,...,...,...,...
1015,995,,63.0,ECE
1016,152,22.0,90.0,CSE
1017,919,21.0,91.0,ECE
1018,630,,91.0,ECE


In [26]:
df["age"].fillna(df["age"].median(), inplace=True)
df["marks"].fillna(df["marks"].median(), inplace=True)
df["department"].fillna("UNKNOWN", inplace=True)

df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["marks"].fillna(df["marks"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Unnamed: 0,student_id,age,marks,department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,20.0,46.0,ME
4,105,18.0,80.0,CSE
...,...,...,...,...
1015,995,20.0,63.0,ECE
1016,152,22.0,90.0,CSE
1017,919,21.0,91.0,ECE
1018,630,20.0,91.0,ECE


In [27]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,student_id,age,marks,department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,20.0,46.0,ME
4,105,18.0,80.0,CSE
...,...,...,...,...
995,1096,21.0,48.0,ME
996,1097,21.0,76.0,ME
997,1098,20.0,42.0,ECE
998,1099,21.0,70.0,ECE


In [28]:
valid_departments = ["CSE", "ECE", "ME","AIML"]
df=df[df["department"].isin(valid_departments)]

In [29]:
valid_departments = ["CSE", "ECE", "ME","AIML"]
df=df[df["department"].isin(valid_departments)]

In [30]:
df

Unnamed: 0,student_id,age,marks,department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,20.0,46.0,ME
4,105,18.0,80.0,CSE
...,...,...,...,...
995,1096,21.0,48.0,ME
996,1097,21.0,76.0,ME
997,1098,20.0,42.0,ECE
998,1099,21.0,70.0,ECE


In [32]:
df
df["age"] = pd.to_numeric(df["age"], errors='coerce')

In [None]:
df["Age"].fillna(df["Age"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)


In [None]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,ECE
1,102,21,56,CSE
2,103,22,65,ME
3,104,20,46,ME
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,ME
996,1097,21,76,ME
997,1098,20,42,ECE
998,1099,21,62,ECE
