In [14]:
import numpy as np
import pandas as pd


In [15]:
np.random.seed(41)
n = 1000
data = {"Student_ID": np.arange(101,101+n), "Age": np.random.randint(18,24,size=n), "Marks": np.random.randint(40,100,size=n), "Department": np.random.choice(["CSE","ECE","ME","ECE"], size=n)}
df = pd.DataFrame(data)

In [16]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,ECE
1,102,21,56,CSE
2,103,22,65,ME
3,104,20,46,ME
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,ME
996,1097,21,76,ME
997,1098,20,42,ECE
998,1099,21,62,ECE


In [17]:
for col in ["Age","Marks","Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan

# add a small sample of duplicate rows
df = pd.concat([df, df.sample(20)], ignore_index=True)

# set some Age values to null
df.loc[df.sample(10).index, "Age"] = np.nan

In [18]:
df.loc[df.sample(10).index, "Marks"] = np.nan

In [19]:
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].str.lower()

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  1020 non-null   int64  
 1   Age         957 non-null    float64
 2   Marks       961 non-null    float64
 3   Department  1020 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 32.0+ KB


In [21]:
df.columns = (df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)


In [22]:
df.columns

Index(['student_id', 'age', 'marks', 'department'], dtype='object')

In [27]:
df["placed"] = np.where(df["marks"] > 60, "Yes", "No")
df

Unnamed: 0,student_id,age,marks,department,Placed,placed
0,101,18.0,92.0,ECE,Yes,Yes
1,102,21.0,56.0,CSE,No,No
2,103,22.0,65.0,ME,Yes,Yes
3,104,,46.0,ME,No,No
4,105,18.0,80.0,cse,Yes,Yes
...,...,...,...,...,...,...
1015,995,,63.0,ECE,Yes,Yes
1016,152,22.0,90.0,CSE,Yes,Yes
1017,919,21.0,91.0,ece,Yes,Yes
1018,630,,91.0,ECE,Yes,Yes


In [None]:
df.columns

Index(['student_id', 'age', 'marks', 'department', 'Placed', 'placed'], dtype='object')

In [30]:
df

Unnamed: 0,student_id,age,marks,department,Placed,placed
0,101,18.0,92.0,ECE,Yes,Yes
1,102,21.0,56.0,CSE,No,No
2,103,22.0,65.0,ME,Yes,Yes
3,104,,46.0,ME,No,No
4,105,18.0,80.0,cse,Yes,Yes
...,...,...,...,...,...,...
1015,995,,63.0,ECE,Yes,Yes
1016,152,22.0,90.0,CSE,Yes,Yes
1017,919,21.0,91.0,ece,Yes,Yes
1018,630,,91.0,ECE,Yes,Yes


In [31]:
del df['Placed']

In [32]:
df

Unnamed: 0,student_id,age,marks,department,placed
0,101,18.0,92.0,ECE,Yes
1,102,21.0,56.0,CSE,No
2,103,22.0,65.0,ME,Yes
3,104,,46.0,ME,No
4,105,18.0,80.0,cse,Yes
...,...,...,...,...,...
1015,995,,63.0,ECE,Yes
1016,152,22.0,90.0,CSE,Yes
1017,919,21.0,91.0,ece,Yes
1018,630,,91.0,ECE,Yes


In [None]:
df["age"] = pd.to_numeric(df["age"], errors='coerce')
df["marks"] = pd.to_numeric(df["marks"], errors='coerce')   
df["student_id"] = pd.to_numeric(df["student_id"], errors='coerce')
df["department"] = df["department"].str.strip().str.upper()
df

Unnamed: 0,student_id,age,marks,department,placed
0,101,18.0,92.0,ECE,Yes
1,102,21.0,56.0,CSE,No
2,103,22.0,65.0,ME,Yes
3,104,,46.0,ME,No
4,105,18.0,80.0,CSE,Yes
...,...,...,...,...,...
1015,995,,63.0,ECE,Yes
1016,152,22.0,90.0,CSE,Yes
1017,919,21.0,91.0,ECE,Yes
1018,630,,91.0,ECE,Yes


In [None]:
df["age"].fillna(df["age"].median(), inplace=True)
df["marks"].fillna(df["marks"].median(), inplace=True)
df["department"].fillna("UNKNOWN", inplace=True)

df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["marks"].fillna(df["marks"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Unnamed: 0,student_id,age,marks,department,placed
0,101,18.0,92.0,ECE,Yes
1,102,21.0,56.0,CSE,No
2,103,22.0,65.0,ME,Yes
3,104,20.0,46.0,ME,No
4,105,18.0,80.0,CSE,Yes
...,...,...,...,...,...
1015,995,20.0,63.0,ECE,Yes
1016,152,22.0,90.0,CSE,Yes
1017,919,21.0,91.0,ECE,Yes
1018,630,20.0,91.0,ECE,Yes


In [None]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,student_id,age,marks,department,placed
0,101,18.0,92.0,ECE,Yes
1,102,21.0,56.0,CSE,No
2,103,22.0,65.0,ME,Yes
3,104,20.0,46.0,ME,No
4,105,18.0,80.0,CSE,Yes
...,...,...,...,...,...
995,1096,21.0,48.0,ME,No
996,1097,21.0,76.0,ME,Yes
997,1098,20.0,42.0,ECE,No
998,1099,21.0,70.0,ECE,Yes
