# Dealing with missing values

In [2]:
# Pandas manages empty cells with Nan values. Let's say if in our data set there are some missing values.
# Pandas automatically manages it with nan values so it doesn't raise an error
# when we have a column filled with ints or floats then pandas manages it with NAN.
# OR let's say if we have object dtype column then it may assign it with None.

In [3]:
# Let's take some examples

In [5]:
import numpy as np
import pandas as pd

In [7]:
pd.Series([1,np.nan,2])

# so you see nan is nothing but a very small floating integer.
# That's why it type casted everything with float values

0    1.0
1    NaN
2    2.0
dtype: float64

In [9]:
pd.Series([1,np.nan,2,None])

# It converted even None to Nan

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [11]:
a = pd.Series(['1','2','3',np.nan,None])
a

# None can not convert nan into none.
# even though nan is there but overall our dtype is an object
# but if we check for individual values nan is still a float

0       1
1       2
2       3
3     NaN
4    None
dtype: object

In [14]:
type(a[3])

float

### Calculating missing values row wise and column wise

In [16]:
df2 = pd.DataFrame([[np.nan,2,np.nan,0],
                   [3,4,np.nan,1],
                   [np.nan,3,np.nan,4]],
                   columns = ['A','B','C','D'])

In [19]:
df2


Unnamed: 0,A,B,C,D
0,,2,,0
1,3.0,4,,1
2,,3,,4


In [20]:
# we have this function isna . it returns us a boolean matrix. let's say if a cell has nan value, it returns True else False

df2.isna()


Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,False,True,False


In [21]:
# How to find out nan values across the different axis, even after we know that pd.isna() doesn't have any parameter axis

In [22]:
df2.isna().sum(axis = 1)

0    2
1    1
2    2
dtype: int64

In [24]:
df2.isna().sum()

# by default axis = 0
# the sum function is taking True, False as binary digits. 

A    2
B    0
C    3
D    0
dtype: int64

In [26]:
df2['D'].mean().round(2)

1.67

In [28]:
# But how it behaves when we have nan in our column cells?
# it is not calculating rows where we have nan values

df2['A'].mean()

3.0

# FILLING NAN VALUES

In [30]:
df2.fillna(2)

# filled a scalar value in our dataset

Unnamed: 0,A,B,C,D
0,2.0,2,2.0,0
1,3.0,4,2.0,1
2,2.0,3,2.0,4


In [32]:
# But this doesn't make sense for the entire data set.

df2.fillna(df2['A'].mean())



Unnamed: 0,A,B,C,D
0,3.0,2,3.0,0
1,3.0,4,3.0,1
2,3.0,3,3.0,4


In [35]:
# This doesn't make any sense either

df2['A'].fillna(df2['A'].mean())

# Remember inplace is required if you want a permanent change

0    3.0
1    3.0
2    3.0
Name: A, dtype: float64

In [37]:
# this method filled the missing values with the last value

df2.fillna(method = 'ffill')

Unnamed: 0,A,B,C,D
0,,2,,0
1,3.0,4,,1
2,3.0,3,,4


In [39]:
df2.fillna(method = 'bfill')

Unnamed: 0,A,B,C,D
0,3.0,2,,0
1,3.0,4,,1
2,,3,,4


In [41]:
df2.fillna(method = 'backfill')

Unnamed: 0,A,B,C,D
0,3.0,2,,0
1,3.0,4,,1
2,,3,,4


In [42]:
# now let's work with our pfizer data set

In [77]:
df = pd.read_csv("C:/Users/htc/Downloads/Pfizer_1.csv")

In [78]:
df

Unnamed: 0,Date,Drug_Name,Parameter,1:30:00,2:30:00,3:30:00,4:30:00,5:30:00,6:30:00,7:30:00,8:30:00,9:30:00,10:30:00,11:30:00,12:30:00
0,15-10-2020,diltiazem hydrochloride,Temperature,23.0,22.0,,21.0,21.0,22,23.0,21.0,22.0,20,20.0,21
1,15-10-2020,diltiazem hydrochloride,Pressure,12.0,13.0,,11.0,13.0,14,16.0,16.0,24.0,18,19.0,20
2,15-10-2020,docetaxel injection,Temperature,,17.0,18.0,,17.0,18,,,23.0,23,25.0,25
3,15-10-2020,docetaxel injection,Pressure,,22.0,22.0,,22.0,23,,,27.0,26,29.0,28
4,15-10-2020,ketamine hydrochloride,Temperature,24.0,,,27.0,,26,25.0,24.0,23.0,22,21.0,20
5,15-10-2020,ketamine hydrochloride,Pressure,8.0,,,7.0,,9,10.0,11.0,10.0,9,9.0,11
6,16-10-2020,diltiazem hydrochloride,Temperature,34.0,35.0,36.0,36.0,37.0,38,37.0,38.0,39.0,40,,42
7,16-10-2020,diltiazem hydrochloride,Pressure,18.0,19.0,20.0,21.0,22.0,23,24.0,25.0,25.0,24,,27
8,16-10-2020,docetaxel injection,Temperature,46.0,47.0,,48.0,48.0,49,50.0,52.0,55.0,56,57.0,58
9,16-10-2020,docetaxel injection,Pressure,23.0,24.0,,25.0,26.0,27,28.0,29.0,28.0,28,29.0,30


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       18 non-null     object 
 1   Drug_Name  18 non-null     object 
 2   Parameter  18 non-null     object 
 3   1:30:00    16 non-null     float64
 4   2:30:00    16 non-null     float64
 5   3:30:00    12 non-null     float64
 6   4:30:00    14 non-null     float64
 7   5:30:00    16 non-null     float64
 8   6:30:00    18 non-null     int64  
 9   7:30:00    16 non-null     float64
 10  8:30:00    14 non-null     float64
 11  9:30:00    16 non-null     float64
 12  10:30:00   18 non-null     int64  
 13  11:30:00   16 non-null     float64
 14  12:30:00   18 non-null     int64  
dtypes: float64(9), int64(3), object(3)
memory usage: 2.2+ KB


In [47]:
df.isna().sum(axis=1)

0     1
1     1
2     4
3     4
4     3
5     3
6     1
7     1
8     1
9     1
10    2
11    2
12    1
13    1
14    0
15    0
16    0
17    0
dtype: int64

In [48]:
df.isna().sum()

Date         0
Drug_Name    0
Parameter    0
1:30:00      2
2:30:00      2
3:30:00      6
4:30:00      4
5:30:00      2
6:30:00      0
7:30:00      2
8:30:00      4
9:30:00      2
10:30:00     0
11:30:00     2
12:30:00     0
dtype: int64

In [49]:
# we have a function pd.dropna() it drops all the rows where we have nan values

In [53]:
df.dropna(axis=1)

# by default axis = 0
# LOL we are just left with 4 rows.
# not ideal to use dropna 
# we will fill the missing values 

Unnamed: 0,Date,Drug_Name,Parameter,6:30:00,10:30:00,12:30:00
0,15-10-2020,diltiazem hydrochloride,Temperature,22,20,21
1,15-10-2020,diltiazem hydrochloride,Pressure,14,18,20
2,15-10-2020,docetaxel injection,Temperature,18,23,25
3,15-10-2020,docetaxel injection,Pressure,23,26,28
4,15-10-2020,ketamine hydrochloride,Temperature,26,22,20
5,15-10-2020,ketamine hydrochloride,Pressure,9,9,11
6,16-10-2020,diltiazem hydrochloride,Temperature,38,40,42
7,16-10-2020,diltiazem hydrochloride,Pressure,23,24,27
8,16-10-2020,docetaxel injection,Temperature,49,56,58
9,16-10-2020,docetaxel injection,Pressure,27,28,30


In [54]:
df.dropna(axis=0)

Unnamed: 0,Date,Drug_Name,Parameter,1:30:00,2:30:00,3:30:00,4:30:00,5:30:00,6:30:00,7:30:00,8:30:00,9:30:00,10:30:00,11:30:00,12:30:00
14,17-10-2020,docetaxel injection,Temperature,12.0,13.0,14.0,15.0,16.0,17,18.0,19.0,20.0,21,22.0,23
15,17-10-2020,docetaxel injection,Pressure,20.0,22.0,22.0,22.0,22.0,23,25.0,26.0,27.0,28,29.0,28
16,17-10-2020,ketamine hydrochloride,Temperature,13.0,14.0,15.0,16.0,17.0,18,19.0,20.0,21.0,22,23.0,24
17,17-10-2020,ketamine hydrochloride,Pressure,8.0,9.0,10.0,11.0,11.0,12,12.0,11.0,12.0,13,14.0,15


In [56]:
df.dropna(axis=1)

# LOL half of the important columns are gone

Unnamed: 0,Date,Drug_Name,Parameter,6:30:00,10:30:00,12:30:00
0,15-10-2020,diltiazem hydrochloride,Temperature,22,20,21
1,15-10-2020,diltiazem hydrochloride,Pressure,14,18,20
2,15-10-2020,docetaxel injection,Temperature,18,23,25
3,15-10-2020,docetaxel injection,Pressure,23,26,28
4,15-10-2020,ketamine hydrochloride,Temperature,26,22,20
5,15-10-2020,ketamine hydrochloride,Pressure,9,9,11
6,16-10-2020,diltiazem hydrochloride,Temperature,38,40,42
7,16-10-2020,diltiazem hydrochloride,Pressure,23,24,27
8,16-10-2020,docetaxel injection,Temperature,49,56,58
9,16-10-2020,docetaxel injection,Pressure,27,28,30


In [58]:
df['1:30:00'].fillna(df['1:30:00'].mean())

# Is this an ideal way?
# No. because in a single column we have data set of different  - different drugs. So this doesn't make much sense

0     23.00
1     12.00
2     17.75
3     17.75
4     24.00
5      8.00
6     34.00
7     18.00
8     46.00
9     23.00
10     8.00
11    12.00
12    20.00
13     3.00
14    12.00
15    20.00
16    13.00
17     8.00
Name: 1:30:00, dtype: float64

In [59]:
# We will use group by based on drugname and then we will calculate mean values and apply fillna
# Before that let's make our data a lil' clean. It is a lil' clumsy rn

In [79]:
df_melted = pd.melt(df,id_vars = ['Date','Drug_Name','Parameter'],
                   var_name = 'Time',
                   value_name = 'Result')

In [80]:
df_tidy = pd.pivot(df_melted,index = ['Date','Drug_Name','Time'],
                  columns = 'Parameter',
                  values = 'Result').reset_index()

In [81]:
df_tidy.columns.name = None

In [82]:
df_tidy

Unnamed: 0,Date,Drug_Name,Time,Pressure,Temperature
0,15-10-2020,diltiazem hydrochloride,10:30:00,18.0,20.0
1,15-10-2020,diltiazem hydrochloride,11:30:00,19.0,20.0
2,15-10-2020,diltiazem hydrochloride,12:30:00,20.0,21.0
3,15-10-2020,diltiazem hydrochloride,1:30:00,12.0,23.0
4,15-10-2020,diltiazem hydrochloride,2:30:00,13.0,22.0
...,...,...,...,...,...
103,17-10-2020,ketamine hydrochloride,5:30:00,11.0,17.0
104,17-10-2020,ketamine hydrochloride,6:30:00,12.0,18.0
105,17-10-2020,ketamine hydrochloride,7:30:00,12.0,19.0
106,17-10-2020,ketamine hydrochloride,8:30:00,11.0,20.0


In [68]:
# Now we will fill mean values in our pressure and temperature table based on different drugs

In [87]:
def temp_mean(x):
    x['temp_avg'] = x['Temperature'].mean().round(2)
    return x
df_tidy = df_tidy.groupby('Drug_Name').apply(temp_mean)

In [91]:
# now let's fill values

df_tidy['Temperature'].fillna(df_tidy['temp_avg'],inplace = True)

In [99]:
df_tidy.drop(columns = 'Drug_Name',inplace=True)

In [104]:
df_tidy = df_tidy.reset_index()

In [105]:
df_tidy

Unnamed: 0,Drug_Name,level_1,Date,Time,Pressure,Temperature,temp_avg
0,diltiazem hydrochloride,0,15-10-2020,10:30:00,18.0,20.0,24.85
1,diltiazem hydrochloride,1,15-10-2020,11:30:00,19.0,20.0,24.85
2,diltiazem hydrochloride,2,15-10-2020,12:30:00,20.0,21.0,24.85
3,diltiazem hydrochloride,3,15-10-2020,1:30:00,12.0,23.0,24.85
4,diltiazem hydrochloride,4,15-10-2020,2:30:00,13.0,22.0,24.85
...,...,...,...,...,...,...,...
103,ketamine hydrochloride,103,17-10-2020,5:30:00,11.0,17.0,17.71
104,ketamine hydrochloride,104,17-10-2020,6:30:00,12.0,18.0,17.71
105,ketamine hydrochloride,105,17-10-2020,7:30:00,12.0,19.0,17.71
106,ketamine hydrochloride,106,17-10-2020,8:30:00,11.0,20.0,17.71


In [93]:
# same process with pressure 

In [108]:
def press_avg(x):
    x['press_avg'] = x['Pressure'].mean().round(2)
    return x

df_tidy = df_tidy.groupby('Drug_Name').apply(press_avg)

In [118]:
df_tidy = df_tidy.reset_index()

In [124]:
df_tidy['Pressure'].fillna(df_tidy['press_avg'],inplace= True)

In [128]:
df_tidy.drop(columns = ['temp_avg','press_avg'],inplace = True)

In [129]:
df_tidy

Unnamed: 0,Drug_Name,Date,Time,Pressure,Temperature
0,diltiazem hydrochloride,15-10-2020,10:30:00,18.0,20.0
1,diltiazem hydrochloride,15-10-2020,11:30:00,19.0,20.0
2,diltiazem hydrochloride,15-10-2020,12:30:00,20.0,21.0
3,diltiazem hydrochloride,15-10-2020,1:30:00,12.0,23.0
4,diltiazem hydrochloride,15-10-2020,2:30:00,13.0,22.0
...,...,...,...,...,...
103,ketamine hydrochloride,17-10-2020,5:30:00,11.0,17.0
104,ketamine hydrochloride,17-10-2020,6:30:00,12.0,18.0
105,ketamine hydrochloride,17-10-2020,7:30:00,12.0,19.0
106,ketamine hydrochloride,17-10-2020,8:30:00,11.0,20.0


In [130]:
# Now we will learn how to convert continuous data to discrete data

In [132]:
# we will create bins for temparature in 4 categories'
# low
# medium
# high
# very high

In [133]:
df_tidy['Temperature'].max()

58.0

In [134]:
df_tidy['Temperature'].min()

8.0

In [137]:
# 8-21 (low)
# 21-33 (medium)
# 33-45 (high)
# 45-59 (very high)

In [138]:
temp_points = [8,21,33,45,59] # taking 5 points means 4 bins to be created

temp_labels = ['low','medium','high','very high']

In [139]:
df_tidy['temp_catg'] = pd.cut(df_tidy['Temperature'],
                             bins = temp_points,
                             labels = temp_labels)

In [142]:
df_tidy['temp_catg'].value_counts()

temp_catg
low          56
medium       29
high         11
very high    11
Name: count, dtype: int64

In [143]:
df_tidy

Unnamed: 0,Drug_Name,Date,Time,Pressure,Temperature,temp_catg
0,diltiazem hydrochloride,15-10-2020,10:30:00,18.0,20.0,low
1,diltiazem hydrochloride,15-10-2020,11:30:00,19.0,20.0,low
2,diltiazem hydrochloride,15-10-2020,12:30:00,20.0,21.0,low
3,diltiazem hydrochloride,15-10-2020,1:30:00,12.0,23.0,medium
4,diltiazem hydrochloride,15-10-2020,2:30:00,13.0,22.0,medium
...,...,...,...,...,...,...
103,ketamine hydrochloride,17-10-2020,5:30:00,11.0,17.0,low
104,ketamine hydrochloride,17-10-2020,6:30:00,12.0,18.0,low
105,ketamine hydrochloride,17-10-2020,7:30:00,12.0,19.0,low
106,ketamine hydrochloride,17-10-2020,8:30:00,11.0,20.0,low


## String method

In [144]:
# same like SQL like %


In [145]:
df_tidy.loc[df_tidy['Drug_Name'].str.contains('hydrochloride')]

Unnamed: 0,Drug_Name,Date,Time,Pressure,Temperature,temp_catg
0,diltiazem hydrochloride,15-10-2020,10:30:00,18.0,20.0,low
1,diltiazem hydrochloride,15-10-2020,11:30:00,19.0,20.0,low
2,diltiazem hydrochloride,15-10-2020,12:30:00,20.0,21.0,low
3,diltiazem hydrochloride,15-10-2020,1:30:00,12.0,23.0,medium
4,diltiazem hydrochloride,15-10-2020,2:30:00,13.0,22.0,medium
...,...,...,...,...,...,...
103,ketamine hydrochloride,17-10-2020,5:30:00,11.0,17.0,low
104,ketamine hydrochloride,17-10-2020,6:30:00,12.0,18.0,low
105,ketamine hydrochloride,17-10-2020,7:30:00,12.0,19.0,low
106,ketamine hydrochloride,17-10-2020,8:30:00,11.0,20.0,low


In [146]:
# extract only years from date

In [147]:
df_tidy['Date'].str.split('-').apply(lambda x : x[2])

0      2020
1      2020
2      2020
3      2020
4      2020
       ... 
103    2020
104    2020
105    2020
106    2020
107    2020
Name: Date, Length: 108, dtype: object

In [148]:
# let's create a new column timestamp

In [150]:
df_tidy['timestamp'] = df_tidy['Date']+' '+df_tidy['Time']

In [152]:
df_tidy['timestamp'] = pd.to_datetime(df_tidy['timestamp'])

  df_tidy['timestamp'] = pd.to_datetime(df_tidy['timestamp'])


In [153]:
df_tidy

Unnamed: 0,Drug_Name,Date,Time,Pressure,Temperature,temp_catg,timestamp
0,diltiazem hydrochloride,15-10-2020,10:30:00,18.0,20.0,low,2020-10-15 10:30:00
1,diltiazem hydrochloride,15-10-2020,11:30:00,19.0,20.0,low,2020-10-15 11:30:00
2,diltiazem hydrochloride,15-10-2020,12:30:00,20.0,21.0,low,2020-10-15 12:30:00
3,diltiazem hydrochloride,15-10-2020,1:30:00,12.0,23.0,medium,2020-10-15 01:30:00
4,diltiazem hydrochloride,15-10-2020,2:30:00,13.0,22.0,medium,2020-10-15 02:30:00
...,...,...,...,...,...,...,...
103,ketamine hydrochloride,17-10-2020,5:30:00,11.0,17.0,low,2020-10-17 05:30:00
104,ketamine hydrochloride,17-10-2020,6:30:00,12.0,18.0,low,2020-10-17 06:30:00
105,ketamine hydrochloride,17-10-2020,7:30:00,12.0,19.0,low,2020-10-17 07:30:00
106,ketamine hydrochloride,17-10-2020,8:30:00,11.0,20.0,low,2020-10-17 08:30:00


In [154]:
ts= df_tidy['timestamp'][0]

In [155]:
ts

Timestamp('2020-10-15 10:30:00')

In [156]:
# no longer a string

In [157]:
# now if you want to extract month, year , day, hours etc. you don't have to write long codes.
# Pandas gives us some time stamp attributes

In [158]:
ts.month

10

In [159]:
ts.year

2020

In [160]:
ts.day

15

In [161]:
ts.hour

10

In [162]:
ts.minute

30

In [163]:
ts.second

0

In [166]:
# For entire column
# we can use dt instead of str in timestamp dtype

In [168]:
df_tidy['timestamp'].dt.year

0      2020
1      2020
2      2020
3      2020
4      2020
       ... 
103    2020
104    2020
105    2020
106    2020
107    2020
Name: timestamp, Length: 108, dtype: int32

In [169]:
# Reversing time

In [171]:
df_tidy['timestamp'][0].strftime('%y-%m-%d')

'20-10-15'

In [172]:
# END OF PANDAS
# HOW TO WRITE A FILE FOR OTHER USERS 

df_tidy.to_csv('drug_clean_data.csv',sep=',')

# Assignments

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Given a function ‘is_null’ with the following implementation.

df2 = pd.DataFrame([[np.nan,2,np.nan,0],
                   [3,4,np.nan,1],
                   [np.nan,3,np.nan,4]],
                   columns = ['A','B','C','D'])

In [4]:
df2

Unnamed: 0,A,B,C,D
0,,2,,0
1,3.0,4,,1
2,,3,,4


In [5]:
def is_null(x):
    return sum(x.isnull())

In [6]:
df2.apply(is_null,axis = 1)

0    2
1    1
2    2
dtype: int64

In [7]:
df2.isna().sum().sum()

5

In [8]:
df2.loc[[2]].isna().sum().sum()

2

In [9]:
# As an educational institute, you need to keep a track of all the registered students. Here you’re given the registration IDs and the corresponding dates of a batch of students. You need to return a DataFrame containing the columns as follows:

In [10]:
df = pd.DataFrame({'RID':[156,92,29,93,55,32],
                  'RDATE':['2021-01-01','2021-02-12','2021-04-16','2021-01-22','2021-01-15','2021-02-26']})

In [12]:
df['RDATE'] = pd.to_datetime(df['RDATE'])

In [14]:
df['RMONTH']= df['RDATE'].dt.month
df['RYEAR']= df['RDATE'].dt.year
df['RDAY']= df['RDATE'].dt.day

In [15]:
df

Unnamed: 0,RID,RDATE,RMONTH,RYEAR,RDAY
0,156,2021-01-01,1,2021,1
1,92,2021-02-12,2,2021,12
2,29,2021-04-16,4,2021,16
3,93,2021-01-22,1,2021,22
4,55,2021-01-15,1,2021,15
5,32,2021-02-26,2,2021,26


In [17]:
df = pd.DataFrame({'Date':["2015-12-06", "2011-12-27", "2015-09-07", "2012-12-21", "2020-02-13", "2015-06-09"],
                   'RID':[498, 721, 375, 464, 813, 853],
                   'Phy':[22, 45, 1, 65, 22, 17],
                   'Chem':[52, 56, 32, 50, 24, 61],
                   'Math':[63, 37, 68, 62, 43 ,42]})

In [24]:
df['Date'] = pd.to_datetime(df['Date'])

In [30]:
df['month'] = df['Date'].dt.month

In [35]:

# Find the month with the maximum registrations
max_month = df['Month'].value_counts().idxmax()

# Filter the DataFrame to include only the rows for the month with maximum registrations
max_month_df = df[df['Month'] == max_month]

# Calculate the average marks for each subject
avg_marks = max_month_df[['Phy', 'Chem', 'Math']].mean()

In [36]:
print("Month with Maximum Registrations:", max_month)
print("Average Marks in Physics:", avg_marks['Phy'])
print("Average Marks in Chemistry:", avg_marks['Chem'])
print("Average Marks in Mathematics:", avg_marks['Math'])

Month with Maximum Registrations: 2015-12
Average Marks in Physics: 22.0
Average Marks in Chemistry: 52.0
Average Marks in Mathematics: 63.0


In [37]:
# next

In [54]:
# Given the data frame df as input, do the following steps for preprocessing:

# 1. Remove the row if all the columns have missing values.

# 2. Replace the missing values of “Roll_ID” column with 0 and “Name” column with “Anonymous”

# 3. Replace the missing values in “Marks” column with the median value of the column

# 4. Change the numerical columns (Roll_ID and Marks) to int datatype in the output

In [73]:
df = pd.DataFrame({'Roll_id':[412,np.nan,456,np.nan,434,429,418],
                  'Name':['John','Mitra','Ritz',np.nan,'Anny','Hema',np.nan],
                  'Marks':[np.nan,32,25,np.nan,35,28,38]})

In [74]:
# Step 1: Remove rows with all missing values
df.dropna(how='all', inplace=True)

# Step 2: Replace missing values in 'Roll_ID' and 'Name' columns
df['Roll_id'].fillna(0, inplace=True)
df['Name'].fillna('Anonymous', inplace=True)

# Step 3: Replace missing values in 'Marks' column with the median
median_marks = df['Marks'].median()
df['Marks'].fillna(median_marks, inplace=True)



In [75]:
df

Unnamed: 0,Roll_id,Name,Marks
0,412.0,John,32.0
1,0.0,Mitra,32.0
2,456.0,Ritz,25.0
4,434.0,Anny,35.0
5,429.0,Hema,28.0
6,418.0,Anonymous,38.0


In [None]:
# Step 4: Change numerical columns to int datatype
df['Roll_ID'] = df['Roll_ID'].astype(int)
df['Marks'] = df['Marks'].astype(int)
