# [Introduction to Pandas 10-day Bootcamp by DPhi](https://dphi.tech/bootcamps/introduction-to-pandas?utm_source=header)
by [CSpanias](https://github.com/CSpanias), 02/2022

# Content

1. [Series & DataFrame](#seriesDf)
1. [Read & Write Files](#readWrite)
1. [Indexing, Selecting & Assigning](#index)
1. [Summary & Aggregation Functions](#sum)
1. [Sorting & Renaming](#sortRename)
1. [Missing Data](#nans)
1. [Extras](#extras)

<a name="seriesDf"></a>
# 1. Series & DataFrame

In [1]:
import pandas as pd

# create list
my_list = [1, 2, 34, 53, 65]

# check dtype
print(f"my_list is of type {type(my_list)}\n")

# convert Series
my_series = pd.Series(my_list)

# check dtype
print(f"my_series is of type {type(my_series)}")
my_series

my_list is of type <class 'list'>

my_series is of type <class 'pandas.core.series.Series'>


0     1
1     2
2    34
3    53
4    65
dtype: int64

In [2]:
# create dictionary
my_dict = {"name": "Charalampos",
           "surname": "Xman",
           "age": 30,
           "occupation": "student"}

# check dtype
print(f"my_dict is of type {type(my_dict)}\n")


# convert Series
my_series = pd.Series(my_dict)

# check dtype
print(f"my_series is of type {type(my_series)}")
my_series

my_dict is of type <class 'dict'>

my_series is of type <class 'pandas.core.series.Series'>


name          Charalampos
surname              Xman
age                    30
occupation        student
dtype: object

In [3]:
# create dictionary
my_list = [
    ['Charalampos', 'student', 30],
    ['Mike', 'scientist', 25],
    ['Jenny', 'professor', 35]
]

# check dtype
print(f"my_list is of type {type(my_list)}\n")

# create DataFrame
my_df = pd.DataFrame(my_list)

# check dtype
print(f"my_series is of type {type(my_df)}")
my_df

my_list is of type <class 'list'>

my_series is of type <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0,1,2
0,Charalampos,student,30
1,Mike,scientist,25
2,Jenny,professor,35


In [4]:
import numpy as np

# create array
my_array = np.array([
    ['Charalampos', 'student', 30],
    ['Mike', 'scientist', 25],
    ['Jenny', 'professor', 35]
])

# check dtype
print(f"my_array is of type {type(my_array)}\n")

# create DataFrame
my_df = pd.DataFrame(my_array)

# check dtype
print(f"my_series is of type {type(my_df)}")
my_df

my_array is of type <class 'numpy.ndarray'>

my_series is of type <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0,1,2
0,Charalampos,student,30
1,Mike,scientist,25
2,Jenny,professor,35


In [30]:
# create df with custom col names and indices
my_df = pd.DataFrame(my_array,
                    columns=['Name', 'Occupation', 'Age'],
                    index = [1, 2, 3])
my_df

Unnamed: 0,Name,Occupation,Age
1,Charalampos,student,30
2,Mike,scientist,25
3,Jenny,professor,35


<a name="readWrite"></a>
# 3. Read & Write Files

In [5]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/exam_scores.csv")
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group B,bachelor's degree,standard,none,74,68,67
1,female,group C,some college,standard,completed,58,68,66
2,male,group C,some college,free/reduced,none,66,65,65
3,female,group D,bachelor's degree,free/reduced,none,74,75,73
4,male,group D,some college,standard,none,78,77,71
...,...,...,...,...,...,...,...,...
995,female,group C,some high school,standard,none,68,77,72
996,female,group E,some college,standard,none,98,81,94
997,female,group E,associate's degree,free/reduced,none,67,67,67
998,female,group C,high school,standard,none,63,68,70


In [6]:
# check working directory
%pwd

'C:\\Users\\10inm\\Codecademy\\dphi\\dphi_pandas'

In [7]:
# check shape
df.shape

(1000, 8)

In [8]:
# check first 5 rows
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group B,bachelor's degree,standard,none,74,68,67
1,female,group C,some college,standard,completed,58,68,66
2,male,group C,some college,free/reduced,none,66,65,65
3,female,group D,bachelor's degree,free/reduced,none,74,75,73
4,male,group D,some college,standard,none,78,77,71


In [9]:
# check last 5 rows
df.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group C,some high school,standard,none,68,77,72
996,female,group E,some college,standard,none,98,81,94
997,female,group E,associate's degree,free/reduced,none,67,67,67
998,female,group C,high school,standard,none,63,68,70
999,male,group C,some college,free/reduced,none,49,57,50


In [10]:
# check dtypes
df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [11]:
# check basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


<a name="index"></a>
# 4. Indexing, Selecting & Assigning

## Indexing

### Index based selection
`df.iloc[:, :]`

In [12]:
# slice first row of the first column
df.iloc[0, 0]

'male'

In [13]:
# slice first 5 rows of the 5th column
df.iloc[0:5, 4]

0         none
1    completed
2         none
3         none
4         none
Name: test preparation course, dtype: object

In [14]:
# slice everything!
df.iloc[:, :]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group B,bachelor's degree,standard,none,74,68,67
1,female,group C,some college,standard,completed,58,68,66
2,male,group C,some college,free/reduced,none,66,65,65
3,female,group D,bachelor's degree,free/reduced,none,74,75,73
4,male,group D,some college,standard,none,78,77,71
...,...,...,...,...,...,...,...,...
995,female,group C,some high school,standard,none,68,77,72
996,female,group E,some college,standard,none,98,81,94
997,female,group E,associate's degree,free/reduced,none,67,67,67
998,female,group C,high school,standard,none,63,68,70


In [15]:
# slice first 4 rows of the 3rd column
df.iloc[[0, 1, 2, 3], 2]

0    bachelor's degree
1         some college
2         some college
3    bachelor's degree
Name: parental level of education, dtype: object

In [16]:
# slice all rows from last column
df.iloc[:, -1]

0      67
1      66
2      65
3      73
4      71
       ..
995    72
996    94
997    67
998    70
999    50
Name: writing score, Length: 1000, dtype: int64

### Label based selection
`df.loc[:, :]`

In [38]:
# slice all rows from last col
df.loc[:, 'writing score']

0      67
1      66
2      65
3      73
4      71
       ..
995    72
996    94
997    67
998    70
999    50
Name: writing score, Length: 1000, dtype: int64

In [39]:
# select first 5 rows from gender, lunch, math score
df.loc[0:5, ['gender', 'lunch', 'math score']]

Unnamed: 0,gender,lunch,math score
0,male,standard,74
1,female,standard,58
2,male,free/reduced,66
3,female,free/reduced,74
4,male,standard,78
5,female,standard,75


## Selecting

### Attribute (Dot) Based Selection
`df.column_name`

In [41]:
# select gender col
df.gender

0        male
1      female
2        male
3      female
4        male
        ...  
995    female
996    female
997    female
998    female
999      male
Name: gender, Length: 1000, dtype: object

### Dictionary (Bracket) Based Selection
`df['col_name']`

In [42]:
# select gender
df['gender']

0        male
1      female
2        male
3      female
4        male
        ...  
995    female
996    female
997    female
998    female
999      male
Name: gender, Length: 1000, dtype: object

### Conditional Selection

In [48]:
# select rows with math & writing score above 95
df[(df['math score'] > 95) & (df['writing score'] > 95) ]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
91,male,group D,associate's degree,standard,none,100,94,96
495,male,group C,master's degree,standard,completed,100,96,100
520,female,group C,associate's degree,standard,completed,97,97,100
565,male,group E,bachelor's degree,standard,completed,100,100,100
744,female,group E,associate's degree,free/reduced,completed,98,100,100
767,female,group D,bachelor's degree,standard,completed,96,100,100
776,male,group E,associate's degree,standard,completed,100,100,100
895,female,group E,master's degree,standard,none,97,100,100
919,female,group E,associate's degree,standard,completed,96,100,100


## Assigning

In [49]:
# add new col & fill all values with DPhi
df['provider'] = 'DPhi'
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,provider
0,male,group B,bachelor's degree,standard,none,74,68,67,DPhi
1,female,group C,some college,standard,completed,58,68,66,DPhi
2,male,group C,some college,free/reduced,none,66,65,65,DPhi
3,female,group D,bachelor's degree,free/reduced,none,74,75,73,DPhi
4,male,group D,some college,standard,none,78,77,71,DPhi
...,...,...,...,...,...,...,...,...,...
995,female,group C,some high school,standard,none,68,77,72,DPhi
996,female,group E,some college,standard,none,98,81,94,DPhi
997,female,group E,associate's degree,free/reduced,none,67,67,67,DPhi
998,female,group C,high school,standard,none,63,68,70,DPhi


<a name="sum"></a>
# 5. Summary & Aggregation Functions

In [17]:
# check basic stats
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,67.128,70.174,68.973
std,14.815367,14.85599,15.109155
min,15.0,18.0,10.0
25%,58.0,60.0,59.0
50%,67.0,70.0,69.0
75%,78.0,81.0,80.0
max,100.0,100.0,100.0


In [18]:
# check basic stats of categorical cols
df.describe(include='object')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
count,1000,1000,1000,1000,1000
unique,2,5,6,2,2
top,female,group C,some college,standard,none
freq,502,294,226,649,654


In [19]:
# check basic stats of both categorical & numerical cols
df.describe(include='all')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
count,1000,1000,1000,1000,1000,1000.0,1000.0,1000.0
unique,2,5,6,2,2,,,
top,female,group C,some college,standard,none,,,
freq,502,294,226,649,654,,,
mean,,,,,,67.128,70.174,68.973
std,,,,,,14.815367,14.85599,15.109155
min,,,,,,15.0,18.0,10.0
25%,,,,,,58.0,60.0,59.0
50%,,,,,,67.0,70.0,69.0
75%,,,,,,78.0,81.0,80.0


In [20]:
# calculate mean of scores
df['math score'].mean()

67.128

In [21]:
# check unique values of education
df['parental level of education'].unique()

array(["bachelor's degree", 'some college', "associate's degree",
       'some high school', "master's degree", 'high school'], dtype=object)

In [22]:
# check number of unique values
df['parental level of education'].value_counts()

some college          226
associate's degree    197
high school           190
some high school      181
bachelor's degree     130
master's degree        76
Name: parental level of education, dtype: int64

<a name="sortRename"></a>
# 6. Sorting & Renaming

Pandas provides a method called `sort_values()` which returns the sorted result in value order. 

In [24]:
# sort by match score
df.sort_values(by = 'math score', ascending=False)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
91,male,group D,associate's degree,standard,none,100,94,96
776,male,group E,associate's degree,standard,completed,100,100,100
588,male,group D,some college,standard,completed,100,85,91
128,male,group A,associate's degree,standard,completed,100,97,94
565,male,group E,bachelor's degree,standard,completed,100,100,100
...,...,...,...,...,...,...,...,...
349,male,group B,some high school,free/reduced,none,25,31,30
739,female,group E,some high school,free/reduced,none,25,35,38
891,female,group C,high school,free/reduced,none,23,31,27
854,male,group C,high school,free/reduced,none,18,30,18


Pandas provides a function `rename()` to rename column/indexes in a dataframe.

In [26]:
df.rename(columns={'math score': "math_score",
          'reading score': 'reading_score',
          'writing score': 'writing_score'}, inplace=True)
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math_score,reading_score,writing_score
0,male,group B,bachelor's degree,standard,none,74,68,67
1,female,group C,some college,standard,completed,58,68,66
2,male,group C,some college,free/reduced,none,66,65,65
3,female,group D,bachelor's degree,free/reduced,none,74,75,73
4,male,group D,some college,standard,none,78,77,71
...,...,...,...,...,...,...,...,...
995,female,group C,some high school,standard,none,68,77,72
996,female,group E,some college,standard,none,98,81,94
997,female,group E,associate's degree,free/reduced,none,67,67,67
998,female,group C,high school,standard,none,63,68,70


<a name="nans"></a>
# 7. Missing Data

`df.isna()` or `df.isnull()` returns the dataframe with boolean values indicating missing values.

In [27]:
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [28]:
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

Pandas also provides `fillna()` method to fill the missing values. `fillna() `provides many different strategy to fill the missing values.

In [32]:
my_df['Country'] = ['GR', np.nan, 'IT']

In [40]:
my_df = my_df.append({'Name': 'Maria',
             'Occupation': np.nan,
             'Age': np.nan,
             'Country': 'SYR'}, ignore_index=True)

In [43]:
my_df.Age = pd.to_numeric(my_df.Age)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        4 non-null      object 
 1   Occupation  3 non-null      object 
 2   Age         3 non-null      float64
 3   Country     3 non-null      object 
dtypes: float64(1), object(3)
memory usage: 256.0+ bytes


In [46]:
my_df.Age.fillna(my_df.Age.mean(), inplace=True)
my_df

Unnamed: 0,Name,Occupation,Age,Country
0,Charalampos,student,30.0,GR
1,Mike,scientist,25.0,
2,Jenny,professor,35.0,IT
3,Maria,,30.0,SYR


In [49]:
my_df.Occupation.fillna('Unknown', inplace=True)
my_df

Unnamed: 0,Name,Occupation,Age,Country
0,Charalampos,student,30.0,GR
1,Mike,scientist,25.0,
2,Jenny,professor,35.0,IT
3,Maria,Unknown,30.0,SYR


<a name="extras"></a>
# 8. Extras

## Quiz 2

In [50]:
import pandas as pd

# load data
sma_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Standard_Metropolitan_Areas_Data-data.csv')
sma_data.head()

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,work_force,income,region,crime_rate
0,1384,78.1,12.3,25627,69678,50.1,4083.9,72100,1,75.55
1,3719,43.9,9.4,13326,43292,53.9,3305.9,54542,2,56.03
2,3553,37.4,10.7,9724,33731,50.6,2066.3,33216,1,41.32
3,3916,29.9,8.8,6402,24167,52.2,1966.7,32906,2,67.38
4,2480,31.5,10.5,8502,16751,66.1,1514.5,26573,4,80.19


In [51]:
# check shape
sma_data.shape

(99, 10)

In [52]:
# check dtypes
sma_data.dtypes

land_area           int64
percent_city      float64
percent_senior    float64
physicians          int64
hospital_beds       int64
graduates         float64
work_force        float64
income              int64
region              int64
crime_rate        float64
dtype: object

## Quiz 3

In [53]:
# select 10th observation
sma_data.iloc[9, 9]

55.3

In [59]:
# index of physicians
sma_data.columns.get_loc('physicians')

3

In [54]:
# select last value from physicians'column
sma_data.iloc[-1, 3]

140

elect the records with index labels (numerical positions) - 1, 3, 5, 7, 9 and 13 and columns - 'land_area', 'work_force', 'income', 'region' and 'crime_rate'. Assign the result to a variable sample_data1. Select the correct statement about this sample_data1. [Hint: Use label-based selection technique]

In [72]:
# select specific value and cols
subset = sma_data[['land_area', 'work_force', 'income', 'region', 'crime_rate']]
sample_data1 = subset.iloc[[-1, 3, 5, 7, 9, 13], :]
sample_data1

Unnamed: 0,land_area,work_force,income,region,crime_rate
98,654,66.9,1148,3,68.76
3,3916,1966.7,32906,2,67.38
5,2815,1541.9,25663,3,58.48
7,6794,1272.7,18221,3,64.88
9,4647,1032.2,14542,2,55.3
13,782,915.2,12591,4,63.2


In [71]:
# region == 2
sample_data2 = sma_data[sma_data['region'] == 2]
print(sample_data2.shape)
len(sample_data2)

(25, 10)


25

## Quiz 4

In [73]:
# check mean
sma_data.describe()

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,work_force,income,region,crime_rate
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,2615.727273,42.518182,9.781818,1828.333333,6345.868687,54.463636,449.366667,6762.505051,2.494949,55.64303
std,3045.82621,17.348277,2.524547,3192.199763,9136.202716,7.773286,610.990885,10393.34966,1.013921,13.470943
min,47.0,13.4,3.9,140.0,481.0,30.3,66.9,769.0,1.0,23.32
25%,1408.0,30.1,8.35,459.0,2390.0,50.25,150.3,2003.0,2.0,46.115
50%,1951.0,39.5,9.7,774.0,3472.0,54.0,257.2,3510.0,3.0,56.06
75%,2890.5,52.6,10.75,1911.5,6386.5,58.3,436.5,6283.5,3.0,63.86
max,27293.0,100.0,21.8,25627.0,69678.0,72.8,4083.9,72100.0,4.0,85.62


In [74]:
# check region
sma_data.region.unique()

array([1, 2, 4, 3], dtype=int64)

In [81]:
# region == 3
sample_data1 = sma_data[sma_data.region == 3]
print(sample_data1.shape)
print(sample_data1.crime_rate.mean())
print(sample_data1.graduates.isnull().sum())

(36, 10)
58.265555555555565
0


In [86]:
sma_data.region.value_counts()

3    36
2    25
1    21
4    17
Name: region, dtype: int64

## Quiz 5

In [89]:
sma_data.sort_values(by = 'crime_rate', ascending=False)

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,work_force,income,region,crime_rate
20,9155,53.8,11.1,2280,6450,60.1,575.2,7766,4,85.62
74,1412,39.2,11.3,436,1837,49.4,154.2,2098,4,82.68
53,5966,39.5,9.6,737,1907,52.7,246.6,3007,4,80.94
4,2480,31.5,10.5,8502,16751,66.1,1514.5,26573,4,80.19
67,8152,22.3,9.1,405,1254,51.7,165.6,2257,4,78.10
...,...,...,...,...,...,...,...,...,...,...
8,3049,19.5,12.1,4005,21149,53.4,967.5,15826,1,30.51
72,2658,39.0,12.2,365,5430,49.9,136.9,1862,1,28.52
76,862,26.3,13.4,423,1929,43.3,145.5,2010,1,25.49
38,1951,28.4,14.5,696,4843,47.9,271.5,3667,1,23.64


# Graded Quiz

In [102]:
import pandas as pd 

df = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Chronic%20Kidney%20Disease%20(CKD)%20Dataset/ChronicKidneyDisease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [103]:
df.dtypes

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [106]:
df.describe(include='all')

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
count,400.0,391.0,388.0,353.0,354.0,351.0,248,335,396,396,...,330.0,295.0,270.0,398,398,398,399,399,399,400
unique,,,,,,,2,2,2,2,...,44.0,92.0,49.0,2,5,3,2,2,2,3
top,,,,,,,normal,normal,notpresent,notpresent,...,41.0,9800.0,5.2,no,no,no,good,no,no,ckd
freq,,,,,,,201,259,354,374,...,21.0,11.0,18.0,251,258,362,317,323,339,248
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,,,,,...,,,,,,,,,,
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,,,,,...,,,,,,,,,,
min,0.0,2.0,50.0,1.005,0.0,0.0,,,,,...,,,,,,,,,,
25%,99.75,42.0,70.0,1.01,0.0,0.0,,,,,...,,,,,,,,,,
50%,199.5,55.0,80.0,1.02,0.0,0.0,,,,,...,,,,,,,,,,
75%,299.25,64.5,80.0,1.02,2.0,0.0,,,,,...,,,,,,,,,,


In [109]:
df.loc[247:254,['id', 'age', 'classification']]

Unnamed: 0,id,age,classification
247,247,54.0,ckd
248,248,59.0,ckd
249,249,56.0,ckd
250,250,40.0,notckd
251,251,23.0,notckd
252,252,45.0,notckd
253,253,57.0,notckd
254,254,51.0,notckd


In [111]:
df_a = df.rename(columns={
    'id': 'Id',
    'age': 'Age',
    'bp': 'blood_pressure',   
})
df_a

Unnamed: 0,Id,Age,blood_pressure,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [113]:
df.bp.mean()

76.46907216494846

In [114]:
df.bp.std()

13.683637493525255

In [115]:
df.bp.fillna(df.bp.median(), inplace=True)
print(df.bp.mean())
print(df.bp.std())

76.575
13.489785423680392


In [116]:
df.cad.fillna(df.cad.mean(), inplace=True)

TypeError: can only concatenate str (not "int") to str