In [4]:
import numpy as np
import pandas as pd

from numpy import nan as NA

data = pd.Series([1,NA,3.5,NA,7])
print(data)

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64


In [7]:
data= data.dropna()
print(data)


0    1.0
2    3.5
4    7.0
dtype: float64


In [8]:
print(data[data.notnull()])

0    1.0
2    3.5
4    7.0
dtype: float64


In [9]:
data1 = pd.DataFrame([[1.,6.5,3.],[1.,NA,NA],
                     [NA,NA,NA],[NA,6.5,3]])
print(data1)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [10]:
cleaned = data1.dropna()
print(cleaned)

     0    1    2
0  1.0  6.5  3.0


# to drop only those rows which have all NA we use dropna(how=all)

In [11]:
cleaned1 =  data1.dropna(how='all')
print(cleaned1)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0


# to drop columns we can use axis = 1 which indicates column 

In [13]:
cleaned2 = data1.dropna(axis=1,how='all')
print(cleaned2)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


# to keep only those rows which have values we can use thresh attribute in dropna()

# to fill the missing data using fillna()

In [15]:
print(data1)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [16]:
data1.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


# MINI PROJECT ON HANDLING MISSING DATA

In [138]:
import pandas as pd
dataset = pd.read_csv('pima-indians-diabetes.csv', header = None)
print(dataset.head(20))

     0    1   2   3    4     5      6   7  8
0    6  148  72  35    0  33.6  0.627  50  1
1    1   85  66  29    0  26.6  0.351  31  0
2    8  183  64   0    0  23.3  0.672  32  1
3    1   89  66  23   94  28.1  0.167  21  0
4    0  137  40  35  168  43.1  2.288  33  1
5    5  116  74   0    0  25.6  0.201  30  0
6    3   78  50  32   88  31.0  0.248  26  1
7   10  115   0   0    0  35.3  0.134  29  0
8    2  197  70  45  543  30.5  0.158  53  1
9    8  125  96   0    0   0.0  0.232  54  1
10   4  110  92   0    0  37.6  0.191  30  0
11  10  168  74   0    0  38.0  0.537  34  1
12  10  139  80   0    0  27.1  1.441  57  0
13   1  189  60  23  846  30.1  0.398  59  1
14   5  166  72  19  175  25.8  0.587  51  1
15   7  100   0   0    0  30.0  0.484  32  1
16   0  118  84  47  230  45.8  0.551  31  1
17   7  107  74   0    0  29.6  0.254  31  1
18   1  103  30  38   83  43.3  0.183  33  0
19   1  115  70  30   96  34.6  0.529  32  1


In [139]:
"""columns fields are /n
0. Number of times pregnant.
1. Plasma glucose concentration a 2 hours in an oral glucose tolerance test.
2. Diastolic blood pressure (mm Hg).
3. Triceps skinfold thickness (mm).
4. 2-Hour serum insulin (mu U/ml).
5. Body mass index (weight in kg/(height in m)^2).
6. Diabetes pedigree function.
7. Age (years).
8. Class variable (0 or 1))"""

'columns fields are /n\n0. Number of times pregnant.\n1. Plasma glucose concentration a 2 hours in an oral glucose tolerance test.\n2. Diastolic blood pressure (mm Hg).\n3. Triceps skinfold thickness (mm).\n4. 2-Hour serum insulin (mu U/ml).\n5. Body mass index (weight in kg/(height in m)^2).\n6. Diabetes pedigree function.\n7. Age (years).\n8. Class variable (0 or 1))'

In [140]:
print((dataset[[1,2,3,4,5]]== 0).sum())

1      5
2     35
3    227
4    374
5     11
dtype: int64


In [141]:
#we basically mark the missing values as NaN so that these values will be
#ignored when performing any arithmatic operations

In [142]:
import numpy as np
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace('0','NA')


In [143]:
# the above output mentions that number of missing values are same as number 
# of zeros in the earlier output. This shows that we marked the identified missing 
#values correctly


In [144]:
print(dataset.head(20))

     0    1   2   3    4     5      6   7  8
0    6  148  72  35    0  33.6  0.627  50  1
1    1   85  66  29    0  26.6  0.351  31  0
2    8  183  64   0    0  23.3  0.672  32  1
3    1   89  66  23   94  28.1  0.167  21  0
4    0  137  40  35  168  43.1  2.288  33  1
5    5  116  74   0    0  25.6  0.201  30  0
6    3   78  50  32   88  31.0  0.248  26  1
7   10  115   0   0    0  35.3  0.134  29  0
8    2  197  70  45  543  30.5  0.158  53  1
9    8  125  96   0    0   0.0  0.232  54  1
10   4  110  92   0    0  37.6  0.191  30  0
11  10  168  74   0    0  38.0  0.537  34  1
12  10  139  80   0    0  27.1  1.441  57  0
13   1  189  60  23  846  30.1  0.398  59  1
14   5  166  72  19  175  25.8  0.587  51  1
15   7  100   0   0    0  30.0  0.484  32  1
16   0  118  84  47  230  45.8  0.551  31  1
17   7  107  74   0    0  29.6  0.254  31  1
18   1  103  30  38   83  43.3  0.183  33  0
19   1  115  70  30   96  34.6  0.529  32  1


In [145]:
print(dataset.info())
dataset[8]=np.where(dataset[8] == 1, True, False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
0    768 non-null int64
1    768 non-null int64
2    768 non-null int64
3    768 non-null int64
4    768 non-null int64
5    768 non-null float64
6    768 non-null float64
7    768 non-null int64
8    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


In [146]:
print(dataset.head(3))

   0    1   2   3  4     5      6   7      8
0  6  148  72  35  0  33.6  0.627  50   True
1  1   85  66  29  0  26.6  0.351  31  False
2  8  183  64   0  0  23.3  0.672  32   True


In [147]:
dataset.to_csv('final_pima-indians-diabetes.csv')

In [148]:
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
0    768 non-null int64
1    768 non-null int64
2    768 non-null int64
3    768 non-null int64
4    768 non-null int64
5    768 non-null float64
6    768 non-null float64
7    768 non-null int64
8    768 non-null bool
dtypes: bool(1), float64(2), int64(6)
memory usage: 48.8 KB
None


In [151]:
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].fillna('Nan')


In [152]:
print(dataset.head(5))

   0    1   2   3    4     5      6   7      8
0  6  148  72  35    0  33.6  0.627  50   True
1  1   85  66  29    0  26.6  0.351  31  False
2  8  183  64   0    0  23.3  0.672  32   True
3  1   89  66  23   94  28.1  0.167  21  False
4  0  137  40  35  168  43.1  2.288  33   True
