# Handling NAN values
* Either we drop those NA entries or we fill those with some data

In [18]:
import pandas as pd
import numpy as np

In [19]:
iris = pd.read_csv('iris.csv')
df =iris.copy()
df.columns = ['sl','sw','pl','pw','flower_type']
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [3]:
df.iloc[2:4,1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,,,0.2,Iris-setosa
3,5.0,,,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [4]:
df.describe()

Unnamed: 0,sl,sw,pl,pw
count,149.0,147.0,147.0,149.0
mean,5.848322,3.046939,3.806122,1.205369
std,0.828594,0.434048,1.750351,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### To drop NAN
* If very less columns has NAN value then we will drop it

In [5]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa


In [6]:
df.reset_index(drop=True,inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,5.4,3.9,1.7,0.4,Iris-setosa
3,4.6,3.4,1.4,0.3,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


### To fill some data

In [7]:
df =iris.copy()
df.columns = ['sl','sw','pl','pw','flower_type']
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [8]:
df.iloc[2:4,1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,,,0.2,Iris-setosa
3,5.0,,,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


#### Filling mean value

In [9]:
df['sw'].fillna(df['sw'].mean(),inplace=True)
df['pl'].fillna(df['pl'].mean(),inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.046939,3.806122,0.2,Iris-setosa
3,5.0,3.046939,3.806122,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


# Handling String values

In [15]:
df =iris.copy()
df.columns = ['sl','sw','pl','pw','flower_type']
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [16]:
df['gender'] = 'Female'
df.iloc[0:10,5]='Male'
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,gender
0,4.9,3.0,1.4,0.2,Iris-setosa,Male
1,4.7,3.2,1.3,0.2,Iris-setosa,Male
2,4.6,3.1,1.5,0.2,Iris-setosa,Male
3,5.0,3.6,1.4,0.2,Iris-setosa,Male
4,5.4,3.9,1.7,0.4,Iris-setosa,Male


#### This function will take each and every entry of gender column and apply function on that and each entry will return some ouput that we will store it in another column (will create a new column)

In [17]:
def f(s):
    if s == 'Male':
        return 0
    else:
        return 1
df['sex'] = df['gender'].apply(f)
df.drop('gender',axis=1,inplace=True)
df.head(20)

Unnamed: 0,sl,sw,pl,pw,flower_type,sex
0,4.9,3.0,1.4,0.2,Iris-setosa,0
1,4.7,3.2,1.3,0.2,Iris-setosa,0
2,4.6,3.1,1.5,0.2,Iris-setosa,0
3,5.0,3.6,1.4,0.2,Iris-setosa,0
4,5.4,3.9,1.7,0.4,Iris-setosa,0
5,4.6,3.4,1.4,0.3,Iris-setosa,0
6,5.0,3.4,1.5,0.2,Iris-setosa,0
7,4.4,2.9,1.4,0.2,Iris-setosa,0
8,4.9,3.1,1.5,0.1,Iris-setosa,0
9,5.4,3.7,1.5,0.2,Iris-setosa,0
