# Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
np.nan

nan

In [3]:
pd.NA

<NA>

In [5]:
pd.NaT

NaT

Some important articles:

https://stackoverflow.com/questions/20320022/why-in-numpy-nan-nan-is-false-while-nan-in-nan-is-true

https://towardsdatascience.com/navigating-the-hell-of-nans-in-python-71b12558895b

In [6]:
# NaN does not compare equal to itself, so this evaluates to False.
np.nan == np.nan

False

In [7]:
# Membership still succeeds: `in` checks identity before equality,
# and the list holds the very same np.nan object.
np.nan in [np.nan]

True

In [8]:
# Both operands are the same np.nan object, so the identity check is True.
np.nan is np.nan

True

In [9]:
# Comparing pd.NA yields <NA> rather than True or False.
pd.NA == pd.NA

<NA>

## Data

In [10]:
df = pd.read_csv('movie_scores.csv')

In [11]:
df.head(3)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,


## Checking and Selecting for Null Values

In [12]:
df.isnull()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [14]:
# Number of missing values in each column.
df.isnull().sum()

first_name          1
last_name           1
age                 1
sex                 1
pre_movie_score     2
post_movie_score    2
dtype: int64

In [17]:
df.notnull()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,True,True,True,True,True,True
1,False,False,False,False,False,False
2,True,True,True,True,False,False
3,True,True,True,True,True,True
4,True,True,True,True,True,True


In [18]:
# Number of non-missing values in each column.
df.notnull().sum()

first_name          4
last_name           4
age                 4
sex                 4
pre_movie_score     3
post_movie_score    3
dtype: int64

In [21]:
df['last_name']

0      Hanks
1        NaN
2    Jackman
3    Winfrey
4      Stone
Name: last_name, dtype: object

In [22]:
# Keep only the rows whose last_name is present.
df[df['last_name'].notnull()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [23]:
# Rows where the pre-movie score is missing but sex is recorded.
df.loc[df['pre_movie_score'].isna() & df['sex'].notna()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
2,Hugh,Jackman,51.0,m,,


## Drop Data

In [24]:
# Uncomment the next line to show the full documentation for dropna.
#help(df.dropna)

In [26]:
# With default arguments, dropna() drops every row that contains
# at least one missing value (not only rows that are entirely NaN).
df.dropna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [29]:
# thresh=1 keeps only the rows that have at least one non-NA value,
# so only the row that is entirely NaN is dropped.
df.dropna(thresh=1)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [31]:
# axis=1 drops columns instead of rows. Every column contains at least
# one NaN, so no columns are left and only the index remains.
df.dropna(axis=1)

0
1
2
3
4


In [32]:
# Keep only the columns that have at least 4 non-NA values; the two
# score columns have only 3, so they are dropped.
df.dropna(thresh=4,axis=1)

Unnamed: 0,first_name,last_name,age,sex
0,Tom,Hanks,63.0,m
1,,,,
2,Hugh,Jackman,51.0,m
3,Oprah,Winfrey,66.0,f
4,Emma,Stone,31.0,f


## Fill Data

In [34]:
# Replace every missing value in the frame with the same placeholder.
# Note that this also writes the string into the numeric columns.
df.fillna("NEW VALUE!")

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!
2,Hugh,Jackman,51.0,m,NEW VALUE!,NEW VALUE!
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [36]:
df['last_name'].fillna("Empty")

0      Hanks
1      Empty
2    Jackman
3    Winfrey
4      Stone
Name: last_name, dtype: object

In [38]:
# fillna returns a new Series, so assign it back to keep the change in df.
df['last_name'] = df['last_name'].fillna("Empty")

In [39]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,Empty,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [40]:
df['post_movie_score'].mean()

9.0

In [41]:
# Fill the missing post-movie scores with the column mean.
# The result is only displayed; it is not assigned back to df.
df['post_movie_score'].fillna(df['post_movie_score'].mean())

0    10.0
1     9.0
2     9.0
3     8.0
4     9.0
Name: post_movie_score, dtype: float64

In [42]:
# Fill the gaps in each numeric column with that column's mean.
# numeric_only=True restricts the mean to numeric columns: without it,
# pandas warns about the string columns (first_name, last_name, sex) and
# pandas >= 2.0 raises a TypeError. The string columns are left untouched.
df.fillna(df.mean(numeric_only=True))

  df.fillna(df.mean())


Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,Empty,52.75,,7.0,9.0
2,Hugh,Jackman,51.0,m,7.0,9.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


## Filling with Interpolation

Be careful with this technique: make sure you really understand whether it is a valid choice for your data.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html


In [44]:
airline_tix = {'first':100,'business':np.nan,'economy-plus':50,'economy':30}

In [45]:
ser = pd.Series(airline_tix)

In [46]:
ser

first           100.0
business          NaN
economy-plus     50.0
economy          30.0
dtype: float64

In [47]:
# Fill the gap by interpolating between the neighbouring values; here
# 'business' becomes 75.0, halfway between 100.0 and 50.0.
ser.interpolate()

first           100.0
business         75.0
economy-plus     50.0
economy          30.0
dtype: float64

## Practice, Practice, and Practice