# Handling Null Values
Import the library and load the data

In [1]:
import pandas as pd
# Read the athlete events CSV into a DataFrame and preview its first rows.
olympics = pd.read_csv(filepath_or_buffer='athlete_events.csv')
olympics.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


### dropna()
The method ***dropna*** will remove any row which contains at least 1 null value (by default)

In [6]:
# how='any' is dropna's default: a row is removed as soon as one of its values is null.
olympics.dropna(how='any').head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
40,16,Juhamatti Tapio Aaltonen,M,28.0,184.0,85.0,Finland,FIN,2014 Winter,2014,Winter,Sochi,Ice Hockey,Ice Hockey Men's Ice Hockey,Bronze
41,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
42,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold
44,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Horse Vault,Gold
48,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Pommelled Horse,Gold


### how
We can specify ***how*** we want pandas to determine which rows to remove. By selecting *all*, we specify that only rows that contain nulls in all columns will be removed.

In [2]:
# Only rows in which every column is null are dropped; the result is not assigned,
# so `olympics` itself is left unchanged.
(
    olympics
    .dropna(how='all')
    .head()
)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


### subset
We can use the ***subset*** argument to define a sequence of columns we want to check for nulls. Empty values in other columns will be ignored.

In [6]:
# Check only the Medal column for nulls; nulls elsewhere (e.g. Height/Weight) are kept.
(
    olympics
    .dropna(subset=['Medal'])
    .head()
)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
37,15,Arvo Ossian Aaltonen,M,30.0,,,Finland,FIN,1920 Summer,1920,Summer,Antwerpen,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
38,15,Arvo Ossian Aaltonen,M,30.0,,,Finland,FIN,1920 Summer,1920,Summer,Antwerpen,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
40,16,Juhamatti Tapio Aaltonen,M,28.0,184.0,85.0,Finland,FIN,2014 Winter,2014,Winter,Sochi,Ice Hockey,Ice Hockey Men's Ice Hockey,Bronze
41,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze


In [7]:
# With the default how='any', a row is dropped when Height or Weight is null.
(
    olympics
    .dropna(subset=['Height', 'Weight'])
    .head()
)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
5,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",
6,5,Christine Jacoba Aaftink,F,25.0,185.0,82.0,Netherlands,NED,1992 Winter,1992,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,


### fillna()
Replace all empty values with the specified value

In [8]:
# Replace every null, in every column, with the same value (0) and preview the result.
olympics.fillna(value=0).head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,0
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,0
2,3,Gunnar Nielsen Aaby,M,24.0,0.0,0.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,0
3,4,Edgar Lindenau Aabye,M,34.0,0.0,0.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,0


Replacing all nulls across all columns with the same value would be a bad idea in this case: a height or weight of 0 is not a meaningful measurement, and 0 is not a medal. A better approach would be to replace nulls in each column independently.

In [14]:
# Fill each column with a value that suits it: the rounded column mean for the
# numeric measurements and an explicit label for athletes without a medal.
#
# Calling fillna(..., inplace=True) on a column selection such as olympics['Height']
# is chained assignment: the fill can land on an intermediate object instead of the
# DataFrame (pandas warns about this, and with Copy-on-Write the frame is not updated).
# Passing a {column: value} mapping to DataFrame.fillna and reassigning the result
# updates all three columns in one unambiguous step.
olympics = olympics.fillna({
    'Height': olympics['Height'].mean().round(1),
    'Weight': olympics['Weight'].mean().round(1),
    'Medal': 'No Medal',
})
olympics.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,No Medal
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,No Medal
2,3,Gunnar Nielsen Aaby,M,24.0,175.3,70.7,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,No Medal
3,4,Edgar Lindenau Aabye,M,34.0,175.3,70.7,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,No Medal


## Type Conversions
### astype()
We can use the astype method to convert columns from one type to another

In [38]:
# Start again from the raw file so the nulls filled earlier are back in the data.
olympics = pd.read_csv(filepath_or_buffer='athlete_events.csv')
olympics.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


Let's say we want to convert the *Age* column to whole numbers. Converting a column that contains nulls to an integer type is not allowed, so we'll replace the nulls first (here with 0).

In [22]:
olympics['Age'] = (
    olympics['Age']
    .fillna(0)      # missing ages become 0 so the cast below does not fail on nulls
    .astype(int)
)
olympics['Age'].head()

0    24
1    23
2    24
3    34
4    21
Name: Age, dtype: int32

We can convert many columns at once

In [40]:
# Same fill-then-cast recipe, applied to several columns through one sub-selection.
numeric_cols = ['Height', 'Weight', 'Age']
olympics[numeric_cols] = olympics[numeric_cols].fillna(0).astype(int)
olympics.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24,180,80,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23,170,60,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24,0,0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34,0,0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21,185,82,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


### type category
Inspect the size of our dataframe

In [41]:
# Print per-column dtypes and non-null counts plus the frame's memory usage,
# as a baseline before converting columns to the category dtype.
olympics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
ID        271116 non-null int64
Name      271116 non-null object
Sex       271116 non-null object
Age       271116 non-null int32
Height    271116 non-null int32
Weight    271116 non-null int32
Team      271116 non-null object
NOC       271116 non-null object
Games     271116 non-null object
Year      271116 non-null int64
Season    271116 non-null object
City      271116 non-null object
Sport     271116 non-null object
Event     271116 non-null object
Medal     39783 non-null object
dtypes: int32(3), int64(2), object(10)
memory usage: 17.6+ MB


We can specify categorical data as type *category* to dramatically reduce the space our data takes in memory, making our analysis and calculations faster as a result. To find the best candidates for the conversion we can look at the number of unique values in each column. The lower the number, the greater the benefit we'll get from converting to *category*.

In [31]:
# Count distinct values per column (nulls excluded); low counts mark category candidates.
olympics.nunique(axis=0, dropna=True)

ID        135571
Name      134732
Sex            2
Age           75
Height        95
Weight       220
Team        1184
NOC          230
Games         51
Year          35
Season         2
City          42
Sport         66
Event        765
Medal          3
dtype: int64

***astype*** does not have an *inplace* argument, but we can still use sub-selection from the DataFrame to assign the new type to multiple columns at once.

In [44]:
# Low-cardinality columns picked for conversion to the category dtype.
category_cols = ['Sex', 'Medal', 'Season', 'Year', 'City', 'Sport']
olympics[category_cols] = olympics[category_cols].astype('category')

In [45]:
# Print the summary again to see the category dtypes and the new memory usage.
olympics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
ID        271116 non-null int64
Name      271116 non-null object
Sex       271116 non-null category
Age       271116 non-null int32
Height    271116 non-null int32
Weight    271116 non-null int32
Team      271116 non-null object
NOC       271116 non-null object
Games     271116 non-null object
Year      271116 non-null category
Season    271116 non-null category
City      271116 non-null category
Sport     271116 non-null category
Event     271116 non-null object
Medal     39783 non-null category
dtypes: category(6), int32(3), int64(1), object(5)
memory usage: 11.9+ MB


### Date conversion

In [46]:
# Read the IMDB CSV into a DataFrame and preview its first rows.
movies = pd.read_csv(filepath_or_buffer='IMDB.csv')
movies.head(n=5)

Unnamed: 0,Title,Release Date,Color/B&W,Genre,Language,Country,Rating,Lead Actor,Director Name,Lead Actor FB Likes,Cast FB Likes,Director FB Likes,Movie FB Likes,IMDb Score (1-10),Total Reviews,Duration (min),Gross Revenue,Budget
0,Over the Hill to the Poorhouse,9/15/1920,Black and White,Crime,English,USA,Not Rated,Stephen Carr,Harry F. Millarde,2.0,4,0,0,4.8,1.0,110.0,3000000,100000
1,Metropolis,1/26/1927,Black and White,Drama,German,Germany,Not Rated,Brigitte Helm,Fritz Lang,136.0,203,756,12000,8.3,260.0,145.0,26435,6000000
2,The Broadway Melody,11/11/1929,Black and White,Musical,English,USA,Passed,Anita Page,Harry Beaumont,77.0,109,4,167,6.3,36.0,100.0,2808000,379000
3,42nd Street,8/29/1933,Black and White,Comedy,English,USA,Unrated,Ginger Rogers,Lloyd Bacon,610.0,995,24,439,7.7,65.0,89.0,2300000,439000
4,Top Hat,4/15/1935,Black and White,Comedy,English,USA,Approved,Ginger Rogers,Mark Sandrich,610.0,824,10,1000,7.8,66.0,81.0,3000000,609000


The date column is currently identified as a string

In [47]:
# Straight from the CSV the dates are plain text, so the Series dtype is `object`.
movies['Release Date'].head(n=5)

0     9/15/1920
1     1/26/1927
2    11/11/1929
3     8/29/1933
4     4/15/1935
Name: Release Date, dtype: object

In [51]:
# Cast the text dates to pandas' datetime dtype.
# The unit is spelled out ('datetime64[ns]') because recent pandas releases raise a
# TypeError for the unit-less 'datetime64' alias in astype; nanosecond resolution is
# also what the resulting Series reports as its dtype.
movies['Release Date'] = movies['Release Date'].astype('datetime64[ns]')
movies['Release Date'].head()

0   1920-09-15
1   1927-01-26
2   1929-11-11
3   1933-08-29
4   1935-04-15
Name: Release Date, dtype: datetime64[ns]

### pandas.to_datetime()
Pandas is smart, but it won't always recognize our date columns by itself. In those tricky situations we can help it by specifying the current format.

In [52]:
# Reload the file so 'Release Date' is back to its original text form.
movies = pd.read_csv(filepath_or_buffer='IMDB.csv')
movies['Release Date'].head(n=5)

0     9/15/1920
1     1/26/1927
2    11/11/1929
3     8/29/1933
4     4/15/1935
Name: Release Date, dtype: object

In [56]:
# Parse the text dates, telling pandas exactly how they are laid out.
movies['Release Date'] = pd.to_datetime(
    movies['Release Date'],
    format='%m/%d/%Y',  # month/day/four-digit year, e.g. 9/15/1920
)
movies['Release Date'].head()

0   1920-09-15
1   1927-01-26
2   1929-11-11
3   1933-08-29
4   1935-04-15
Name: Release Date, dtype: datetime64[ns]

### dt.strftime()
Sometimes when working with dates we would need to break them apart or display them in a different format. 
The ***strftime*** method would convert the date to a string in the format of our choosing

In [60]:
# Render each date as day/month/year text; the new column holds strings, not datetimes.
movies['Release Date IL Format'] = (
    movies['Release Date']
    .dt.strftime('%d/%m/%Y')
)

In [62]:
# Show the datetime column next to its reformatted text version.
movies[
    ['Release Date', 'Release Date IL Format']
].head()

Unnamed: 0,Release Date,Release Date IL Format
0,1920-09-15,15/09/1920
1,1927-01-26,26/01/1927
2,1929-11-11,11/11/1929
3,1933-08-29,29/08/1933
4,1935-04-15,15/04/1935


Extract part of a date

In [63]:
# '%Y' keeps only the four-digit year; strftime returns text, so Year holds strings.
movies['Year'] = (
    movies['Release Date']
    .dt.strftime('%Y')
)

In [65]:
# Show each release date next to the year extracted from it.
movies[
    ['Release Date', 'Year']
].head()

Unnamed: 0,Release Date,Year
0,1920-09-15,1920
1,1927-01-26,1927
2,1929-11-11,1929
3,1933-08-29,1933
4,1935-04-15,1935
