### Casting Datatypes and Handling Missing Values

In [31]:
import pandas as pd
import numpy as np

In [68]:
# Sample records that mix genuine missing markers (np.nan / None) with
# placeholder strings ('NA', 'Missing') standing in for missing data.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    # Ages are given as strings, not numbers.
    age=['33', '55', '63', '36', None, None, 'Missing'],
    # Every entry in this column is missing.
    id=[None] * 7,
)

In [69]:
# Build the DataFrame from the dict; each key becomes a column.
df = pd.DataFrame(people)

In [67]:
# Display the frame as constructed, before any cleaning.
df

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,
1,Jane,Doe,JaneDoe@email.com,55,
2,John,Doe,JohnDoe@email.com,63,
3,Chris,Schafer,,36,
4,,,,,
5,,,Anonymous@email.com,,
6,,Missing,,Missing,


In [35]:
# how="all" drops a row only when every value in that row is NA (row 4 here).
df.dropna(axis="index", how="all")

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,
1,Jane,Doe,JaneDoe@email.com,55,
2,John,Doe,JohnDoe@email.com,63,
3,Chris,Schafer,,36,
5,,,Anonymous@email.com,,
6,,Missing,,Missing,


In [36]:
# axis=1 works on columns; how="all" drops only the columns in which every
# value is NA (here the 'id' column).
df.dropna(axis=1, how="all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [37]:
# axis="index" works on rows; with subset=["age"], a row is dropped whenever
# its 'age' value is NA. The string 'Missing' in row 6 is not NA, so that row stays.
df.dropna(axis="index", how="any", subset=["age"])

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,
1,Jane,Doe,JaneDoe@email.com,55,
2,John,Doe,JohnDoe@email.com,63,
3,Chris,Schafer,,36,
6,,Missing,,Missing,


In [39]:
# With several subset columns and how="any", a row is dropped if either
# 'last' or 'email' is NA.
df.dropna(axis="index", how="any", subset=["last", "email"])

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,
1,Jane,Doe,JaneDoe@email.com,55,
2,John,Doe,JohnDoe@email.com,63,
6,,Missing,,Missing,


In [41]:
# The dropna() calls above returned new frames; df itself still has all seven rows.
df

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,
1,Jane,Doe,JaneDoe@email.com,55,
2,John,Doe,JohnDoe@email.com,63,
3,Chris,Schafer,,36,
4,,,,,
5,,,Anonymous@email.com,,
6,,Missing,,Missing,


In [43]:
# 'NA' and 'Missing' are placeholder strings rather than real missing values,
# so convert both to NaN in a single pass. The result is reassigned instead of
# using inplace=True, which keeps the call chainable and its effect explicit.
df = df.replace(["NA", "Missing"], np.nan)

In [44]:
# Confirm the placeholder strings are gone: row 6 is now entirely missing.
df

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0,
1,Jane,Doe,JaneDoe@email.com,55.0,
2,John,Doe,JohnDoe@email.com,63.0,
3,Chris,Schafer,,36.0,
4,,,,,
5,,,Anonymous@email.com,,
6,,,,,


In [45]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age,id
0,False,False,False,False,True
1,False,False,False,False,True
2,False,False,False,False,True
3,False,False,True,False,True
4,True,True,True,True,True
5,True,True,False,True,True
6,True,True,True,True,True


In [46]:
# Returns a copy with every missing value replaced by the string "MISSING";
# df itself is left unchanged.
df.fillna("MISSING")

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,MISSING
1,Jane,Doe,JaneDoe@email.com,55,MISSING
2,John,Doe,JohnDoe@email.com,63,MISSING
3,Chris,Schafer,MISSING,36,MISSING
4,MISSING,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING,MISSING
6,MISSING,MISSING,MISSING,MISSING,MISSING


In [47]:
# Same idea with a numeric fill value; note that 0 is also written into the
# text columns (first, last, email).
df.fillna(0)

Unnamed: 0,first,last,email,age,id
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,0
1,Jane,Doe,JaneDoe@email.com,55,0
2,John,Doe,JohnDoe@email.com,63,0
3,Chris,Schafer,0,36,0
4,0,0,0,0,0
5,0,0,Anonymous@email.com,0,0
6,0,0,0,0,0


In [49]:
# 'age' was built from strings, so every column, including 'age', is object dtype.
df.dtypes

first    object
last     object
email    object
age      object
id       object
dtype: object

In [50]:
# np.nan is itself a float.
type(np.nan)

float

In [52]:
# Cast 'age' to float: the column contains NaN, and np.nan is a float, so
# float is the numeric dtype that can hold both the ages and the missing entries.
df["age"] = df["age"].astype("float")

In [54]:
# Verify that 'age' is now float64 while the other columns remain object.
df.dtypes

first     object
last      object
email     object
age      float64
id        object
dtype: object

In [55]:
# The NaN entries are skipped: (33 + 55 + 63 + 36) / 4 = 46.75.
df.age.mean()

46.75

In [58]:
# Keep only the float64/int64 columns (just 'age' in this frame).
numeric_col = df.select_dtypes(include=["float64", "int64"])
# An assignment displays nothing, so end the cell with the name to show the result.
numeric_col

Unnamed: 0,age
0,33.0
1,55.0
2,63.0
3,36.0
4,
5,
6,
