# Dealing With Null Data

### Introduction:

The data have been modified to contain some missing values, identified by NaN.  
Using pandas should make this exercise
easier, in particular for the bonus question.

## Step 1: Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

## Step 2: Creating the dataset

In [2]:
# Fix the RNG seed so the frame is identical after every Restart & Run All.
np.random.seed(42)

# Seven consecutive daily timestamps, used as the row index.
dates = pd.date_range(start='2020-01-20', periods=7)

# 7x7 matrix of uniform random floats in [0, 1).
M = np.random.random((7, 7))

# Assemble the frame, naming the columns C1..C7 at construction time
# rather than overwriting `dframe.columns` afterwards.
dframe = pd.DataFrame(
    M,
    index=dates,
    columns=['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7'],
)
dframe

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.24447,0.054897,0.477246,0.85452,0.655192,0.149104,0.620058
2020-01-21,0.487026,0.757193,0.717999,0.614723,0.765389,0.106335,0.19777
2020-01-22,0.473574,0.939685,0.237862,0.535752,0.125301,0.829055,0.757988
2020-01-23,0.77214,0.137287,0.20739,0.387479,0.107713,0.797141,0.212993
2020-01-24,0.193865,0.003091,0.827118,0.894139,0.401646,0.771042,0.646733
2020-01-25,0.969902,0.245148,0.367785,0.820675,0.552599,0.75579,0.166762
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,0.516718


## Step 3: Set NaN values in dframe

In [3]:
# Blank out cells by row *position* to create the missing data used below.
# `.at` is a single-cell (scalar) accessor, so row ranges are selected by
# slicing the index and assigning through `.loc` instead.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
dframe.loc[dframe.index[0:8], 'C7'] = np.nan   # every row (the slice end is clipped to the 7 rows)
dframe.loc[dframe.index[0:2], 'C6'] = np.nan   # first two rows
dframe.loc[dframe.index[5:6], 'C5'] = np.nan   # sixth row only
dframe

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.24447,0.054897,0.477246,0.85452,0.655192,,
2020-01-21,0.487026,0.757193,0.717999,0.614723,0.765389,,
2020-01-22,0.473574,0.939685,0.237862,0.535752,0.125301,0.829055,
2020-01-23,0.77214,0.137287,0.20739,0.387479,0.107713,0.797141,
2020-01-24,0.193865,0.003091,0.827118,0.894139,0.401646,0.771042,
2020-01-25,0.969902,0.245148,0.367785,0.820675,,0.75579,
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,


## Step 4: Fill all NULL values with 1020

In [4]:
# Substitute the sentinel 1020 for every missing cell and rebind `dframe`
# to the filled copy (fillna returns a new frame; nothing is changed in place).
dframe = dframe.fillna(value=1020)
dframe

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.24447,0.054897,0.477246,0.85452,0.655192,1020.0,1020.0
2020-01-21,0.487026,0.757193,0.717999,0.614723,0.765389,1020.0,1020.0
2020-01-22,0.473574,0.939685,0.237862,0.535752,0.125301,0.829055,1020.0
2020-01-23,0.77214,0.137287,0.20739,0.387479,0.107713,0.797141,1020.0
2020-01-24,0.193865,0.003091,0.827118,0.894139,0.401646,0.771042,1020.0
2020-01-25,0.969902,0.245148,0.367785,0.820675,1020.0,0.75579,1020.0
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,1020.0


In [5]:
# Re-introduce missing values so the following cells have NaNs to work on
# (the previous cell replaced all of them with 1020).
# `.at` is a single-cell (scalar) accessor, so row ranges are selected by
# slicing the index and assigning through `.loc` instead.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
dframe.loc[dframe.index[0:5], 'C7'] = np.nan   # first five rows; the last two keep 1020
dframe.loc[dframe.index[0:2], 'C6'] = np.nan   # first two rows
dframe.loc[dframe.index[5:6], 'C5'] = np.nan   # sixth row only
dframe

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.24447,0.054897,0.477246,0.85452,0.655192,,
2020-01-21,0.487026,0.757193,0.717999,0.614723,0.765389,,
2020-01-22,0.473574,0.939685,0.237862,0.535752,0.125301,0.829055,
2020-01-23,0.77214,0.137287,0.20739,0.387479,0.107713,0.797141,
2020-01-24,0.193865,0.003091,0.827118,0.894139,0.401646,0.771042,
2020-01-25,0.969902,0.245148,0.367785,0.820675,,0.75579,1020.0
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,1020.0


In [6]:
# Column-specific fill values: NaN in 'C5' becomes 123, NaN in 'C6' becomes 789.
# Columns not named in the mapping (e.g. 'C7') keep their NaNs.
# The result is only displayed; `dframe` itself is not modified.
dframe.fillna(value=dict(C5=123, C6=789))

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.24447,0.054897,0.477246,0.85452,0.655192,789.0,
2020-01-21,0.487026,0.757193,0.717999,0.614723,0.765389,789.0,
2020-01-22,0.473574,0.939685,0.237862,0.535752,0.125301,0.829055,
2020-01-23,0.77214,0.137287,0.20739,0.387479,0.107713,0.797141,
2020-01-24,0.193865,0.003091,0.827118,0.894139,0.401646,0.771042,
2020-01-25,0.969902,0.245148,0.367785,0.820675,123.0,0.75579,1020.0
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,1020.0


In [7]:
# Fill only the first NaN found in column 'C7' with 789; `limit=1` caps the
# number of replacements, so later NaNs in that column are left as they are.
# The result is only displayed; `dframe` itself is not modified.
dframe.fillna(value=dict(C7=789), limit=1)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.24447,0.054897,0.477246,0.85452,0.655192,,789.0
2020-01-21,0.487026,0.757193,0.717999,0.614723,0.765389,,
2020-01-22,0.473574,0.939685,0.237862,0.535752,0.125301,0.829055,
2020-01-23,0.77214,0.137287,0.20739,0.387479,0.107713,0.797141,
2020-01-24,0.193865,0.003091,0.827118,0.894139,0.401646,0.771042,
2020-01-25,0.969902,0.245148,0.367785,0.820675,,0.75579,1020.0
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,1020.0


## Step 5: Drop Rows with NULL values

In [8]:
# Keep only the rows that contain no NaN at all (defaults spelled out).
# Returns a new frame for display; `dframe` is left untouched.
dframe.dropna(axis='index', how='any')

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,1020.0


## Step 6: Drop Columns with NULL values

In [9]:
# Keep only the columns that contain no NaN at all.
# Returns a new frame for display; `dframe` is left untouched.
dframe.dropna(axis=1, how='any')

Unnamed: 0,C1,C2,C3,C4
2020-01-20,0.24447,0.054897,0.477246,0.85452
2020-01-21,0.487026,0.757193,0.717999,0.614723
2020-01-22,0.473574,0.939685,0.237862,0.535752
2020-01-23,0.77214,0.137287,0.20739,0.387479
2020-01-24,0.193865,0.003091,0.827118,0.894139
2020-01-25,0.969902,0.245148,0.367785,0.820675
2020-01-26,0.431907,0.42039,0.901929,0.052094


## Step 7: Drop Rows with NULL values present in C5 or C6

In [10]:
# Drop a row when either 'C5' or 'C6' is NaN; NaNs in other columns are ignored.
# Returns a new frame for display; `dframe` is left untouched.
dframe.dropna(axis='index', how='any', subset=['C5', 'C6'])

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-22,0.473574,0.939685,0.237862,0.535752,0.125301,0.829055,
2020-01-23,0.77214,0.137287,0.20739,0.387479,0.107713,0.797141,
2020-01-24,0.193865,0.003091,0.827118,0.894139,0.401646,0.771042,
2020-01-26,0.431907,0.42039,0.901929,0.052094,0.96943,0.96582,1020.0
