# Assignment 1 -- Dealing with missing data

Step 1: Read the csv file as a pandas dataframe

Step 2: Check the number of missing values for the columns

Step 3: access the underlying NumPy array via the values attribute

Step 4: Remove rows from df that contain missing values

Step 5: Remove columns from df that contain missing values

Step 6: Only drop rows where all columns are NaN

Step 7: Drop rows that have less than 3 real values

Step 8: Only drop rows where NaN appear in specific columns (here: 'C')

In [5]:
import pandas as pd
import numpy as np

## Read the csv file as a pandas dataframe

In [170]:
# The given data must be convert into an list of list
csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Convert the csv_data from string to list
raw_data_as_list = csv_data.replace("\n",",").split(",")

# Separate the columns and data into two variables
columns = raw_data_as_list[0:4]
raw_data_as_list = raw_data_as_list[4:]

# For the data list convert all the empty spaces into np.NaN
# I did this way since the string str function cannot if a string represent a float.
data_as_list = []
for item in raw_data_as_list:
    try:
        new_value = float(item)
        data_as_list.append(new_value)
    except:
        data_as_list.append(np.NaN)
data_as_list

# Convert the data into array and reshape so that the rows and column are correct then
# add the data and columns to the dataframe.
np_data = np.reshape( np.array(data_as_list), newshape=(3,4))
data_frame_1 = pd.DataFrame(data=np_data,columns=columns)
data_frame_1

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


## Check the number of missing values for the columns

In [171]:
nan_by_column = data_frame_1.isna().sum()
nan_by_column_total = nan_by_column.sum()

print("The total of missing cells is ", nan_by_column_total)
print("The columns with missing cells are", nan_by_column, sep="\n")

The total of missing cells is  2
The columns with missing cells are
A    0
B    0
C    1
D    1
dtype: int64


## access the underlying NumPy array via the values attribute

In [172]:
data_frame_1.values
data_frame_1.to_numpy()     # Recommended by Pandas itself.

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

## Remove rows from df that contain missing values

In [173]:
data_frame_1.dropna(how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


## Remove columns from df that contain missing values

In [174]:
data_frame_1.dropna(axis=1, how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


## Only drop rows where all columns are NaN

In [175]:
data_frame_1.loc[len(data_frame_1.index)] = [np.NAN, np.NaN, np.NAN, np.NaN]
data_frame_1

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,,,,


In [176]:
data_frame_1.dropna(how='all', axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


## Step 7: Drop rows that have less than 3 real values

In [177]:
data_frame_1.dropna(thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


## Only drop rows where NaN appear in specific columns (here: 'C')

In [178]:
data_frame_1.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


# B. Imputing missing values

## Step 1: impute missing values via the column mean

In [4]:
from sklearn.impute import SimpleImputer

In [18]:
frame_data = np.array([1,2,3,4,5,6,7,8,10,11,12, np.nan]).reshape((4,3))
dataframe_2 = pd.DataFrame(data=frame_data)
dataframe_2

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,7.0,8.0,10.0
3,11.0,12.0,


In [35]:
# input missing vlaues via the column mean
pd.set_option("display.precision", 2)
#pd.set_option("display.precision",2)
simpleImputer = SimpleImputer(strategy='mean')
dataframe_2 = simpleImputer.fit(dataframe_2).transform(dataframe_2)
dataframe_2.round(2)

array([[ 1.  ,  2.  ,  3.  ],
       [ 4.  ,  5.  ,  6.  ],
       [ 7.  ,  8.  , 10.  ],
       [11.  , 12.  ,  6.33]])