# Code-along: missing data

## Simulating missing values

In [10]:
import numpy as np 
import pandas as pd 

# Seed NumPy's legacy global RNG so that every run draws the same numbers.
np.random.seed(42)

# Side length of the square matrix.
n = 8

# n x n integers drawn from 1..9 (the upper bound `high` is exclusive).
random_matrix = np.random.randint(low=1, high=10, size=(n, n))
random_matrix



array([[7, 4, 8, 5, 7, 3, 7, 8],
       [5, 4, 8, 8, 3, 6, 5, 2],
       [8, 6, 2, 5, 1, 6, 9, 1],
       [3, 7, 4, 9, 3, 5, 3, 7],
       [5, 9, 7, 2, 4, 9, 2, 9],
       [5, 2, 4, 7, 8, 3, 1, 4],
       [2, 8, 4, 2, 6, 6, 4, 6],
       [2, 2, 4, 8, 7, 9, 8, 5]])

In [12]:
# Total number of elements in the matrix (rows x columns).
random_matrix.size

64

In [11]:
# Pick 10 flat positions among the matrix's cells (0 .. size - 1).
# replace=False means sampling without replacement: no position is drawn twice.
index = np.random.choice(a=random_matrix.size, size=10, replace=False)
index

array([21, 20, 36,  3,  5, 29, 12, 37, 41, 33])

In [13]:
# ravel() flattens the 8x8 2-D array into a 1-D array of 64 elements,
# laid out row by row (compare with the matrix printed above).
random_matrix.ravel()

array([7, 4, 8, 5, 7, 3, 7, 8, 5, 4, 8, 8, 3, 6, 5, 2, 8, 6, 2, 5, 1, 6,
       9, 1, 3, 7, 4, 9, 3, 5, 3, 7, 5, 9, 7, 2, 4, 9, 2, 9, 5, 2, 4, 7,
       8, 3, 1, 4, 2, 8, 4, 2, 6, 6, 4, 6, 2, 2, 4, 8, 7, 9, 8, 5])

In [17]:
# Integer arrays cannot store NaN (NaN is a floating-point value), so work on
# a float copy of the matrix before blanking out cells.
random_matrix = random_matrix.astype(float)

# `.flat` indexes the array as if it were 1-D and always writes into the array
# itself, whereas writing through `ravel()` only works when it returns a view.
# Assign np.nan explicitly rather than relying on None being coerced to NaN.
random_matrix.flat[index] = np.nan
random_matrix

array([[ 7.,  4.,  8., nan,  7., nan,  7.,  8.],
       [ 5.,  4.,  8.,  8., nan,  6.,  5.,  2.],
       [ 8.,  6.,  2.,  5., nan, nan,  9.,  1.],
       [ 3.,  7.,  4.,  9.,  3., nan,  3.,  7.],
       [ 5., nan,  7.,  2., nan, nan,  2.,  9.],
       [ 5., nan,  4.,  7.,  8.,  3.,  1.,  4.],
       [ 2.,  8.,  4.,  2.,  6.,  6.,  4.,  6.],
       [ 2.,  2.,  4.,  8.,  7.,  9.,  8.,  5.]])

In [23]:
# Derive the labels from the matrix shape instead of hard-coding 8, so the
# frame still builds if the matrix size (n) is changed.
n_players, n_rounds = random_matrix.shape

# Rows are players and columns are rounds; labels are numbered from 1.
scores = pd.DataFrame(
    random_matrix,
    index=[f"Player {i}" for i in range(1, n_players + 1)],
    columns=[f"Round {i}" for i in range(1, n_rounds + 1)],
)

scores

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,7.0,4.0,8.0,,7.0,,7.0,8.0
Player 2,5.0,4.0,8.0,8.0,,6.0,5.0,2.0
Player 3,8.0,6.0,2.0,5.0,,,9.0,1.0
Player 4,3.0,7.0,4.0,9.0,3.0,,3.0,7.0
Player 5,5.0,,7.0,2.0,,,2.0,9.0
Player 6,5.0,,4.0,7.0,8.0,3.0,1.0,4.0
Player 7,2.0,8.0,4.0,2.0,6.0,6.0,4.0,6.0
Player 8,2.0,2.0,4.0,8.0,7.0,9.0,8.0,5.0


## Handle missing values

In [24]:
# Boolean mask with the same shape as `scores`: True where a value is missing (NaN).
scores.isnull()

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,False,False,False,True,False,True,False,False
Player 2,False,False,False,False,True,False,False,False
Player 3,False,False,False,False,True,True,False,False
Player 4,False,False,False,False,False,True,False,False
Player 5,False,True,False,False,True,True,False,False
Player 6,False,True,False,False,False,False,False,False
Player 7,False,False,False,False,False,False,False,False
Player 8,False,False,False,False,False,False,False,False


In [25]:
# Per-column non-null counts and dtypes; a count below 8 means that round
# has missing scores.
scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, Player 1 to Player 8
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Round 1  8 non-null      float64
 1   Round 2  6 non-null      float64
 2   Round 3  8 non-null      float64
 3   Round 4  7 non-null      float64
 4   Round 5  5 non-null      float64
 5   Round 6  4 non-null      float64
 6   Round 7  8 non-null      float64
 7   Round 8  8 non-null      float64
dtypes: float64(8)
memory usage: 576.0+ bytes


In [27]:
# Display the scores again; the NaN entries are still present.
scores

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,7.0,4.0,8.0,,7.0,,7.0,8.0
Player 2,5.0,4.0,8.0,8.0,,6.0,5.0,2.0
Player 3,8.0,6.0,2.0,5.0,,,9.0,1.0
Player 4,3.0,7.0,4.0,9.0,3.0,,3.0,7.0
Player 5,5.0,,7.0,2.0,,,2.0,9.0
Player 6,5.0,,4.0,7.0,8.0,3.0,1.0,4.0
Player 7,2.0,8.0,4.0,2.0,6.0,6.0,4.0,6.0
Player 8,2.0,2.0,4.0,8.0,7.0,9.0,8.0,5.0


In [26]:
# dropna returns a new frame and leaves `scores` untouched.
# Defaults spelled out: drop every row (axis="index") in which
# at least one value is NaN (how="any").
scores.dropna(axis="index", how="any")

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 7,2.0,8.0,4.0,2.0,6.0,6.0,4.0,6.0
Player 8,2.0,2.0,4.0,8.0,7.0,9.0,8.0,5.0


In [28]:
# Drop every column (round) that contains at least one NaN; all rows are kept.
scores.dropna(axis="columns", how="any")

Unnamed: 0,Round 1,Round 3,Round 7,Round 8
Player 1,7.0,8.0,7.0,8.0
Player 2,5.0,8.0,5.0,2.0
Player 3,8.0,2.0,9.0,1.0
Player 4,3.0,4.0,3.0,7.0
Player 5,5.0,7.0,2.0,9.0
Player 6,5.0,4.0,1.0,4.0
Player 7,2.0,4.0,4.0,6.0
Player 8,2.0,4.0,8.0,5.0


### Which strategy to choose? 

Suppose a domain expert (here, a gaming expert) tells us that a missing value means the player scored 0 in that round.

In [30]:
# Treat a missing score as 0 points. fillna returns a new frame, which is
# bound back to `scores`, so from here on `scores` has no NaN left.
scores = scores.fillna(value=0)
scores

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,7.0,4.0,8.0,0.0,7.0,0.0,7.0,8.0
Player 2,5.0,4.0,8.0,8.0,0.0,6.0,5.0,2.0
Player 3,8.0,6.0,2.0,5.0,0.0,0.0,9.0,1.0
Player 4,3.0,7.0,4.0,9.0,3.0,0.0,3.0,7.0
Player 5,5.0,0.0,7.0,2.0,0.0,0.0,2.0,9.0
Player 6,5.0,0.0,4.0,7.0,8.0,3.0,1.0,4.0
Player 7,2.0,8.0,4.0,2.0,6.0,6.0,4.0,6.0
Player 8,2.0,2.0,4.0,8.0,7.0,9.0,8.0,5.0


In [33]:
# Summary statistics per round; transposed so that each round becomes a row.
scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Round 1,8.0,4.625,2.199838,2.0,2.75,5.0,5.5,8.0
Round 2,8.0,3.875,3.044316,0.0,1.5,4.0,6.25,8.0
Round 3,8.0,5.125,2.232071,2.0,4.0,4.0,7.25,8.0
Round 4,8.0,5.125,3.399054,0.0,2.0,6.0,8.0,9.0
Round 5,8.0,3.875,3.522884,0.0,0.0,4.5,7.0,8.0
Round 6,8.0,3.0,3.585686,0.0,0.0,1.5,6.0,9.0
Round 7,8.0,4.875,2.900123,1.0,2.75,4.5,7.25,9.0
Round 8,8.0,5.25,2.815772,1.0,3.5,5.5,7.25,9.0


## Missing data - strategy



In [34]:
import seaborn as sns 

# Load the "titanic" example dataset through seaborn; the preview shows
# missing values (empty cells) in columns such as `age` and `deck`.
sns.load_dataset("titanic")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True
