# Missing data and aggregations

In [14]:
import pandas as pd
import numpy as np

# Seed the legacy global NumPy RNG so the "random" matrix and the missing
# positions are identical on every run.
np.random.seed(1337)

size = 8

# size x size matrix of integers drawn from 1..9 (the upper bound 10 is exclusive).
random_matrix = np.random.randint(1, 10, (size, size))
print(f"{random_matrix = }")
print(f"{random_matrix.size = }")

# Flat positions (not values) into the matrix: 10 distinct numbers taken from
# range(random_matrix.size); replace=False guarantees no position repeats.
index = np.random.choice(random_matrix.size, 10, replace=False)
print(f"{index = }")

# NaN is a float, so an integer array cannot hold it; convert explicitly.
random_matrix = random_matrix.astype(float)

# A single cell could be overwritten with 2D indexing, e.g. random_matrix[5, 3] = 999.
# .flat indexes the 2D array as if it were 1D and always writes into the array
# itself, so the flat positions in `index` can be used directly.
random_matrix.flat[index] = np.nan

random_matrix  # now has random missing data


random_matrix = array([[8, 9, 8, 8, 3, 3, 5, 9],
       [7, 7, 8, 9, 2, 7, 7, 3],
       [3, 9, 2, 8, 4, 2, 4, 4],
       [5, 9, 9, 8, 5, 2, 7, 5],
       [5, 3, 6, 8, 5, 3, 4, 3],
       [5, 9, 7, 4, 9, 9, 8, 5],
       [4, 3, 2, 3, 1, 4, 9, 3],
       [2, 1, 2, 5, 3, 1, 1, 8]])
random_matrix.size = 64
index = array([28, 42, 17, 49, 53, 31, 35,  2, 13, 62])


array([[ 8.,  9., nan,  8.,  3.,  3.,  5.,  9.],
       [ 7.,  7.,  8.,  9.,  2., nan,  7.,  3.],
       [ 3., nan,  2.,  8.,  4.,  2.,  4.,  4.],
       [ 5.,  9.,  9.,  8., nan,  2.,  7., nan],
       [ 5.,  3.,  6., nan,  5.,  3.,  4.,  3.],
       [ 5.,  9., nan,  4.,  9.,  9.,  8.,  5.],
       [ 4., nan,  2.,  3.,  1., nan,  9.,  3.],
       [ 2.,  1.,  2.,  5.,  3.,  1., nan,  8.]])

In [18]:
# Axis labels: one row per player and one column per round, both numbered 1..size.
player_labels = [f"Player {number}" for number in range(1, size + 1)]
round_labels = [f"Round {number}" for number in range(1, size + 1)]

# Wrap the matrix in a DataFrame; a NaN cell stands for a player who did not
# take part in that round.
scores = pd.DataFrame(data=random_matrix, index=player_labels, columns=round_labels)

scores


Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,8.0,9.0,,8.0,3.0,3.0,5.0,9.0
Player 2,7.0,7.0,8.0,9.0,2.0,,7.0,3.0
Player 3,3.0,,2.0,8.0,4.0,2.0,4.0,4.0
Player 4,5.0,9.0,9.0,8.0,,2.0,7.0,
Player 5,5.0,3.0,6.0,,5.0,3.0,4.0,3.0
Player 6,5.0,9.0,,4.0,9.0,9.0,8.0,5.0
Player 7,4.0,,2.0,3.0,1.0,,9.0,3.0
Player 8,2.0,1.0,2.0,5.0,3.0,1.0,,8.0


---
## Working with missing data



In [19]:
# Boolean mask with the same shape as scores: True wherever a value is missing.
# Ways to handle missing values:
# - with large amounts of data, rows that are not highly relevant can be dropped,
#   though dropping is rarely the best option
# - otherwise the missing values can be estimated, for example with a machine
#   learning algorithm, with the help of a domain expert, or with the median or mean
scores.isna()

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,False,False,True,False,False,False,False,False
Player 2,False,False,False,False,False,True,False,False
Player 3,False,True,False,False,False,False,False,False
Player 4,False,False,False,False,True,False,False,True
Player 5,False,False,False,True,False,False,False,False
Player 6,False,False,True,False,False,False,False,False
Player 7,False,True,False,False,False,True,False,False
Player 8,False,False,False,False,False,False,True,False


In [20]:
# Summary of the frame: per-column non-null counts and dtypes, which shows
# how many values are missing in each round.
scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, Player 1 to Player 8
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Round 1  8 non-null      float64
 1   Round 2  6 non-null      float64
 2   Round 3  6 non-null      float64
 3   Round 4  7 non-null      float64
 4   Round 5  7 non-null      float64
 5   Round 6  6 non-null      float64
 6   Round 7  7 non-null      float64
 7   Round 8  7 non-null      float64
dtypes: float64(8)
memory usage: 576.0+ bytes


In [27]:
# Set one cell with a single .loc[row, column] call. The chained form
# scores.loc["Player 8"]["Round 7"] = 4 assigns into an intermediate Series,
# which is not guaranteed to write back to scores (and never does under
# pandas Copy-on-Write).
scores.loc["Player 8", "Round 7"] = 4

# dropna() works on rows by default: every row containing at least one NaN is removed.
scores.dropna()

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 8,2.0,1.0,2.0,5.0,3.0,1.0,4.0,8.0


In [28]:
# Column-wise variant: remove every column that contains at least one NaN.
scores.dropna(axis="columns", how="any")

Unnamed: 0,Round 1,Round 7
Player 1,8.0,5.0
Player 2,7.0,7.0
Player 3,3.0,4.0
Player 4,5.0,7.0
Player 5,5.0,4.0
Player 6,5.0,8.0
Player 7,4.0,9.0
Player 8,2.0,4.0


In [32]:
# fillna returns a new DataFrame with every NaN replaced by 0. Rebinding the
# name updates scores without inplace=True (which would mutate the existing
# object instead and cannot be used in a method chain).
scores = scores.fillna(0)
scores

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,8.0,9.0,0.0,8.0,3.0,3.0,5.0,9.0
Player 2,7.0,7.0,8.0,9.0,2.0,0.0,7.0,3.0
Player 3,3.0,0.0,2.0,8.0,4.0,2.0,4.0,4.0
Player 4,5.0,9.0,9.0,8.0,0.0,2.0,7.0,0.0
Player 5,5.0,3.0,6.0,0.0,5.0,3.0,4.0,3.0
Player 6,5.0,9.0,0.0,4.0,9.0,9.0,8.0,5.0
Player 7,4.0,0.0,2.0,3.0,1.0,0.0,9.0,3.0
Player 8,2.0,1.0,2.0,5.0,3.0,1.0,4.0,8.0


---
## Missing data - strategy

In [33]:
import seaborn as sns

# Load seaborn's "titanic" example dataset.
# NOTE(review): load_dataset presumably fetches the data online on first use —
# confirm network access is available when running on a fresh machine.
titanic = sns.load_dataset("titanic")
# Preview the frame; the age and deck columns contain missing values in the rows shown.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True
