# Find bad images

With the current pipeline, some locations will have images where a large percentage of the pixels are missing due to clouds. Let's loop through and find these images.

In [1]:
import os
import pandas as pd
import configparser
import numpy as np

# Seed reserved for randomized steps; nothing in the cells visible here reads it
RANDOM_STATE = 42

In [2]:
# Read config file
CONFIG_PATH = 'config.ini'
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of hitting an
# unhelpful KeyError on 'PATHS' below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read '{CONFIG_PATH}' (working directory: {os.getcwd()})"
    )

# Root directory holding dhs_data.csv and the dhs_images/ folder
DATA_DIR = config['PATHS']['DATA_DIR']

df = pd.read_csv(os.path.join(DATA_DIR, 'dhs_data.csv'))

df

Unnamed: 0,cluster_id,lon,lat,rural,region_id,country,survey,month,year,iwi
0,AO.Bengo.71.135,13.640789,-8.589805,False,AO.Bengo,Angola,Angola 2015-16 Standard DHS,11,2015,62.334459
1,AO.Bengo.71.158,14.122619,-7.718385,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,2,2016,8.226589
2,AO.Bengo.71.169,13.654425,-8.592545,False,AO.Bengo,Angola,Angola 2015-16 Standard DHS,10,2015,62.760211
3,AO.Bengo.71.203,13.517859,-8.652260,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,1,2016,68.211697
4,AO.Bengo.71.208,13.721998,-7.852511,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,11,2015,14.825944
...,...,...,...,...,...,...,...,...,...,...
69944,ZW.Midlands.72.37,30.008579,-20.911177,True,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,9,2015,27.791567
69945,ZW.Midlands.72.52,29.860028,-20.402214,True,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,10,2015,36.929878
69946,ZW.Midlands.72.69,30.172833,-20.724753,True,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,10,2015,24.406326
69947,ZW.Midlands.72.91,29.820084,-19.453466,False,ZW.Midlands,Zimbabwe,Zimbabwe 2015 Standard DHS,7,2015,59.887344


Neat feature for adding a progress bar to pandas apply operations

In [3]:
from tqdm import tqdm
# Registers the progress_apply method on pandas objects (used below in place of apply)
tqdm.pandas()

Get fraction of bad pixels for each image

In [4]:
def get_missing_px_frac(cluster_id, data_dir=None):
    """Return the fraction of "bad" pixels in a cluster's Landsat image.

    A pixel counts as "bad" when every band of that pixel is exactly zero.

    Parameters
    ----------
    cluster_id : str
        Name of the cluster's sub-directory under ``<data_dir>/dhs_images``.
    data_dir : str, optional
        Root data directory. Defaults to the notebook-level ``DATA_DIR``;
        pass it explicitly to use the function outside this notebook.

    Returns
    -------
    float
        Share of bad pixels in the image, or 1.0 when no image file exists.
    """
    if data_dir is None:
        # Resolved at call time so the default follows the notebook's config cell
        data_dir = DATA_DIR

    img_path = os.path.join(data_dir, 'dhs_images', cluster_id, 'landsat.np')

    # If file doesn't exist, this means it was excluded for only containing cloudy pixels
    if not os.path.isfile(img_path):
        return 1.0

    img = np.load(img_path)

    # If all bands of an image are zero, consider it a "bad" pixel.
    # axis=2 collapses the band axis, so this assumes a (rows, cols, bands) layout.
    mask = (img == 0).all(axis=2)

    # Plain Python float so both return paths have the same type
    return float(mask.mean())

# Score every cluster's image (slow: one file load per row), then attach the result
bad_px_frac = df['cluster_id'].progress_apply(get_missing_px_frac)
df['bad_px_frac'] = bad_px_frac

100%|██████████| 69949/69949 [08:13<00:00, 141.66it/s]


If more than 10% of the pixels are "bad", call it a bad image

In [14]:
# An image is "bad" when strictly more than this share of its pixels is bad
BAD_PX_THRESHOLD = 0.1
df['bad_image'] = df['bad_px_frac'].gt(BAD_PX_THRESHOLD)

Just to get a sense for the data, get the fraction of bad images per survey. Print the surveys where more than 10% of the images are bad.

In [17]:
# Mean of the boolean flag per survey = share of bad images in that survey
survey_bad_frac = df['bad_image'].groupby(df['survey']).mean()
# Show only surveys where more than 10% of the images are bad
survey_bad_frac.loc[survey_bad_frac.gt(0.1)]

survey
Benin 1996 Standard DHS                          0.752632
Cameroon 1991 Standard DHS                       0.966216
Central African Republic 1994-95 Standard DHS    0.600000
Cote dIvoire 1994 Standard DHS                   0.540984
Cote dIvoire 1998-99 Standard DHS                0.135714
Ghana 1993 Standard DHS                          0.748634
Ghana 1998 Standard DHS                          0.622807
Niger 1992 Standard DHS                          0.335294
Niger 1998 Standard DHS                          0.258993
Togo 1998 Standard DHS                           0.558824
Name: bad_image, dtype: float64

Create a new CSV file, excluding the rows without good images

In [6]:
# Keep only rows whose image passed the check, and drop the helper columns
is_good = ~df['bad_image']
good_df = df.loc[is_good].drop(columns=['bad_image', 'bad_px_frac'])

out_path = os.path.join(DATA_DIR, 'dhs_with_imgs.csv')
good_df.to_csv(out_path, index=False)