We pass the images marked as valid for each cluster through the CNN and extract their feature vectors. The results are stored on a per-country basis. For example, all Malawi feature extractions go into results/malawi_2019/cnn.

In [11]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy

In [12]:
# Project root, relative to the directory this notebook runs from.
BASE_DIR = '..'

# Everything under data/ shares the same parent folder.
DATA_DIR = os.path.join(BASE_DIR, 'data')
COUNTRIES_DIR = os.path.join(DATA_DIR, 'countries')
PROCESSED_DIR = os.path.join(DATA_DIR, 'processed')
CNN_TRAIN_IMAGE_DIR = os.path.join(DATA_DIR, 'cnn_images')

# Where per-country outputs are written.
RESULTS_DIR = os.path.join(BASE_DIR, 'results')

# NOTE: despite the name, CNN_DIR is the path of the saved model *file*, not a directory.
CNN_DIR = os.path.join(BASE_DIR, 'models', 'trained_model.pt')

In [13]:
# Make sure the results folder and one sub-folder per country exist
# (exist_ok=True makes this a no-op when they are already there).
os.makedirs(RESULTS_DIR, exist_ok=True)
country_result_dirs = [
    os.path.join(RESULTS_DIR, country_name)
    for country_name in ('malawi_2019', 'ethiopia_2019', 'nigeria_2019')
]
for result_dir in country_result_dirs:
    os.makedirs(result_dir, exist_ok=True)

# Feature extraction with the CNN
If you have run this step before, you can skip it and instead run the commented-out code in the next section for a quick start.

In [14]:
# Metadata for every downloaded image (per the original note, roughly 33k images).
image_csv_path = os.path.join(PROCESSED_DIR, 'image_download_actual.csv')
df_images = pd.read_csv(image_csv_path)

In [15]:
# Preview the image metadata; the is_train flag separates training images from validation images.
df_images.head()

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,country,nightlights_bin,is_train
0,-17.0935306549072_35.20822373164363_-17.093530...,-17.093531,35.208224,-17.093531,35.253139,1.994561,0.034344,mw,0,True
1,-17.07855873350521_35.20822373164363_-17.09353...,-17.078559,35.208224,-17.093531,35.253139,1.994561,0.034344,mw,0,True
2,-17.123474497711186_35.22319565304562_-17.0935...,-17.123474,35.223196,-17.093531,35.253139,1.994561,0.034344,mw,0,False
3,-17.138446419113176_35.23816757444761_-17.0935...,-17.138446,35.238168,-17.093531,35.253139,1.994561,0.034344,mw,0,True
4,-17.063586812103217_35.23816757444761_-17.0935...,-17.063587,35.238168,-17.093531,35.253139,1.994561,0.034344,mw,0,True


In [16]:
# Pick the PyTorch backend: the GPU when CUDA is available, otherwise the CPU.
backend_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(backend_name)
print(f'Using {device} as backend')

# Load the trained CNN saved at CNN_DIR, mapping its tensors onto the chosen device.
# NOTE(review): torch.load unpickles the file, so only load model files from a trusted source.
model = torch.load(CNN_DIR, map_location=device)

Using cuda as backend


In [17]:
# Display the classifier part of the CNN model.
# A CNN model typically has two main parts:

# Feature extractor: usually a series of convolutional layers that process the input images
# and extract features. These layers apply filters to the input to create feature maps that
# represent various aspects of the input data.

# Classifier: after the feature extractor, the model includes a classifier, which is typically
# made up of fully connected layers (also known as linear layers in PyTorch). The classifier's job
# is to take the features extracted by the convolutional layers and use them to classify the input
# image into various categories (e.g., different types of objects in an image recognition task).

model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=3, bias=True)
)

In [18]:
# Keep only the leading classifier layers so the model emits a feature vector
# from the classifier instead of the final class scores.
NUM_FEATURE_LAYERS = 4
model.classifier = model.classifier[:NUM_FEATURE_LAYERS]

In [19]:
# Show the truncated classifier to confirm the slice took effect.
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
)

In [20]:
# Image preprocessing applied to every image before the forward pass.
# Compose runs the listed steps in order: convert the image array to a tensor,
# then normalize each of the three channels with the given mean and std.
# NOTE(review): these look like the standard ImageNet channel statistics — confirm
# they match what the model was trained with.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transformer = transforms.Compose(preprocessing_steps)

# Custom dataset for loading every image found in one flat folder.
# It does not follow the usual layout of folder -> one folder per class -> images;
# we just want a single folder of images and no real labels.
class ForwardPassDataset(torch.utils.data.Dataset):
    """Serve the images of a single directory for a forward pass through the CNN.

    Attributes:
        image_dir: directory whose entries are treated as image files.
        image_list: file names in ``image_dir``, sorted so that the index -> image
            mapping is the same on every run.
        transformer: callable applied to each loaded image array.
    """

    def __init__(self, image_dir, transformer):
        """Record the directory, list its files and store the transform.

        Args:
            image_dir: path of the folder that holds the images.
            transformer: callable that turns a loaded image array into a tensor.
        """
        self.image_dir = image_dir
        # os.listdir returns entries in arbitrary order; sorting makes the order
        # (and therefore any index derived from it) reproducible.
        self.image_list = sorted(os.listdir(self.image_dir))
        self.transformer = transformer

    def __len__(self):
        """Return the number of files found in ``image_dir``."""
        return len(self.image_list)

    def __getitem__(self, index):
        """Return ``(image_tensor, -1)`` for the file at position ``index``."""
        image_name = self.image_list[index]

        # Load and transform the image.
        X = self.filename_to_im_tensor(os.path.join(self.image_dir, image_name))

        # Dataloaders need to return a label, but the forward pass ignores it,
        # so a placeholder of -1 is returned.
        return X, -1

    def filename_to_im_tensor(self, file):
        """Read the image at ``file`` and return it after applying the transform."""
        # Keep only the first three channels (drops e.g. an alpha channel).
        im = plt.imread(file)[:, :, :3]
        return self.transformer(im)

model.eval()  # evaluation mode (instead of training mode) for the forward pass
classes = [0, 1, 2]  # names of the sub-folders under <CNN_TRAIN_IMAGE_DIR>/valid

# feats holds one 4096-dimensional row per validation image (rows where is_train is False).
# image_order records which image file each row of feats belongs to.
feats = np.zeros(((~df_images['is_train']).sum(), 4096))
image_order = []
next_row = 0  # index of the first row of feats that has not been filled yet

# No gradients are needed for feature extraction; no_grad avoids building the
# autograd graph for every batch.
with torch.no_grad():
    for c in classes:
        # use the validation images of this class folder for the forward pass
        dataset = ForwardPassDataset(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid', str(c)), transformer)
        # shuffle=False keeps batches in the same order as dataset.image_list
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)
        image_order += dataset.image_list
        # forward pass for this class
        for inputs, _ in tqdm(dataloader):
            batch_len = len(inputs)
            outputs = model(inputs.to(device))
            feats[next_row:next_row + batch_len, :] = outputs.cpu().numpy()
            next_row += batch_len

# Every preallocated row must have been written; otherwise feats would silently
# contain all-zero feature vectors.
assert next_row == len(feats), f'filled {next_row} feature rows but expected {len(feats)}'

  0%|          | 0/410 [00:00<?, ?it/s]

  0%|          | 0/330 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

In [21]:
# Inspect the extracted feature matrix (one row per validation image, 4096 columns).
feats

array([[-0.4096325 ,  0.18340945,  0.66169846, ..., -0.67364454,
        -0.27376688, -0.28155813],
       [ 0.35640758,  0.06036946, -0.51295668, ..., -0.3266423 ,
        -0.49054784, -0.00386778],
       [ 0.04689836,  0.50569928, -0.59484124, ..., -0.21110877,
        -0.02411379,  0.34765425],
       ...,
       [ 0.02955244,  0.44402122, -0.50215864, ...,  0.42083105,
         0.81993848, -0.56816518],
       [-0.54984337,  0.45200229, -0.80329543, ..., -0.66747063,
        -0.10088408,  0.05845577],
       [-0.54904073,  0.41596875, -0.91849011, ...,  0.63263136,
         0.71722877, -1.40478349]])

In [22]:
# Map each image file name to the row of feats that holds its feature vector.
feat_row_numbers = np.arange(len(image_order))
forward_pass_df = pd.DataFrame.from_dict({
    'image_name': image_order,
    'feat_index': feat_row_numbers,
})
forward_pass_df.head()

Unnamed: 0,image_name,feat_index
0,-10.028112114259333_33.44129297258391_-9.99917...,0
1,-10.034778406662008_33.93980560787962_-10.0497...,1
2,-10.049750328064_33.99969329348758_-10.0497503...,2
3,-10.0754804611206_33.345972449905325_-10.07548...,3
4,-10.094666092269977_33.99969329348758_-10.0497...,4


In [23]:
# Attach each image's feat_index to its metadata; the default inner join keeps only
# images that appear in both tables.
df_consumption = df_images.merge(forward_pass_df, on='image_name')

In [24]:
# Sanity check: the merged table must have as many rows as there are
# non-training (validation) images in df_images.
expected_rows = (~df_images['is_train']).sum()
assert len(df_consumption) == expected_rows

In [25]:
# Preview the merged table: image metadata plus each image's row index into feats.
df_consumption.head()

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,country,nightlights_bin,is_train,feat_index
0,-17.123474497711186_35.22319565304562_-17.0935...,-17.123474,35.223196,-17.093531,35.253139,1.994561,0.034344,mw,0,False,1302
1,-17.138446419113176_35.268111417251596_-17.093...,-17.138446,35.268111,-17.093531,35.253139,1.994561,0.034344,mw,0,False,1303
2,-17.063586812103217_35.29805526005558_-17.0935...,-17.063587,35.298055,-17.093531,35.253139,1.994561,0.034344,mw,0,False,1298
3,-17.110595314376873_35.13684616574092_-17.0656...,-17.110595,35.136846,-17.06568,35.16679,1.83421,0.001391,mw,0,False,1300
4,-17.0656795501709_35.13684616574092_-17.065679...,-17.06568,35.136846,-17.06568,35.16679,1.83421,0.001391,mw,0,False,1299


## Aggregate Features
For each country, we average the image features within each cluster and save the result to `results/<country>/cnn`.

In [26]:
# Country codes as they appear in the 'country' column, paired by position with
# the results sub-folder each country is saved to.
country_abbrv = ['mw', 'eth', 'ng']
country_dir = ['malawi_2019', 'ethiopia_2019', 'nigeria_2019']


def aggregate_cluster_features(df_country, image_feats):
    """Average the image feature vectors of every cluster in one country.

    Args:
        df_country: rows of the merged image table for a single country; must have
            the columns 'cluster_lat', 'cluster_lon' and 'feat_index'.
        image_feats: 2-D array whose row ``feat_index`` is that image's feature vector.

    Returns:
        A tuple ``(cluster_feats, cluster_order)`` where ``cluster_feats`` has one
        row per cluster (the mean of its images' feature vectors) and
        ``cluster_order`` lists the matching ``[lat, lon]`` pair for each row.
    """
    clusters = df_country.groupby(['cluster_lat', 'cluster_lon'])
    cluster_feats = np.zeros((clusters.ngroups, image_feats.shape[1]))
    cluster_order = []
    for row, ((lat, lon), cluster_images) in enumerate(clusters):
        # Use the group's own rows, so only this country's images are averaged
        # and the full table is not rescanned for every cluster.
        feat_rows = cluster_images['feat_index'].to_numpy()
        cluster_feats[row, :] = image_feats[feat_rows].mean(axis=0)
        cluster_order.append([lat, lon])
    return cluster_feats, cluster_order


for ca, cd in zip(country_abbrv, country_dir):
    df_c = df_consumption[df_consumption['country'] == ca]
    # x: aggregated features per cluster; cluster_list: the (lat, lon) of each row of x
    x, cluster_list = aggregate_cluster_features(df_c, feats)

    # save to the correct directory
    save_dir = os.path.join(RESULTS_DIR, cd, 'cnn')
    os.makedirs(save_dir, exist_ok=True)
    np.save(os.path.join(save_dir, 'cluster_feats.npy'), x)
    # the with-block closes the file even if pickling fails
    with open(os.path.join(save_dir, 'cluster_order.pkl'), 'wb') as cluster_file:
        pickle.dump(cluster_list, cluster_file)
    