In [None]:
!pip install -U kaleido
!pip install plotly
!pip install --upgrade torchvision
!pip3 install -e dollarstreet

## SETUP: Set constants

In [None]:
import os

# Folder (below ROOT_DIR) that receives every copied or generated asset
ASSETS_FOLDER = 'experiments/will_2022_10_15'


# DATASETS ############################################################

# Name of the datasets subfolder inside the assets folder
DATASETS_FOLDER = 'datasets'

# Locations of the source images and split CSVs, relative to ROOT_DIR
ROOT_DIR = ''
IMAGE_FOLDER = 'images_v2'
TRAIN_FILE = 'images_v2_imagenet_train_subset_seedcheck.csv'
TEST_FILE = 'images_v2_imagenet_test_subset_seedcheck.csv'
VALIDATION_FILE = 'images_v2_imagenet_validation_subset_seedcheck.csv'

# Quantile settings: the column that gets bucketed, the column that stores
# the bucket index, and the bucket indices themselves
qt_target_col = 'income'
qt_col = 'quantile'
qts = range(4)

# Switches for writing the generated dataset CSVs (the original note
# recommends turning them off when the notebook is only used for plotting)
ALL_SAVE_DATASETS = True
QT_SAVE_DATASETS = True


def _per_quantile_names(csv_name):
    """Map each quantile index in ``qts`` to ``<csv stem>_qt<index>.csv``."""
    stem, _extension = os.path.splitext(csv_name)
    return {qt: f'{stem}_qt{qt}.csv' for qt in qts}


# File names of the per-quantile CSVs derived from each split
QT_TRAIN_FILES = _per_quantile_names(TRAIN_FILE)
QT_VAL_FILES = _per_quantile_names(VALIDATION_FILE)
QT_TEST_FILES = _per_quantile_names(TEST_FILE)



# TRAINING ##############################################################

# Name of the models subfolder inside the assets folder
MODELS_FOLDER = 'models'

# Switches for running training and pickling the result (the original note
# recommends turning them off when the notebook is only used for plotting)
ALL_TRAIN_AND_SAVE_MODELS = True
QT_TRAIN_AND_SAVE_MODELS = True


# Candidate seeds; the cells in this notebook only ever use the first entry
training_seeds = [0, 1, 2, 3, 4]

# Epoch count when training on the FULL training set
NUM_EPOCHS = 15


# PLOTS ##############################################################

# Name of the plots subfolder inside the assets folder
PLOTS_FOLDER = 'plots'

# 1. Generate datasets and models

## 1.1 Create and save train/val/test split data

In [None]:
# Create the datasets output folder up front; an existing folder is left as is
datasets_dir = os.path.join(ROOT_DIR, ASSETS_FOLDER, DATASETS_FOLDER)
os.makedirs(datasets_dir, exist_ok=True)

### 1.1.1 For all data

In [None]:
import os

import pandas as pd

# Copy every source split CSV into the experiment's datasets folder.
# Each file is always read; only the write is gated by ALL_SAVE_DATASETS.
datasets_dir = os.path.join(ROOT_DIR, ASSETS_FOLDER, DATASETS_FOLDER)
for filename in (TRAIN_FILE, VALIDATION_FILE, TEST_FILE):
    source_path = os.path.join(ROOT_DIR, filename)
    df = pd.read_csv(source_path)
    save_path = os.path.join(datasets_dir, filename)
    if not ALL_SAVE_DATASETS:
        continue
    df.to_csv(save_path, index=False)
    print(f'Saved {save_path}')
print()

### 1.1.2 For quartiles

In [None]:
import os

import pandas as pd

from dollarstreet.datasets import get_csv_dataset
from dollarstreet.dataloaders import get_loader


# Read the three source splits (train, validation, test - in that order) and
# stack them so the quantile boundaries are computed over all of the data
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, split_file))
    for split_file in (TRAIN_FILE, VALIDATION_FILE, TEST_FILE)
)
all_data_df = pd.concat([train_df, val_df, test_df])

# Interior quantile boundaries of the target column: one per quantile except
# the last, whose upper boundary is never needed for bucketing
all_data_quantiles = [
    all_data_df[qt_target_col].quantile((i + 1) / len(qts))
    for i in qts[:-1]
]

# Define quantile function using boundaries
def get_quantile_from_value(qt_target_col_value: float, bounds=None) -> int:
    """Return the index of the quantile bucket a value falls into.

    Buckets are delimited by ascending upper boundaries. A value belongs to
    the first bucket whose boundary is greater than or equal to it (the
    boundaries are inclusive); a value above every boundary belongs to the
    final bucket, whose index equals the number of boundaries.

    Args:
        qt_target_col_value: Value of the quantile target column for one row.
            Any number comparable with the boundaries is accepted.
        bounds: Optional sequence of ascending upper boundaries. When omitted,
            the notebook-level ``all_data_quantiles`` list is used, which is
            the behaviour existing single-argument callers rely on.

    Returns:
        Bucket index in ``0 .. len(bounds)``. A NaN value compares false
        against every boundary and therefore lands in the final bucket.
    """
    if bounds is None:
        # Looked up at call time so re-running the boundary cell takes effect
        bounds = all_data_quantiles
    for quantile, quantile_bound in enumerate(bounds):
        if qt_target_col_value <= quantile_bound:
            return quantile
    return len(bounds)

# Pair each split's dataframe with the per-quantile file names it is saved under
split_dfs = {
    'train': (train_df, QT_TRAIN_FILES),
    'val': (val_df, QT_VAL_FILES),
    'test': (test_df, QT_TEST_FILES),
}
qt_save_dir = os.path.join(ROOT_DIR, ASSETS_FOLDER, DATASETS_FOLDER)
for split, (df, qt_files) in split_dfs.items():
    # Label every row with its quantile index (the column is added to the
    # split's dataframe in place), then carve out one sub-frame per quantile
    df[qt_col] = df[qt_target_col].apply(get_quantile_from_value)
    qt_dfs = {qt: df.loc[df[qt_col] == qt] for qt in qts}

    # Report summary statistics of the target column within each quantile
    print(f'{split} set quantiles')
    for qt in qts:
        values = qt_dfs[qt][qt_target_col]
        print(f'Quantile {qt + 1}/{len(qts)} '
              f'min: {values.min()} '
              f'max: {values.max()} '
              f'mean: {values.mean()} '
              f'median: {values.median()} '
              f'count: {len(values)}')

    # Write one CSV per quantile unless saving is switched off
    for qt in qts:
        save_path = os.path.join(qt_save_dir, qt_files[qt])
        if QT_SAVE_DATASETS:
            qt_dfs[qt].to_csv(save_path, index=False)
            print(f'Saved {save_path}')
    print()

## 1.2 Create dataset and dataloader objects

### 1.2.1 For all data

In [None]:
import os

import torch
import torch.nn as nn

import dollarstreet.constants as c
from dollarstreet.datasets import get_csv_dataset
from dollarstreet.dataloaders import get_loader

# Path of each split's CSV copy inside the experiment's datasets folder
split_dfs = {
    split: os.path.join(ROOT_DIR, ASSETS_FOLDER, DATASETS_FOLDER, csv_name)
    for split, csv_name in (
        ('train', TRAIN_FILE),
        ('val', VALIDATION_FILE),
        ('test', TEST_FILE),
    )
}

# One dataset per split; only the 'train' split is built with train=True
all_datasets = {
    split: get_csv_dataset(
        csv_file=csv_path,
        root_dir=os.path.join(ROOT_DIR, IMAGE_FOLDER),
        train=(split == 'train'),
        explode=True)
    for split, csv_path in split_dfs.items()
}

# One loader per split: the train loader keeps get_loader's default batch
# size, the evaluation loaders are requested with batch_size=64
all_loaders = {}
for split in ('train', 'val', 'test'):
    loader_kwargs = {} if split == 'train' else {'batch_size': 64}
    all_loaders[split] = get_loader(all_datasets[split], **loader_kwargs)

print(all_datasets)
print()
print(all_loaders)

### 1.2.2 For quartiles

In [None]:
import os

import torch
import torch.nn as nn

import dollarstreet.constants as c
from dollarstreet.datasets import get_csv_dataset
from dollarstreet.dataloaders import get_loader


# Per-quantile CSV file names for each split
split_dfs = {
    'train': QT_TRAIN_FILES,
    'val': QT_VAL_FILES,
    'test': QT_TEST_FILES,
}
qt_csv_dir = os.path.join(ROOT_DIR, ASSETS_FOLDER, DATASETS_FOLDER)
images_dir = os.path.join(ROOT_DIR, IMAGE_FOLDER)

# qt_datasets[quantile][split] -> dataset; only 'train' is built with train=True
qt_datasets = {qt: {} for qt in qts}
for split, qt_files in split_dfs.items():
    is_train_split = (split == 'train')
    for qt in qts:
        qt_datasets[qt][split] = get_csv_dataset(
            csv_file=os.path.join(qt_csv_dir, qt_files[qt]),
            root_dir=images_dir,
            train=is_train_split,
            explode=True)

# qt_loaders[quantile][split] -> loader: train loaders keep get_loader's
# default batch size, evaluation loaders are requested with batch_size=64
qt_loaders = {qt: {} for qt in qts}
for split in ('train', 'val', 'test'):
    loader_kwargs = {} if split == 'train' else {'batch_size': 64}
    for qt in qts:
        qt_loaders[qt][split] = get_loader(
            qt_datasets[qt][split], **loader_kwargs)

print(qt_datasets)
print()
print(qt_loaders)

## 1.3 Train and save FT-resnet models

In [None]:
# Create the models output folder up front; an existing folder is left as is
models_dir = os.path.join(ROOT_DIR, ASSETS_FOLDER, MODELS_FOLDER)
os.makedirs(models_dir, exist_ok=True)

### 1.3.1 For all training data

In [None]:
import pickle
from dollarstreet.run import train_models

if ALL_TRAIN_AND_SAVE_MODELS:
    # Architectures to fine tune (the log description names only the first)
    model_names = ['resnet']

    # Epoch budget for the full training set
    train_epochs = NUM_EPOCHS

    # Fine tune using the loaders built from all data. Only the first value
    # returned by train_models is kept; the other two are discarded.
    all_ftmodel = None
    description = (
        f'Fine tuning pre-trained {model_names[0]} '
        f'model for {train_epochs} epochs on all training data '
    )
    train_kwargs = dict(
        model_names=model_names,
        dataloaders=all_loaders,
        num_epochs=train_epochs,
        seed=training_seeds[0],
        save_log=True,
        description=description,
    )
    all_ftmodel, _, _ = train_models(**train_kwargs)

    # Pickle the result into the models folder, named after the training CSV
    train_stem = os.path.splitext(TRAIN_FILE)[0]
    filename = f'{train_stem}_ftresnet.pickle'
    save_path = os.path.join(
        ROOT_DIR, ASSETS_FOLDER, MODELS_FOLDER, filename)
    with open(save_path, 'wb') as model_file:
        pickle.dump(all_ftmodel, model_file)
    print(f'Fine tuned resnet model saved to {save_path}')

### 1.3.2 For quartile training data

In [None]:
import pickle
from dollarstreet.run import train_models

if QT_TRAIN_AND_SAVE_MODELS:
    # Architectures to fine tune (the log description names only the first)
    model_names = ['resnet']

    # Epoch budget per quartile model: the full-data budget multiplied by the
    # number of quantiles
    train_epochs = NUM_EPOCHS * len(qts)

    # Fine tune one model per quartile and pickle each as soon as it is ready
    qt_ftmodels = {}
    models_dir = os.path.join(ROOT_DIR, ASSETS_FOLDER, MODELS_FOLDER)
    for qt in qts:
        description = (
            f'Fine tuning pre-trained {model_names[0]} '
            f'model for {train_epochs} epochs on quartile {qt} '
            'of training data'
        )
        train_kwargs = dict(
            model_names=model_names,
            dataloaders=qt_loaders[qt],
            num_epochs=train_epochs,
            seed=training_seeds[0],
            save_log=True,
            description=description,
        )
        # Only the first value returned by train_models is kept
        qt_ftmodels[qt], _, _ = train_models(**train_kwargs)

        # File name is derived from this quartile's training CSV name
        qt_train_stem = os.path.splitext(QT_TRAIN_FILES[qt])[0]
        filename = f'{qt_train_stem}_ftresnet.pickle'
        save_path = os.path.join(models_dir, filename)
        with open(save_path, 'wb') as model_file:
            pickle.dump(qt_ftmodels[qt], model_file)
        print(f'Fine tuned resnet model saved to {save_path}')

# 2. Figure 6

In [None]:
# Create the plots output folder up front; an existing folder is left as is
plots_dir = os.path.join(ROOT_DIR, ASSETS_FOLDER, PLOTS_FOLDER)
os.makedirs(plots_dir, exist_ok=True)

## 2.1 Score pre-trained classifiers on quartile test data

In [None]:
from dollarstreet.run import validate_models

# Pre-trained architectures to score
model_names = [
    'resnet', 'squeezenet', 'densenet', 'mobilenet',
    'efficientnet', 'shufflenet', 'visionnet',
]

# Score every architecture once per income quantile and keep both accuracies.
# NOTE(review): validate_models receives the quantile's whole loader dict
# (train/val/test); presumably it picks the split to score itself - confirm.
results_pt = {}
n_quantiles = len(qts)
for qt in qts:
    progress = f'{qt + 1}/{n_quantiles}'
    description = (
        f'Scoring all pre-trained models on {progress} quantile '
        'of test data.'
    )
    scores = validate_models(model_names=model_names,
                             dataloaders=qt_loaders[qt],
                             seed=training_seeds[0],
                             save_log=False,
                             description=description)
    _, top1, top5 = scores
    results_pt[qt] = dict(top1=top1, top5=top5)
    print(f'Finished scoring {progress} quantile')

## 2.2 Score FT-resnet on quartile test data

In [None]:
# Imported here so this cell also runs when the training cells (the only other
# place that imports pickle) were skipped, e.g. when only re-plotting
import os
import pickle

from dollarstreet.run import validate_model

# Load the resnet that was fine tuned on all training data.
# NOTE: pickle.load can execute arbitrary code from the file - only load
# pickles that were produced by this notebook's own training cells.
filename = f'{os.path.splitext(TRAIN_FILE)[0]}_ftresnet.pickle'
model_save_path = os.path.join(
        ROOT_DIR, ASSETS_FOLDER, MODELS_FOLDER, filename)

with open(model_save_path, 'rb') as handle:
    # The pickle holds a mapping keyed by model name; take the resnet entry
    model = pickle.load(handle)['resnet']

# Score the fine-tuned model once per income quantile:
# results_ft[quantile] -> {'top1': ..., 'top5': ...}
results_ft = {}
for qt in qts:
    description = (
        f'Scoring all fine-tuned model on {qt + 1}/{len(qts)} quantile '
        'of test data.'
    )
    # NOTE(review): the quantile's whole loader dict (train/val/test) is
    # passed; presumably validate_model picks the split itself - confirm.
    _, top1, top5 = validate_model(model=model,
                                   dataloaders=qt_loaders[qt],
                                   seed=training_seeds[0],
                                   save_log=False,
                                   description=description)
    results_ft[qt] = {'top1': top1, 'top5': top5}
    print(f'Finished scoring {qt + 1}/{len(qts)} quantile')

## 2.3 Plot figure 6

In [None]:
import plotly.graph_objects as go

# Income-range tick labels, one per quartile, in quartile order
quartile_labels = ['$27-200', '$200-684', '$685-1,997', '$1,998-19,671']

# Bar order within each group: pre-trained models first, fine-tuned resnet last
pretrained_labels = [
    'squeezenet',
    'shufflenet',
    'resnet',
    'mobilenet',
    'densenet',
    'efficientnet',
    'visionnet',
]
ft_label = 'fine-tuned resnet'
model_labels = pretrained_labels + [ft_label]

# Top 5 accuracy per model, listed per quartile. Pre-trained results are keyed
# by model name; the fine-tuned results hold a single model's scores.
acc_top5 = {
    label: [results_pt[qt]['top5'][label][0].tolist() for qt in qts]
    for label in pretrained_labels
}
acc_top5[ft_label] = [results_ft[qt]['top5'][0].tolist() for qt in qts]

# One bar trace per model
bars = [
    go.Bar(name=label, x=quartile_labels, y=acc_top5[label])
    for label in model_labels
]
fig = go.Figure(data=bars)

# Plot ########################################

# Title position, axis titles and base font
fig.update_layout(
    title_x=0.5,
    title_y=0.85,
    xaxis_title="Monthly income (USD)",
    yaxis_title="Top 5 accuracy (%)",
    title_font_family="Times New Roman",
    font={'size': 12, 'color': "black"},
)

# Legend fonts
fig.update_layout(legend={
    'title_font_family': "Times New Roman",
    'font': {'size': 12, 'color': "black"},
})

# Theme and grouped (side-by-side) bars
fig.update_layout(template='seaborn', barmode='group')

# Show
fig.show()

# Save into the experiment's plots folder
filename = 'Figure6.pdf'
save_path = os.path.join(
    ROOT_DIR, ASSETS_FOLDER, PLOTS_FOLDER, filename)
fig.write_image(save_path)
print(f'Saved figure to {save_path}')

In [None]:
# Summary numbers quoted in the publication
ft_label = 'fine-tuned resnet'
sum_ft = 0
for qt in qts:
    # Top 5 scores of every pre-trained model on this quartile
    pt_scores = [
        top5[qt] for model, top5 in acc_top5.items() if model != ft_label
    ]
    sum_pt = sum(pt_scores)
    count_pt = len(pt_scores)

    # The fine-tuned score keeps accumulating across quartiles
    sum_ft += sum(
        top5[qt] for model, top5 in acc_top5.items() if model == ft_label
    )

    print(f'average for all pre-trained models for quartile {qt}: {(sum_pt/count_pt):.2f}')

print(f'average for fine-tuned resnet across all quartiles: {(sum_ft/len(qts)):.2f}')

# 3. Supplemental figure: train and score by quantile

## 3.1 Score pre-trained classifiers on quartile test data

In [None]:
from dollarstreet.run import validate_models

# Pre-trained architectures to score. This cell repeats the scoring done in
# section 2.1 with identical arguments and rebuilds results_pt from scratch.
model_names = [
    'resnet', 'squeezenet', 'densenet', 'mobilenet',
    'efficientnet', 'shufflenet', 'visionnet',
]

# results_pt[quantile] -> {'top1': ..., 'top5': ...} for all architectures.
# NOTE(review): validate_models receives the quantile's whole loader dict
# (train/val/test); presumably it picks the split to score itself - confirm.
results_pt = {}
n_quantiles = len(qts)
for qt in qts:
    progress = f'{qt + 1}/{n_quantiles}'
    description = (
        f'Scoring all pre-trained models on {progress} quantile '
        'of test data.'
    )
    scores = validate_models(model_names=model_names,
                             dataloaders=qt_loaders[qt],
                             seed=training_seeds[0],
                             save_log=False,
                             description=description)
    _, top1, top5 = scores
    results_pt[qt] = dict(top1=top1, top5=top5)
    print(f'Finished scoring {progress} quantile')

## 3.2 Score quartile-FT-resnets on quartile test data

In [None]:
# Imported here so this cell also runs when the training cells (the only other
# place that imports pickle) were skipped, e.g. when only re-plotting
import os
import pickle

from dollarstreet.run import validate_model

# Load the resnets that were fine tuned on each quartile, keyed 'model_qt<N>'.
# NOTE: pickle.load can execute arbitrary code from the file - only load
# pickles that were produced by this notebook's own training cells.
qt_models = {}
for qt in qts:
    filename = f'{os.path.splitext(QT_TRAIN_FILES[qt])[0]}_ftresnet.pickle'
    model_save_path = os.path.join(
            ROOT_DIR, ASSETS_FOLDER, MODELS_FOLDER, filename)

    with open(model_save_path, 'rb') as handle:
        # The pickle holds a mapping keyed by model name; take the resnet entry
        qt_models[f'model_qt{qt}'] = pickle.load(handle)['resnet']

# Score every quartile-specific model on every quartile:
# results_ft[data quantile]['top1' | 'top5']['model_qt<N>'] -> score
results_ft = {}
for qt in qts:
    top1 = {}
    top5 = {}
    for name, model in qt_models.items():
        description = (
            f'Scoring {name} fine-tuned model on {qt + 1}/{len(qts)} quantile '
            'of test data.'
        )
        # NOTE(review): the quantile's whole loader dict (train/val/test) is
        # passed; presumably validate_model picks the split itself - confirm.
        _, model_top1, model_top5 = validate_model(model=model,
                                                   dataloaders=qt_loaders[qt],
                                                   seed=training_seeds[0],
                                                   save_log=False,
                                                   description=description)
        top1[name] = model_top1
        top5[name] = model_top5

    results_ft[qt] = {'top1': top1, 'top5': top5}
    print(f'Finished scoring {qt + 1}/{len(qts)} quantile')

## 3.3 Generate all quartile plots

In [None]:
import plotly.graph_objects as go

# Income-range tick labels are the same for every figure, so define them once
quartile_labels = ['$27-200', '$200-684', '$685-1,997', '$1,998-19,671']

# One figure per quartile-specific fine-tuned model. Iterate the configured
# quantile indices (qts) instead of a hard-coded count so the plots stay in
# step with the models that were trained and scored above.
for plot_qt in qts:
    quartile_ft_label = f'resnet fine-tuned with quartile{plot_qt}'
    model_labels = [
        'resnet',
        quartile_ft_label,
    ]

    # Top 5 accuracy per model, listed per data quartile: the pre-trained
    # resnet versus the resnet fine tuned on quartile `plot_qt`
    acc_top5 = {}
    for model in model_labels:
        results = []
        for qt in qts:
            if model == quartile_ft_label:
                results.append(results_ft[qt]['top5'][f'model_qt{plot_qt}'][0].tolist())
            else:
                results.append(results_pt[qt]['top5'][model][0].tolist())
        acc_top5[model] = results

    # Add data to figure, with a fixed colour per series
    colors = {'resnet':'#2ca02c', quartile_ft_label:'#7f7f7f'}
    fig = go.Figure(
        data=[
            go.Bar(name=model, x=quartile_labels, y=acc_top5[model], marker_color=colors[model])
            for model in model_labels
        ],
    )

    # Plot ########################################

    # Title and axis format. The y-range is pinned to the same values for
    # every figure - presumably so the per-quartile plots share one scale.
    fig.update_layout(
        title_x=0.5,
        title_y=0.85,
        xaxis_title="Monthly income (USD)",
        yaxis_title="Top 5 accuracy (%)",
        title_font_family="Times New Roman",
        font=dict(
            size=12,
            color="black"),
        yaxis_range=[0,83],
    )

    # Legend format
    fig.update_layout(legend=dict(
        title_font_family="Times New Roman",
        font=dict(
            size=12,
            color="black"),
    ))

    # Overall layout
    fig.update_layout(template='seaborn')
    fig.update_layout(barmode='group')

    # Show
    fig.show()

    # Save one PDF per fine-tuned model into the experiment's plots folder
    filename = f'Supplemental_fig1_qt{plot_qt}.pdf'
    save_path = os.path.join(
        ROOT_DIR, ASSETS_FOLDER, PLOTS_FOLDER, filename)
    fig.write_image(save_path)
    print(f'Saved figure to {save_path}')