## imports

In [12]:
!python -m pip install --upgrade pip
!pip install pyarrow
!pip install pandarallel
!pip install pandas
!pip install mxnet
!pip install autogluon
!pip install gluoncv

import numpy as np
import pandas as pd
from pandarallel import pandarallel
import sagemaker
import boto3
import os, time
import autogluon as ag
import mxnet as mx
from mxnet import nd, gluon, init, autograd
from mxnet.gluon import nn
import pickle

os.environ['NUMEXPR_MAX_THREADS'] = str(os.cpu_count())
pandarallel.initialize()
s3_client = boto3.client('s3')

mx.random.seed(127)
#gpus = mx.test_utils.list_gpus()
contexts = [mx.cpu()] #[mx.gpu(i) for i in gpus] if len(gpus) > 0 else [mx.cpu()]

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (20.2.1)


INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
!pip list
!python --version

Package                            Version
---------------------------------- -------------------
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-project                   0.8.3
argh                               0.26.2
asn1crypto                         1.3.0
astroid                            2.3.3
astropy                            4.0.1.post1
atomicwrites                       1.3.0
attrs                              19.3.0
autogluon                          0.0.12
Automat                            20.2.0
autopep8                           1.4.4
autovizwidget                      0.15.0
awscli                             1.18.104
Babel                              2.8.0
backcall                           0.1.0
backports.shutil-get-terminal-size 1.0.0
bcrypt                             3.1.7
beautifulsoup4                     4.8.2
bitarray                           1.2.1
bkcharts                           0.2
bleach                      

Python 3.6.10 :: Anaconda, Inc.


## load data / parameters

In [3]:
train_dataset_path = './Data/container_formatted/train.csv'
model_path = './opt/ml/model/'
n_hours = 0
n_mins = 15
time_limits = 60 * 60 * n_hours + 60 * n_mins # n_hours + n_mins
eval_metric = 'accuracy'
id_column = 'ID'

In [4]:
# text/numeric data
dataset = pd.read_csv(train_dataset_path)
print('data read')

data read


In [5]:
# create category mapping (to define non-clashing dummy category)
target_column = "label"
cats = dataset[target_column].unique()
numcats = len(cats)
intcats = list(range(numcats))
catmap = dict(zip(cats, intcats))
invcatmap = dict(zip(intcats, cats))

# map labels to new categories
labels = dataset[target_column]
mappedlabels = pd.Series([catmap[x] for x in labels])

## data clean / suggested label generation

In [6]:
import re, string, base64
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from io import StringIO, BytesIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()
def clean_text(data, labelled=False):
    lemmatizer = WordNetLemmatizer()
    nltk.download('wordnet')
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    # scrub possible html
    # stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    def strip_tags(html):
        s = MLStripper()
        s.feed(html)
        return s.get_data()
    # lowercases and removes special characters
    def clean_val(val):
        text = val
        text = strip_tags(text)
        if type(val) == str:
            text = text.lower().strip()
            text = re.compile(r'[%s]' % re.escape(string.punctuation)).sub(' ', text)
            text = re.sub(r'\s+', ' ', text)
            words = [w for w in text.split(" ") if not w in stop_words]
            text = " ".join([lemmatizer.lemmatize(w) for w in words])
        return text
    # get text cols
    text_cols = [x for x in data.columns if x.endswith('_text')]
    if len(text_cols) == 0:
        print('no text columns found.')
        return pd.DataFrame(), None
    print('text columns found:', text_cols)
    text_data = data[text_cols].copy()
    # lazy impute text
    text_data = text_data.fillna('')
    # clean text cols
    for col in text_cols:
        print('text cleaning:', col)
        text_data[col] = text_data[col].parallel_apply(clean_val)
    text_label = None
    if labelled:
        print('working on labels...')
        text_label = mappedlabels.copy()
        map2bad = text_data[text_cols[0]] == ''
        for col in text_cols[1:]:
            map2bad = np.logical_and(map2bad, text_data[col] == '')
        text_label[map2bad] = numcats
    print("done transforming text data.")
    return text_data, text_label
def clean_num(data, labelled=False):
    # get num cols
    num_cols = [x for x in data.columns if x.endswith('_num')]
    if len(num_cols) == 0:
        print('no numeric columns found.')
        return pd.DataFrame(), None
    print('numeric columns found:', num_cols)
    num_data = data[num_cols].copy()
    # impute numeric data
    num_data = num_data.fillna(0)
    num_label = None
    if labelled:
        print('working on labels...')
        num_label = mappedlabels.copy()
    print("done transforming numeric data.")
    return num_data, num_label
def clean_cat(data, labelled=False):
    # get cat cols
    cat_cols = [x for x in data.columns if x.endswith('_cat')]
    if len(cat_cols) == 0:
        print('no categorical columns found.')
        return pd.DataFrame(), None
    print('categorical columns found:', cat_cols)
    cat_data = data[cat_cols].copy()
    # impute categorical data
    cat_data = cat_data.fillna('unknown')
    cat_data_out = pd.get_dummies(cat_data)
    cat_label = None
    if labelled:
        print('working on labels...')
        cat_label = mappedlabels.copy()
    print("done transforming categorical data.")
    return cat_data_out, cat_label
def clean_image(data, labelled=False):
    import cv2
    # normalize image
    def normalizeimg(img):
        img = img.transpose((2, 0, 1)).expand_dims(axis=0)
        rgb_mean = nd.array([0.485, 0.456, 0.406]).reshape((1,3,1,1))
        rgb_std = nd.array([0.229, 0.224, 0.225]).reshape((1,3,1,1))
        return (img.astype('float32') / 255 - rgb_mean) / rgb_std
    def cleanimg(img_bytes):
        if img_bytes == '':
            return nd.array([])#normalizeimg(mx.nd.array([[[0]*3]*224]*244))
        img = mx.image.imdecode(base64.b64decode(img_bytes))
        img = mx.image.resize_short(img, 256)
        img, _ = mx.image.center_crop(img, (224, 224))
        return normalizeimg(img)
    # get pretrained resnet50
    def getresnet():
        net = gluon.model_zoo.vision.resnet50_v1(pretrained=True, ctx=contexts)
        print('image pre-model created...')
        return net
    # get cat cols
    img_cols = [x for x in data.columns if x.endswith('_image')]
    if len(img_cols) == 0:
        print('no image columns found.')
        return pd.DataFrame(), None
    print('image columns found:', img_cols)
    img_data = data[img_cols].copy()
    # impute categorical data
    img_data = img_data.fillna('')
    # initialize model to transform images to resnet outputs
    img_model = getresnet()
    # create suggested img labels
    badimg_pred = img_model(normalizeimg(mx.nd.array([[[0]*3]*224]*244)))
    def get_pred(x):
        if len(x) == 0:
            return badimg_pred
        else:
            return img_model(x)
    map2bads =  []
    for col in img_cols:
        print('cleaning:', col)
        map2bads.append(img_data[col] == '')
        img_data[col] = img_data[col].parallel_apply(cleanimg)
        print('images cleaned. now pre-predicting...')
        img_data[col] = img_data[col].parallel_apply(get_pred)
    image_label = []
    if labelled:
        print('working on labels...')
        import functools
        mlabs = mappedlabels.copy()
        for i in range(len(map2bads)):
            image_label.append(mlabs.copy())
            image_label[i][map2bads[i]] = numcats
    print("done transforming image data.")
    return img_data, image_label

In [7]:
text_data, text_label = clean_text(dataset, labelled=True)
num_data, num_label = clean_num(dataset, labelled=True)
cat_data, cat_label = clean_cat(dataset, labelled=True)
img_data, img_label = clean_image(dataset, labelled=True)

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


text columns found: ['item_name_text', 'product_description_text', 'bullet_point_text', 'brand_text', 'manufacturer_text', 'generic_keyword_text', 'material_text']
text cleaning: item_name_text


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


text cleaning: product_description_text
text cleaning: bullet_point_text
text cleaning: brand_text
text cleaning: manufacturer_text
text cleaning: generic_keyword_text
text cleaning: material_text
working on labels...


Model file not found. Downloading to /home/ec2-user/.mxnet/models/resnet50_v1-0aee57f9.params.


done transforming text data.
numeric columns found: ['marketplace_id_num', 'number_of_items_num', 'case_pack_quantity_num', 'item_package_quantity_num', 'item_dimensions_height_num', 'item_dimensions_width_num', 'item_dimensions_length_num', 'normalized_item_weight_num', 'normalized_item_package_weight_num', 'list_price_value_num', 'list_price_value_with_tax_num']
working on labels...
done transforming numeric data.
categorical columns found: ['list_price_currency_cat']
working on labels...
done transforming categorical data.
image columns found: ['img_image']
Downloading /home/ec2-user/.mxnet/models/resnet50_v1-0aee57f9.zip1d20bc17-b876-4676-95aa-55994768ecba from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet50_v1-0aee57f9.zip...
image pre-model created...
cleaning: img_image
images cleaned. now pre-predicting...
working on labels...
done transforming image data.


In [9]:
# save model-specific values to pickle file for inference container
has_text = (len(text_data) != 0)
has_num = (len(num_data) != 0)
has_cat = (len(cat_data) != 0)
has_img = (len(img_data) != 0)
model_config = {
    "columns": dataset.columns.tolist(),
    "columns_text": text_data.columns.tolist(),
    "columns_num": num_data.columns.tolist(),
    "columns_cat": cat_data.columns.tolist(),
    "columns_img": img_data.columns.tolist(),
    "has_text": has_text,
    "has_num": has_num,
    "has_cat": has_cat,
    "has_img": has_img,
    "catmap": catmap,
    "invcatmap": invcatmap,
    "numcats": numcats,
    "cats": cats,
}
with open(model_path+'model_config.pkl', 'wb') as f:
    pickle.dump(model_config, f)

## training

### phase one - per-modality training

In [10]:
from autogluon import TabularPrediction as task

if has_text:
    print('loading text data')
    agluon_text_train_data = task.Dataset(pd.concat([text_data, pd.Series(text_label, name='label')], axis=1))
if has_num:
    print('loading num data')
    agluon_num_train_data = task.Dataset(pd.concat([num_data, pd.Series(num_label, name='label')], axis=1))
if has_cat:
    print('loading cat data')
    agluon_cat_train_data = task.Dataset(pd.concat([cat_data, pd.Series(cat_label, name='label')], axis=1))
if has_img:
    print('loading img data')
    agluon_img_train_data = []
    for i in range(len(img_data.columns)):
        from itertools import zip_longest
        curr_img_feature = pd.DataFrame.from_records(zip_longest(
            *img_data.iloc[:, i].parallel_apply(lambda x: x[0].asnumpy()).values)).transpose()
        agluon_img_train_data.append(task.Dataset(
            pd.concat([curr_img_feature, pd.Series(img_label[i], name='label')], axis=1)))
print('done')

loading text data
loading num data
loading cat data
loading img data
done


In [9]:
'''# train on each existing modality
if has_text:
    predictor_text = task.fit(
        train_data=agluon_text_train_data,
        label=target_column,
        eval_metric=eval_metric,
        #presets='best_quality',
        time_limits=time_limits,
        id_columns=[id_column],
        ngpus_per_trial=8,
        nthreads_per_trial=os.cpu_count(),
        verbosity=3,
        problem_type='multiclass',
        output_directory=model_path+'text/'
    )
    print('done training text')
if has_num:
    predictor_num = task.fit(
        train_data=agluon_num_train_data,
        label=target_column,
        eval_metric=eval_metric,
        #presets='best_quality',
        time_limits=time_limits,
        id_columns=[id_column],
        ngpus_per_trial=8,
        nthreads_per_trial=os.cpu_count(),
        verbosity=3,
        problem_type='multiclass',
        output_directory=model_path+'num/'
    )
    print('done training num')
if has_cat:
    predictor_cat = task.fit(
        train_data=agluon_cat_train_data,
        label=target_column,
        eval_metric=eval_metric,
        #presets='best_quality',
        time_limits=time_limits,
        id_columns=[id_column],
        ngpus_per_trial=8,
        nthreads_per_trial=os.cpu_count(),
        verbosity=3,
        problem_type='multiclass',
        output_directory=model_path+'cat/'
    )
    print('done training cat')'''
if has_img:
    predictor_img = []
    for i in range(len(agluon_img_train_data)):
        task.fit(
            train_data=agluon_img_train_data[i],
            label=target_column,
            eval_metric=eval_metric,
            #presets='best_quality',
            time_limits=time_limits,
            id_columns=[id_column],
            ngpus_per_trial=8,
            nthreads_per_trial=os.cpu_count(),
            verbosity=3,
            problem_type='multiclass',
            output_directory=model_path+'img/'+img_data.columns[i]
        )
    print('done training img')

Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to ./opt/ml/model/img/img_image/
AutoGluon Version:  0.0.12
Train Data Rows:    109590
Train Data Columns: 1001
Preprocessing data ...
Fraction of data from classes with at least 15 examples that will be kept for training models: 0.9774796970526508
Train Data Class Count: 457
Feature Generator processed 107122 data points with 1000 features
Original Features (raw dtypes):
	float64 features: 1000
Original Features (inferred dtypes):
	float features: 1000
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 1000
Final Features:
	float features: 1000
	Data preprocessing and feature engineering runtime = 9.48s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Saving ./opt/ml/model/img/img_image/learner.pkl
Saving ./opt/ml/model/i

done training img


In [11]:
# train on each existing modality
if has_text:
    predictor_text = task.load(model_path+'text/')
    print('done loading text model')
if has_num:
    predictor_num = task.load(model_path+'num/')
    print('done loading num model')
if has_cat:
    predictor_cat = task.load(model_path+'cat/')
    print('done loading cat model')
if has_img:
    predictor_img = task.load(model_path+'img/')
    print('done loading img model')

done loading text model
done loading num model
done loading cat model


FileNotFoundError: [Errno 2] No such file or directory: './opt/ml/model/img/learner.pkl'

In [None]:
if has_text:
    preds_text = predictor_text.predict_proba(agluon_text_train_data.drop(columns=['label']))
    print('done generating text unimodal preds')
if has_num:
    preds_num = predictor_num.predict_proba(agluon_num_train_data.drop(columns=['label']))
    print('done generating num unimodal preds')
if has_cat:
    preds_cat = predictor_cat.predict_proba(agluon_cat_train_data.drop(columns=['label']))
    print('done generating cat unimodal preds')
if has_img:
    preds_img = []
    for i in range(len(predictor_img)):
        preds_img.append(predictor_img[i].predict_proba(agluon_img_train_data[i].drop(columns=['label'])))
    print('done generating img unimodal preds')

### phase 2 - wholistic training

In [None]:
# create data
wholistic_train = pd.DataFrame()
if has_text:
    wholistic_train = pd.concat([wholistic_train, pd.DataFrame(preds_text)], axis=1)
if has_num:
    wholistic_train = pd.concat([wholistic_train, pd.DataFrame(preds_num)], axis=1)
if has_cat:
    wholistic_train = pd.concat([wholistic_train, pd.DataFrame(preds_cat)], axis=1)
if has_img:
    wholistic_train = pd.concat([wholistic_train, pd.DataFrame(preds_img)], axis=1)
wholistic_input_size = wholistic_train.shape[1]

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
def create_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(
        int(wholistic_input_size*float(2/3))+numcats,
        input_shape=(wholistic_input_size,),
        activation='relu')
    )
    model.add(layers.Dense(int(wholistic_input_size*float(1/3))+numcats, activation='relu'))
    model.add(layers.Dense(numcats, activation='sigmoid'))
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
        optimizer='adam', metrics=['accuracy'])
    return model
model = create_model()

In [None]:
history = model.fit(
    wholistic_train.values,
    mappedlabels.values,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)

In [None]:
model.save_weights(model_path+'wholistic/wholistic_model')