# Package installation

In [1]:
!pip install seaborn==0.10.1
!pip install -U scikit-learn
!pip install --upgrade pandas 
!pip install --upgrade s3fs
!pip install  googletrans

Collecting seaborn==0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/c7/e6/54aaaafd0b87f51dfba92ba73da94151aa3bc179e5fe88fc5dfb3038e860/seaborn-0.10.1-py3-none-any.whl (215kB)
[K    100% |████████████████████████████████| 225kB 13.4MB/s ta 0:00:01
Installing collected packages: seaborn
  Found existing installation: seaborn 0.8.1
    Uninstalling seaborn-0.8.1:
      Successfully uninstalled seaborn-0.8.1
Successfully installed seaborn-0.10.1
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/d9/3a/eb8d7bbe28f4787d140bb9df685b7d5bf6115c0e2a969def4027144e98b6/scikit_learn-0.23.1-cp36-cp36m-manylinux1_x86_64.whl (6.8MB)
[K    100% |████████████████████████████████| 6.9MB 6.6MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading https://files.pythonho

Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans)
[?25l  Downloading https://files.pythonhosted.org/packages/dd/d5/e4ff9318693ac6101a2095e580908b591838c6f33df8d3ee8dd953ba96a8/httpcore-0.9.1-py3-none-any.whl (42kB)
[K    100% |████████████████████████████████| 51kB 21.4MB/s ta 0:00:01
Collecting hstspreload (from httpx==0.13.3->googletrans)
[?25l  Downloading https://files.pythonhosted.org/packages/d5/b9/a183078ac6eef7c65ff97ee3477616504bb377a2939613af595b97cbaac3/hstspreload-2020.6.30-py3-none-any.whl (908kB)
[K    100% |████████████████████████████████| 911kB 23.2MB/s ta 0:00:01
[?25hCollecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans)
  Downloading https://files.pythonhosted.org/packages/78/be/7b8b99fd74ff5684225f50dd0e865393d2265656ef3b4ba9eaaaffe622b8/rfc3986-1.4.0-py2.py3-none-any.whl
Collecting contextvars>=2.1; python_version < "3.7" (from sniffio->httpx==0.13.3->googletrans)
  Downloading https://files.pythonhosted.org/packages/83/96/55b82d9f13763be9d67

# IMPORT

In [1]:
# import libraries here; add more as necessary
import os
import io

from time import gmtime, strftime

import  csv
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import psutil
import re
from joblib import dump, load

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.preprocessing import Normalizer, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from scipy import sparse

from googletrans import Translator

import sagemaker
import boto3
import s3fs
from sagemaker import PCA,KMeans
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.amazon.common import write_spmatrix_to_sparse_tensor, write_numpy_to_dense_tensor
from sagemaker.s3 import S3Uploader
import mxnet as mx



# magic word for producing visualizations in notebook
%matplotlib inline

# CUSTOM FUNCTIONS

In [2]:
from utils.custom_transformers import cleaning, naning, low_freq
from utils.helper import *
from utils.clean import clean_fn

# Global variables

In [3]:
session = sagemaker.session.Session()
role = sagemaker.get_execution_role()
bucket = 'auto-ml-exploration'

In [4]:
INIT_DATA_FOLDER = 'initial_data' 
INITIAL_DATA_SAVEPTH_S3 = f's3://{bucket}/{INIT_DATA_FOLDER}'
s3_dataset_path = f's3://{bucket}/dataset'

CLEANED_DATA_FOLDER = 'cleaned_data' 
CLEANED_DATA_SAVEPTH_S3 = f's3://{bucket}/{CLEANED_DATA_FOLDER}'

TRANSFORMED_DATA_FOLDER = 'transformed_data'
TRANSFORMED_DATA_SAVEPTH_S3 = f's3://{bucket}/{TRANSFORMED_DATA_FOLDER}'

In [5]:
s3fs_handler = s3fs.S3FileSystem()

In [6]:
translator = Translator()

# DATA STORAGE TO S3

In [None]:
sagemaker.s3.S3Uploader.upload('Udacity_AZDIAS_052018.csv', s3_dataset_path)
sagemaker.s3.S3Uploader.upload('Udacity_CUSTOMERS_052018.csv', s3_dataset_path)
sagemaker.s3.S3Uploader.upload('Udacity_MAILOUT_052018_TEST.csv', s3_dataset_path)
sagemaker.s3.S3Uploader.upload('Udacity_MAILOUT_052018_TRAIN.csv', s3_dataset_path)

sagemaker.s3.S3Uploader.upload('DIAS Attributes - Values 2017.xlsx', s3_dataset_path)
sagemaker.s3.S3Uploader.upload('DIAS Information Levels - Attributes 2017.xlsx', s3_dataset_path)

In [10]:
# data location on S3
pop_dataset_loc = f's3://{bucket}/dataset/Udacity_AZDIAS_052018.csv'
customers_dataset_loc = f's3://{bucket}/dataset/Udacity_CUSTOMERS_052018.csv'
mailout_train_dataset_loc = f's3://{bucket}/dataset/Udacity_MAILOUT_052018_TRAIN.csv'
mailout_test_dataset_loc = f's3://{bucket}/dataset/Udacity_MAILOUT_052018_TEST.csv'

# Choose dataset to work on

In [11]:
flag = 'mailout' # 'pop' or 'customers' or 'mailout'

# used to perform correct operations depending on dataset to be analyzed

## Load & save to pickle if necessary

In [12]:
if s3fs_handler.exists(f'{INITIAL_DATA_SAVEPTH_S3}/{flag}_complete_df.pkl'):
    print(f"read {flag} from pickled file")
    dataset = pd.read_pickle(f'{INITIAL_DATA_SAVEPTH_S3}/{flag}_complete_df.pkl')
else:
    if flag=="pop":
        dataset = pd.read_csv(pop_dataset_loc, sep=';')
    elif flag=="customers":
        dataset = pd.read_csv(customers_dataset_loc, sep=';')
        # drop immediately columns that will never be used
        dataset.drop(['PRODUCT_GROUP', 'ONLINE_PURCHASE', 'CUSTOMER_GROUP'], inplace=True)
    elif "mailout" in flag:
        dataset = pd.read_csv(mailout_train_dataset_loc, sep=';')
        
    dataset.to_pickle(f'{INITIAL_DATA_SAVEPTH_S3}/{flag}_complete_df.pkl')

read mailout from pickled file


# Get features information

In [13]:
levels_description = pd.read_excel('DIAS Attributes - Values 2017_custom.xlsx', # Added Data Type
                                   header=1, usecols=[1,2,3,4,5,6]).fillna(method = 'ffill')
features_description = pd.read_excel('DIAS Information Levels - Attributes 2017.xlsx', 
                                     header=1, usecols=[1,2,3,4,5,6]).fillna(method = 'ffill').fillna(method = 'bfill')

In [8]:
levels_description.head()

Unnamed: 0,Attribute,Description,Data Type,Value,Meaning
0,AGER_TYP,best-ager typology,Categorical,-1,unknown
1,AGER_TYP,best-ager typology,Categorical,0,no classification possible
2,AGER_TYP,best-ager typology,Categorical,1,passive elderly
3,AGER_TYP,best-ager typology,Categorical,2,cultural elderly
4,AGER_TYP,best-ager typology,Categorical,3,experience-driven elderly


In [9]:
features_description.head()

Unnamed: 0,Information level,Attribute,Description,Additional notes
0,Person,AGER_TYP,best-ager typology,in cooperation with Kantar TNS; the informatio...
1,Person,ALTERSKATEGORIE_GROB,age through prename analysis,modelled on millions of first name-age-referen...
2,Person,ANREDE_KZ,gender,modelled on millions of first name-age-referen...
3,Person,CJT_GESAMTTYP,Customer-Journey-Typology relating to the pref...,"relating to the preferred information, marketi..."
4,Person,FINANZ_MINIMALIST,financial typology: low financial interest,Gfk-Typology based on a representative househo...


In [14]:
global_info = (pd.merge(levels_description, features_description, how='inner', on='Attribute')
               .drop(['Additional notes','Description_y'],axis=1))

In [11]:
global_info.head()

Unnamed: 0,Attribute,Description_x,Data Type,Value,Meaning,Information level
0,AGER_TYP,best-ager typology,Categorical,-1,unknown,Person
1,AGER_TYP,best-ager typology,Categorical,0,no classification possible,Person
2,AGER_TYP,best-ager typology,Categorical,1,passive elderly,Person
3,AGER_TYP,best-ager typology,Categorical,2,cultural elderly,Person
4,AGER_TYP,best-ager typology,Categorical,3,experience-driven elderly,Person


In [20]:
# number of possible values per features
# breaks for numerical ones
global_info.groupby("Attribute")["Value"].agg(["count"])

Unnamed: 0_level_0,count
Attribute,Unnamed: 1_level_1
AGER_TYP,5
ALTERSKATEGORIE_GROB,6
ALTER_HH,22
ANREDE_KZ,3
ANZ_HAUSHALTE_AKTIV,1
...,...
WACHSTUMSGEBIET_NB,6
WOHNDAUER_2008,10
WOHNLAGE,9
W_KEIT_KIND_HH,7


## Only keep information pertaining to actual datasets

In [15]:
pop_features = pd.read_csv(pop_dataset_loc, sep=';', nrows=2).columns.values
customers_features = pd.read_csv(customers_dataset_loc, sep=';', nrows=2).columns.values

dataset_features = set(pop_features).union(set(customers_features))
common_feat = set(global_info['Attribute']).intersection(dataset_features)

print((f'there are {len(dataset_features)} '
      f'features in the datasets. Among these, one has detailed information about {len(common_feat)} of them'
      f'. FYI, detailed info is available about {len(set(global_info["Attribute"]))} features'))

there are 369 features in the datasets. Among these, one has detailed information about 260 of them. FYI, detailed info is available about 300 features


## Some features have been manully flagged as duplicates or requiring split or processing. Find them.

In [62]:
# Duplicate
feature_duplicate = find_feature_per_car(global_info, "Duplicate")

loc = []
for feat in feature_duplicate:
    loc.extend(global_info.loc[global_info['Attribute']==feat,:].index.values)

print(set(global_info.loc[loc,"Attribute"].values))


# Processing
feature_processing = find_feature_per_car(global_info, "Processing")

loc = []
for feat in feature_processing:
    loc.extend(global_info.loc[global_info['Attribute']==feat,:].index.values)

print(set(global_info.loc[loc,"Attribute"].values))

{'LP_FAMILIE_FEIN', 'CAMEO_DEUG_2015', 'ALTERSKATEGORIE_GROB', 'LP_LEBENSPHASE_GROB', 'GEBURTSJAHR', 'LP_FAMILIE_GROB', 'LP_STATUS_FEIN', 'WOHNLAGE', 'CAMEO_DEUINTL_2015', 'LP_LEBENSPHASE_FEIN', 'LP_STATUS_GROB', 'CAMEO_DEU_2015'}
{'CAMEO_DEUINTL_2015', 'PRAEGENDE_JUGENDJAHRE', 'ALTER_HH'}


In [14]:
# With above information, one decides the following data treatment

# GEBURTSJAHR and ALTERSKATEGORIE_GROB provides redudant information; GEBURTSJAHR is kept

# The three CAMEO columns offer identical information. 
# Only "CAMEO_INTL_2015" will be kept and will be further processed as it provides
# in fact two information, encoded as double digit i.e. AB. A provides the wealth and B provides the family situation

# LP_LEBENSPHASE_FEIN and LP_LEBENSPHASE_GROB provide similar information, 
# one is the refined classification the other the gross classification
# They provide information about life stage & income. This seems already provided by CAMEO so they will both be dropped

# LP_FAMILIE_FEIN and LP_FAMILIE_GROB are again gross and detailed classification
# they provide the family situation. So this is similar to CAMEO_INTL_2015 after processing.
# Both will be dropped

# LP_STATUS_GROB and LP_STATUS_FEIN are again gross and detailed classification
# They provide the income of the person. Could be deemed similar to CAMEO info but this is more direct so LP_STATUS_GROB will be kept

# WOHNLAGE was marked as duplicate but not immediate to find back of which other attribute so will be kept

# ALTER_HH, CAMEO_INTL_2015 and PRAEGENDE_JUGENDJAHRE all require specific processing to extract relevant information
# In fact, ALTER_HH being ordinal, it will stay as is
# CAMEO will be split into two features and PRAEGENDE_JUGENDJAHRE will be transformed into a binary feature to identify wheter 
# the person is mainstream or avantgarde

# to_drop = ['CAMEO_DEUG_2015', 'CAMEO_DEU_2015', 'LP_LEBENSPHASE_FEIN', 'LP_LEBENSPHASE_GROB',
#          'LP_FAMILIE_FEIN', 'LP_FAMILIE_GROB', 'LP_STATUS_FEIN']

In [19]:
# Ok but what about the features over which one does not have any information ? Let's look at them
dataset_specific_feature = dataset_features.difference(common_feat)
print('\n'.join(dataset_specific_feature))

D19_VOLLSORTIMENT
CJT_TYP_4
KBA13_CCM_3000
D19_HANDWERK
D19_KINDERARTIKEL
VHN
D19_SCHUHE
D19_VERSI_DATUM
D19_TIERARTIKEL
D19_BANKEN_GROSS
D19_LEBENSMITTEL
KBA13_BAUMAX
D19_GARTEN
KBA13_ANTG1
D19_SOZIALES
D19_VERSI_ANZ_24
D19_SAMMELARTIKEL
D19_VERSI_OFFLINE_DATUM
HH_DELTA_FLAG
UMFELD_ALT
ALTER_KIND1
ARBEIT
FIRMENDICHTE
D19_BANKEN_REST
RT_SCHNAEPPCHEN
D19_VERSI_ONLINE_QUOTE_12
KBA13_ANTG4
D19_RATGEBER
D19_HAUS_DEKO
D19_VERSI_ANZ_12
SOHO_KZ
D19_VERSAND_REST
ANZ_KINDER
CUSTOMER_GROUP
CJT_TYP_3
ALTER_KIND4
D19_BIO_OEKO
VK_DISTANZ
D19_KONSUMTYP_MAX
RT_KEIN_ANREIZ
D19_WEIN_FEINKOST
CJT_TYP_1
D19_DROGERIEARTIKEL
VK_ZG11
D19_TECHNIK
D19_GESAMT_ANZ_12
D19_TELKO_ANZ_12
D19_TELKO_MOBILE
KBA13_CCM_3001
CAMEO_INTL_2015
D19_TELKO_ONLINE_QUOTE_12
D19_VERSI_ONLINE_DATUM
KONSUMZELLE
D19_REISEN
KBA13_ANTG3
D19_LETZTER_KAUF_BRANCHE
D19_VERSAND_ANZ_12
KBA13_ANTG2
EXTSEL992
PRODUCT_GROUP
CJT_TYP_5
D19_GESAMT_ANZ_24
VERDICHTUNGSRAUM
GEMEINDETYP
STRUKTURTYP
D19_BANKEN_DIREKT
D19_BUCH_CD
D19_BANKEN_LOKAL
D19_B

In [277]:
for feature in dataset_specific_feature:
    if feature == 'LNR':
        continue
    describe_feature(dataset,feature)
    print('\n')
    
# Analysis
# contains diverse information but impossible to know what it represents :
# KBA13_ANTG3, UMFELD_ALT, KBA13_ANTG2 

# contains diverse information but impossible to know what it represents and some low-frequency features : 
# CJT_TYP_4 , KBA13_HHZ, KBA13_CCM_1401_2500, KBA13_BAUMAX, CJT_TYP_3, D19_SONSTIGE, CJT_TYP_5, CJT_TYP_2, CJT_TYP_1, CJT_TYP_6  

# contains diverse information, know what it represents but many features and some low-frequency :
# D19_LETZTER_KAUF_BRANCHE, VERDICHTUNGSRAUM 

# More than 90% natural NaN
# ALTER_KIND2 , ALTER_KIND3, ALTER_KIND4, ALTER_KIND1 

# suspected to have high percentage of non-natural NaN (e.g. 0, 10, -1, ...)
# > 90% : D19_DIGIT_SERV , D19_BANKEN_LOKAL, D19_VERSI_OFFLINE_DATUM, D19_BANKEN_REST, D19_VERSI_ONLINE_DATUM, D19_GARTEN 
#         D19_TELKO_ANZ_12, D19_BANKEN_ANZ_24, D19_ENERGIE, D19_VERSI_ANZ_12, D19_BANKEN_ANZ_12, D19_BANKEN_GROSS, D19_BIO_OEKO 
#         D19_NAHRUNGSERGAENZUNG , D19_TELKO_ANZ_24 
#         
# > 70% : D19_TELKO_ONLINE_QUOTE_12, D19_SAMMELARTIKEL, D19_KOSMETIK, D19_DROGERIEARTIKEL, D19_WEIN_FEINKOST, D19_VERSAND_REST 
#         D19_TELKO_MOBILE, D19_TELKO_REST, D19_VERSI_ANZ_24, D19_VERSICHERUNGEN, D19_VERSICHERUNGEN, D19_VERSI_DATUM, D19_LEBENSMITTEL 
#         D19_SCHUHE , D19_VERSI_ONLINE_QUOTE_12, D19_KINDERARTIKEL, D19_HAUS_DEKO, D19_BANKEN_DIREKT, D19_BILDUNG, D19_RATGEBER, 
#         D19_HANDWERK, D19_FREIZEIT 
# > 50% : D19_VERSAND_ANZ_24, D19_BEKLEIDUNG_REST, D19_VOLLSORTIMENT, D19_GESAMT_ANZ_12, D19_TECHNIK, D19_REISEN, D19_BUCH_CD   
#         D19_VERSAND_ANZ_12, 

# combination of non-natural NaN and NaN is high :
# > 90% : ANZ_KINDER 
# > 70% : D19_LOTTO 
#  > 60% : KK_KUNDENTYP, KBA13_ANTG4 

# suspected to be a duplicate of already available information
# ALTERSKATEGORIE_FEIN (ages)

# EINGEZOGENAM_HH_JAHR  lots of values, could be ordinal but will order be alright ?

# True numeric : ANZ_STATISTISCHE_HAUSHALTE, EXTSEL992 

# 'EINGEFUEGT_AM' : seems to be the time of input of the data, many categories

feature KBA13_ANTG3 is categorized as float64 per panda
It means KBA13 ANTG3 in english
it has 4 different values
value 1.0 has 49583.0 samples and represents 25.87% of data
value 2.0 has 47973.0 samples and represents 25.03% of data
value 0.0 has 25498.0 samples and represents 13.30% of data
value 3.0 has 17317.0 samples and represents 9.04% of data
it presents 26.76% natural NaN


feature D19_DIGIT_SERV is categorized as int64 per panda
It means D19 DIGIT SERV in english
it has 8 different values
value 0 has 183539.0 samples and represents 95.77% of data
value 6 has 4233.0 samples and represents 2.21% of data
value 3 has 1582.0 samples and represents 0.83% of data
value 7 has 922.0 samples and represents 0.48% of data
value 5 has 765.0 samples and represents 0.40% of data
value 2 has 434.0 samples and represents 0.23% of data
value 4 has 121.0 samples and represents 0.06% of data
value 1 has 56.0 samples and represents 0.03% of data
it has no natural NaN


feature PRODUCT_GROUP is ca

feature D19_SAMMELARTIKEL is categorized as int64 per panda
It means D19 COLLECTIBLES in english
it has 8 different values
value 0 has 145113.0 samples and represents 75.72% of data
value 6 has 39605.0 samples and represents 20.67% of data
value 7 has 2958.0 samples and represents 1.54% of data
value 5 has 2069.0 samples and represents 1.08% of data
value 3 has 1607.0 samples and represents 0.84% of data
value 4 has 197.0 samples and represents 0.10% of data
value 2 has 87.0 samples and represents 0.05% of data
value 1 has 16.0 samples and represents 0.01% of data
it has no natural NaN


feature DSL_FLAG is categorized as float64 per panda
It means DSL FLAG in english
it has 2 different values
value 1.0 has 138494.0 samples and represents 72.26% of data
value 0.0 has 3231.0 samples and represents 1.69% of data
it presents 26.05% natural NaN


feature KONSUMZELLE is categorized as float64 per panda
It means CONSUMER CELL in english
it has 2 different values
value 0.0 has 116619.0 sample

feature ALTERSKATEGORIE_FEIN is categorized as float64 per panda
It means AGE CATEGORY FEIN in english
it has 25 different values
value 10.0 has 20088.0 samples and represents 10.48% of data
value 9.0 has 19713.0 samples and represents 10.29% of data
value 11.0 has 13508.0 samples and represents 7.05% of data
value 12.0 has 12956.0 samples and represents 6.76% of data
value 8.0 has 11776.0 samples and represents 6.14% of data
value 13.0 has 11629.0 samples and represents 6.07% of data
value 0.0 has 11019.0 samples and represents 5.75% of data
value 14.0 has 10817.0 samples and represents 5.64% of data
value 15.0 has 8116.0 samples and represents 4.23% of data
value 7.0 has 7185.0 samples and represents 3.75% of data
value 16.0 has 4477.0 samples and represents 2.34% of data
value 17.0 has 2478.0 samples and represents 1.29% of data
value 6.0 has 2375.0 samples and represents 1.24% of data
value 18.0 has 1510.0 samples and represents 0.79% of data
value 19.0 has 774.0 samples and repres

feature D19_VERSI_ANZ_12 is categorized as int64 per panda
It means D19 VERSI NUMBER 12th in english
it has 7 different values
value 0 has 177236.0 samples and represents 92.48% of data
value 1 has 10135.0 samples and represents 5.29% of data
value 2 has 3639.0 samples and represents 1.90% of data
value 3 has 496.0 samples and represents 0.26% of data
value 4 has 133.0 samples and represents 0.07% of data
value 5 has 11.0 samples and represents 0.01% of data
value 6 has 2.0 samples and represents 0.00% of data
it has no natural NaN


feature D19_BANKEN_ANZ_12 is categorized as int64 per panda
It means D19 BANKS NUMBER 12th in english
it has 7 different values
value 0 has 180150.0 samples and represents 94.00% of data
value 1 has 7450.0 samples and represents 3.89% of data
value 2 has 2836.0 samples and represents 1.48% of data
value 3 has 701.0 samples and represents 0.37% of data
value 4 has 379.0 samples and represents 0.20% of data
value 5 has 109.0 samples and represents 0.06% of d

feature D19_BANKEN_GROSS is categorized as int64 per panda
It means D19 BANKS BIG in english
it has 7 different values
value 0 has 175064.0 samples and represents 91.34% of data
value 6 has 9097.0 samples and represents 4.75% of data
value 3 has 3040.0 samples and represents 1.59% of data
value 5 has 2488.0 samples and represents 1.30% of data
value 4 has 875.0 samples and represents 0.46% of data
value 2 has 622.0 samples and represents 0.32% of data
value 1 has 466.0 samples and represents 0.24% of data
it has no natural NaN


feature D19_TIERARTIKEL is categorized as int64 per panda
It means D19 ANIMAL ITEMS in english
it has 8 different values
value 0 has 183788.0 samples and represents 95.90% of data
value 6 has 3266.0 samples and represents 1.70% of data
value 7 has 2524.0 samples and represents 1.32% of data
value 3 has 1136.0 samples and represents 0.59% of data
value 5 has 737.0 samples and represents 0.38% of data
value 2 has 129.0 samples and represents 0.07% of data
value 4

feature D19_BEKLEIDUNG_REST is categorized as int64 per panda
It means D19 CLOTHING REST in english
it has 8 different values
value 0 has 137848.0 samples and represents 71.93% of data
value 6 has 31571.0 samples and represents 16.47% of data
value 7 has 7880.0 samples and represents 4.11% of data
value 3 has 6749.0 samples and represents 3.52% of data
value 5 has 4074.0 samples and represents 2.13% of data
value 2 has 1555.0 samples and represents 0.81% of data
value 1 has 1137.0 samples and represents 0.59% of data
value 4 has 838.0 samples and represents 0.44% of data
it has no natural NaN


feature D19_VERSI_ONLINE_QUOTE_12 is categorized as float64 per panda
It means D19 VERSI ON-LINE QUOTE 12th in english
it has 4 different values
value 0.0 has 143697.0 samples and represents 74.98% of data
value 10.0 has 245.0 samples and represents 0.13% of data
value 5.0 has 11.0 samples and represents 0.01% of data
value 7.0 has 2.0 samples and represents 0.00% of data
it presents 24.89% natu

feature KOMBIALTER is categorized as int64 per panda
It means KOMBIALTER in english
it has 5 different values
value 4 has 109179.0 samples and represents 56.97% of data
value 9 has 47051.0 samples and represents 24.55% of data
value 3 has 28344.0 samples and represents 14.79% of data
value 2 has 5561.0 samples and represents 2.90% of data
value 1 has 1517.0 samples and represents 0.79% of data
it has no natural NaN


feature KBA13_ANTG4 is categorized as float64 per panda
It means KBA13 ANTG4 in english
it has 3 different values
value 0.0 has 78976.0 samples and represents 41.21% of data
value 1.0 has 49804.0 samples and represents 25.99% of data
value 2.0 has 11591.0 samples and represents 6.05% of data
it presents 26.76% natural NaN


feature ARBEIT is categorized as float64 per panda
It means JOB in english
it has 6 different values
value 3.0 has 50905.0 samples and represents 26.56% of data
value 4.0 has 37595.0 samples and represents 19.62% of data
value 2.0 has 33334.0 samples an

feature D19_BANKEN_DIREKT is categorized as int64 per panda
It means D19 BANKS DIRECTLY in english
it has 8 different values
value 0 has 166726.0 samples and represents 86.99% of data
value 6 has 11802.0 samples and represents 6.16% of data
value 3 has 4884.0 samples and represents 2.55% of data
value 5 has 2684.0 samples and represents 1.40% of data
value 7 has 2516.0 samples and represents 1.31% of data
value 2 has 1144.0 samples and represents 0.60% of data
value 4 has 1053.0 samples and represents 0.55% of data
value 1 has 843.0 samples and represents 0.44% of data
it has no natural NaN


feature CUSTOMER_GROUP is categorized as object per panda
It means CUSTOMER GROUP in english
it has 2 different values
value MULTI_BUYER has 132238.0 samples and represents 69.00% of data
value SINGLE_BUYER has 59414.0 samples and represents 31.00% of data
it has no natural NaN


feature VHA is categorized as float64 per panda
It means VHA in english
it has 6 different values
value 0.0 has 74250.0

feature D19_REISEN is categorized as int64 per panda
It means D19 REISEN in english
it has 8 different values
value 0 has 134825.0 samples and represents 70.35% of data
value 6 has 34244.0 samples and represents 17.87% of data
value 7 has 16397.0 samples and represents 8.56% of data
value 2 has 2078.0 samples and represents 1.08% of data
value 3 has 1982.0 samples and represents 1.03% of data
value 5 has 1653.0 samples and represents 0.86% of data
value 4 has 402.0 samples and represents 0.21% of data
value 1 has 71.0 samples and represents 0.04% of data
it has no natural NaN


feature SOHO_KZ is categorized as float64 per panda
It means SOHO concentration camp in english
it has 2 different values
value 0.0 has 143625.0 samples and represents 74.94% of data
value 1.0 has 1431.0 samples and represents 0.75% of data
it presents 24.31% natural NaN


feature D19_BUCH_CD is categorized as int64 per panda
It means D19 BOOK CD in english
it has 8 different values
value 0 has 102937.0 samples 

In [41]:
# to_drop.extend(["ALTER_KIND2", "ALTER_KIND3", "ALTER_KIND4", "ALTER_KIND1",
# "D19_DIGIT_SERV" , "D19_BANKEN_LOKAL", "D19_VERSI_OFFLINE_DATUM", "D19_BANKEN_REST", "D19_VERSI_ONLINE_DATUM", "D19_GARTEN" ,
# "D19_TELKO_ANZ_12", "D19_BANKEN_ANZ_24", "D19_ENERGIE", "D19_VERSI_ANZ_12", "D19_BANKEN_ANZ_12", "D19_BANKEN_GROSS", "D19_BIO_OEKO", 
# "D19_NAHRUNGSERGAENZUNG" , "D19_TELKO_ANZ_24",
# "D19_TELKO_ONLINE_QUOTE_12", "D19_SAMMELARTIKEL", "D19_KOSMETIK", "D19_DROGERIEARTIKEL", "D19_WEIN_FEINKOST", "D19_VERSAND_REST", 
# "D19_TELKO_MOBILE", "D19_TELKO_REST", "D19_VERSI_ANZ_24", "D19_VERSICHERUNGEN", "D19_VERSICHERUNGEN", "D19_VERSI_DATUM", "D19_LEBENSMITTEL", 
# "D19_SCHUHE" , "D19_VERSI_ONLINE_QUOTE_12", "D19_KINDERARTIKEL", "D19_HAUS_DEKO", "D19_BANKEN_DIREKT", "D19_BILDUNG", "D19_RATGEBER", 
# "D19_HANDWERK", "D19_FREIZEIT", "ANZ_KINDER", "D19_LOTTO", "ALTERSKATEGORIE_FEIN", "EINGEZOGENAM_HH_JAHR", "EINGEFUEGT_AM"])

# Perform cleaning

PRAEGENDE_JUGENDJAHRE and CAMEO_INTL_2015 needs specific processing

In [44]:
# first, clean 'X' and 'XX' values that appear and replace them by NaN
dataset = dataset.replace('X', np.nan)
dataset = dataset.replace('XX', np.nan)

In [45]:
# then process effectively 
avant_list, main_list = identify_mainstream(global_info)
print(f'shape before processing : {dataset.shape}')
dataset = process_specific_columns(dataset, avant_list, main_list)
print(f'shape afer processing : {dataset.shape}')

shape before processing : (42962, 315)
shape afer processing : (42962, 316)


# Convert non-natural NaN to natural NaN

In [47]:
nan_info, replacements = construct_fill_na_new(global_info, dataset)

In [48]:
dataset = make_replacement(dataset, replacements)

# the number of replacements is not significant enough to warrant a new cleaning step

88 replacements made


In [50]:
dataset = fill_na_presc(dataset, nan_info)

## Drop columns that contain NaN (total : natural & equivalent)

In [61]:
def drop_na_columns(dataset, thresh):
    """
    Drop columns that contain a fraction of NaN above threshold thresh
    
    Parameters:
    -----------
    dataset (pandas.DataFrame) : dataframe for which the columns will be removed
    thresh (float) : the threshold above which a column gets removed
    
    Returns:
    --------
    the dataset with columns removed according to threshold
       
    """
    print(f'number of columns before dropping : {dataset.shape[1]}')
    dataset = dataset.drop(dataset.loc[:, dataset.isna().sum()/dataset.shape[0]>0.65].columns , axis=1)
    print(f'number of columns after dropping : {dataset.shape[1]}')
    
    return dataset

In [62]:
dataset = drop_na_columns(dataset, 0.65)

number of columns before dropping : 309
number of columns after dropping : 309


# Drop rows that contain natural NaN above certain treshold

In [64]:
# not performed based on exploratory analysis of mailout training dataset

## Save to S3

In [63]:
# save cleaned data to S3
dataset.to_pickle(f'{CLEANED_DATA_SAVEPTH_S3}/{flag}_complete_cleaned_df.pkl')

In [65]:
# Save index & columns since LNR will be removed for future operations
# and scikit does not preserve indices

with open(f"columns_{flag}_cleaned.csv","w") as f:
    wr = csv.writer(f,delimiter="\n")
    wr.writerow(dataset.columns.values)
    
with open(f"index_{flag}_cleaned.csv","w") as f:
    wr = csv.writer(f,delimiter="\n")
    wr.writerow(dataset['LNR'].values) # index is contained in LNR columns
    
    
# and upload those to S3 as well
sagemaker.s3.S3Uploader.upload(f'columns_{flag}_cleaned.csv', 
                               f'{CLEANED_DATA_SAVEPTH_S3}')

sagemaker.s3.S3Uploader.upload(f'index_{flag}_cleaned.csv', 
                               f'{CLEANED_DATA_SAVEPTH_S3}')

's3://auto-ml-exploration/cleaned_data/index_mailout_cleaned.csv'

# Transform, Scale, Input

In [24]:
# cold start
dataset = pd.read_pickle(f'{CLEANED_DATA_SAVEPTH_S3}/{flag}_complete_cleaned_df.pkl')

In [25]:
# First, pop identification column (LNR)
dataset.drop('LNR', axis=1, inplace=True)

In [27]:
def identify_categorical_from_analysis(df_features, dataset):
    """
    identify categorical features based on a manual analysis whose insights are contained in df_features
    
    Parameters:
    -----------
    feature_df (pandas.DataFrame) : the dataframe containing 
    the features information. It is expected that one column is named "Attribute" and that another one is named "Data Type"  
    
    dataset (pandas.DataFrame) : the dataset for which the categorical features need to be identified
    
    Returns:
    --------
    the features identified as categorical and which belong to the dataset
    
    """
    
    categorical = set(global_info.loc[global_info['Data Type'].str.contains('Categorical'), :]['Attribute']).intersection(dataset.columns)
    binary = set(global_info.loc[global_info['Data Type'].str.contains('Binary'), :]['Attribute']).intersection(dataset.columns)
    
    return list(categorical.union(binary))

In [28]:
cat_columns = identify_categorical_from_analysis(global_info, dataset)
cat_columns = list(set(cat_columns).union(list(dataset.columns[dataset.dtypes == 'object'])))

num_columns = list(set(dataset.columns).difference(set(cat_columns)))

print(f'total number of columns:' 
      f'{dataset.shape[1]},\nnumber of categorical:{len(cat_columns)},\n'
      f'number of numerical:{len(num_columns)}')

total number of columns:308,
number of categorical:27,
number of numerical:281


In [103]:
print(cat_columns)

['RETOURTYP_BK_S', 'KBA05_MODTEMP', 'WOHNLAGE', 'KBA05_SEG6', 'ZABEOTYP', 'D19_KONSUMTYP', 'KBA05_MAXHERST', 'ANREDE_KZ', 'VERS_TYP', 'OST_WEST_KZ', 'PRAEGENDE_JUGENDJAHRE', 'LP_STATUS_GROB', 'GEBAEUDETYP_RASTER', 'GFK_URLAUBERTYP', 'GEBAEUDETYP', 'NATIONALITAET_KZ', 'CJT_GESAMTTYP', 'FINANZTYP', 'REGIOTYP', 'KBA05_MAXVORB', 'GREEN_AVANTGARDE', 'AGER_TYP', 'SHOPPER_TYP', 'HEALTH_TYP']


In [29]:
# define the transformation pipelines
numeric_pipeline = make_pipeline(SimpleImputer(strategy='mean', missing_values=np.nan), MinMaxScaler())
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent', missing_values=np.nan), OneHotEncoder(handle_unknown='ignore'))


ct = make_column_transformer(
        (numeric_pipeline, num_columns),
        (categorical_pipeline, cat_columns)
                            )

In [30]:
# fit_transform
dataset_X = ct.fit_transform(dataset)

In [31]:
# reconstructing a dataframe
dataset = pd.DataFrame(dataset_X,columns = get_ct_feature_names(ct))
dataset['LNR'] = pd.read_csv(f'{CLEANED_DATA_SAVEPTH_S3}/index_{flag}_cleaned.csv', header=None).values

In [32]:
print(f'following imputing, scaling, transforming, dataset has {dataset.shape[1]} features')

following imputing, scaling, transforming, dataset has 439 features


## Send to S3

In [34]:
# Send transformed data to S3
dataset.to_pickle(f'{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_complete_transformed_df.pkl')

In [35]:
# Send transformed data to S3 as recordIO format
dataset_X = dataset_X.astype('float32', copy=False)

buf = io.BytesIO()
#write_spmatrix_to_spaase_tensor(buf, transformed_data) # produces a record IO in fact
write_numpy_to_dense_tensor(buf, dataset_X)
buf.seek(0)

boto3.resource('s3').Bucket(bucket).Object(f'{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_array').upload_fileobj(buf) 

# Mailout Classification task

In [7]:
flag='mailout'

In [8]:
mailout_PCA = pd.read_pickle(f'{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_complete_transformed_df.pkl')

In [9]:
LNR = mailout_PCA['LNR'].values
mailout_PCA = mailout_PCA.drop('LNR', axis=1)

In [14]:
# place RESPONSE as first column
RESPONSE = mailout_PCA['RESPONSE']
mailout_PCA = mailout_PCA.drop('RESPONSE', axis=1)

In [15]:
# split train test
X_train, X_test, y_train, y_test = train_test_split(
                                   mailout_PCA, RESPONSE, stratify=RESPONSE, test_size=0.2, random_state=42)

In [16]:
# put back on S3 for training
# !S3 expects target/response column to be the first column

pd.concat([y_train,X_train], axis=1).to_csv(f"s3://{bucket}/XGBOOST_INPUT/train.csv",
                                                    header=False,index=False)

pd.concat([y_test,X_test], axis=1).to_csv(f"s3://{bucket}/XGBOOST_INPUT/test.csv",
                                                    header=False,index=False)

In [17]:
XGBOOST_FOLDER = 'XGBOOST'
XGBOOST_INPUT_FOLDER = 'XGBOOST_INPUT'

TRAIN_LOCATION = f"s3://{bucket}/XGBOOST_INPUT/train.csv"
VALID_LOCATION = f"s3://{bucket}/XGBOOST_INPUT/test.csv"

## Hyperparameter tuning

In [36]:
tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "eta"
        },
        {
          "MaxValue": "32",
          "MinValue": "0",
          "Name": "alpha"
        },
        {
          "MaxValue": "32",
          "MinValue": "0",
          "Name": "gamma"
        },
        {
          "MaxValue": "32",
          "MinValue": "0",
          "Name": "lambda"
        },
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "subsample"
        },
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "colsample_bytree"
        },
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "colsample_bylevel"
        },
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "colsample_bynode"
        },
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "min_child_weight"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "5",
          "MinValue": "1",
          "Name": "max_depth"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 20,
      "MaxParallelTrainingJobs": 3
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:auc",
      "Type": "Maximize"
    }
  }

In [37]:
container = get_image_uri(session.boto_region_name, 'xgboost', repo_version='0.90-1')

training_params = {}

training_params['RoleArn'] = role

training_params['AlgorithmSpecification'] = {
    "TrainingImage": container,
    "TrainingInputMode": "File"
}

training_params['OutputDataConfig'] = {
    "S3OutputPath": "s3://" + bucket + "/" + XGBOOST_FOLDER + "/output"
}

training_params['ResourceConfig'] = {
    "InstanceCount": 1,
    "InstanceType": "ml.m4.xlarge",
    "VolumeSizeInGB": 5
}
    
training_params['StoppingCondition'] = {
    "MaxRuntimeInSeconds": 86400,
    "MaxWaitTimeInSeconds" : 86400,   
}

training_params['StaticHyperParameters'] = {
    "num_round" : '200',
    "objective" : 'binary:logistic',
    "eval_metric" : 'auc'
}

training_params["EnableManagedSpotTraining"] = True

training_params['InputDataConfig'] = [
    {
        "ChannelName": "train",
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": TRAIN_LOCATION,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "CompressionType": "None",
        "ContentType": "text/csv"
    },
    
    {
        "ChannelName": "validation",
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": VALID_LOCATION,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "CompressionType": "None",
        "ContentType": "text/csv"
    },
]

In [40]:
counter=0
tuning_job_name = f"capstone-xgboost-tuning-{counter}"

In [41]:
session.sagemaker_client.create_hyper_parameter_tuning_job(HyperParameterTuningJobName = tuning_job_name,
                                           HyperParameterTuningJobConfig = tuning_job_config,
                                           TrainingJobDefinition = training_params)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:eu-west-1:662614605380:hyper-parameter-tuning-job/capstone-xgboost-tuning-0',
 'ResponseMetadata': {'RequestId': 'da96ab65-c684-47df-9ebc-8d4eb7e47f94',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'da96ab65-c684-47df-9ebc-8d4eb7e47f94',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '126',
   'date': 'Sun, 05 Jul 2020 17:36:16 GMT'},
  'RetryAttempts': 0}}

## Simple training

In [18]:
container = get_image_uri(session.boto_region_name, 'xgboost', repo_version='0.90-1')

training_params = {}

training_params['RoleArn'] = role

training_params['AlgorithmSpecification'] = {
    "TrainingImage": container,
    "TrainingInputMode": "File"
}

training_params['OutputDataConfig'] = {
    "S3OutputPath": "s3://" + bucket + "/" + XGBOOST_FOLDER + "/output"
}

training_params['ResourceConfig'] = {
    "InstanceCount": 1,
    "InstanceType": "ml.m4.xlarge",
    "VolumeSizeInGB": 5
}
    
training_params['StoppingCondition'] = {
    "MaxRuntimeInSeconds": 86400,
    "MaxWaitTimeInSeconds" : 86400,   
}

training_params['HyperParameters'] = {
    # based on tuning job
    "num_round" : '200',
    "alpha" : '18',
    "colsample_bytree" : '0.55',
    "colsample_bylevel" : '0.23',
    "colsample_bytree" : '0.67',
    "eta" : '0.32',
    "gamma" : '22',
    "lambda" : '16',
    "max_depth" : '3',
    "min_child_weight" : '2.75',
    "subsample" : '0.81',
    "objective" : 'binary:logistic',
    "eval_metric" : 'auc'
}

training_params["EnableManagedSpotTraining"] = True

training_params['InputDataConfig'] = [
    {
        "ChannelName": "train",
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": TRAIN_LOCATION,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "CompressionType": "None",
        "ContentType": "text/csv"
    },
    
    {
        "ChannelName": "validation",
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": VALID_LOCATION,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "CompressionType": "None",
        "ContentType": "text/csv"
    },
]

In [19]:
training_job_name = "capstone-mailout-xgboost-42"
training_params['TrainingJobName'] = training_job_name

# And now we ask SageMaker to create (and execute) the training job
training_job = session.sagemaker_client.create_training_job(**training_params)

session.logs_for_job(training_job_name, wait=True)

2020-07-06 08:28:29 Starting - Starting the training job...
2020-07-06 08:28:31 Starting - Launching requested ML instances.........
2020-07-06 08:30:03 Starting - Preparing the instances for training.........
2020-07-06 08:32:00 Downloading - Downloading input data
2020-07-06 08:32:00 Training - Downloading the training image...
2020-07-06 08:32:20 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter 


2020-07-06 08:32:43 Uploading - Uploading generated training model
2020-07-06 08:32:43 Completed - Training job completed
Training seconds: 65
Billable seconds: 20
Managed Spot Training savings: 69.2%


# Kaggle competition

In [20]:
mailout_test = pd.read_csv(f's3://{bucket}/dataset/Udacity_MAILOUT_052018_TEST.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [21]:
mailout_test, ct = clean_fn(mailout_test, 'mailout_test', fit=True, ct=None)

number of columns before manual droping : 366
number of columns before manual droping : 314
shape before processing : (42833, 314)
shape afer processing : (42833, 315)
88 replacements made
The following columns have a ratio of NaN above 65% : D19_BANKEN_DATUM
D19_BANKEN_OFFLINE_DATUM
D19_BANKEN_ONLINE_DATUM
D19_TELKO_DATUM
D19_TELKO_OFFLINE_DATUM
D19_TELKO_ONLINE_DATUM
TITEL_KZ
number of columns before dropping : 315
number of columns after dropping : 308
total number of columns:307,
number of categorical:27,
number of numerical:280
following imputing, scaling, transforming, dataset has 438 features
recordIO data has been saved to s3://auto-ml-exploration/s3://auto-ml-exploration/transformed_data/mailout_test_array


In [22]:
model_name = 'XGBOOST-capstone-2'
container = get_image_uri(session.boto_region_name, 'xgboost', repo_version='0.90-1')

info = session.sagemaker_client.describe_training_job(TrainingJobName='capstone-mailout-xgboost-42')
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = session.sagemaker_client.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

s3://auto-ml-exploration/XGBOOST/output/capstone-mailout-xgboost-42/output/model.tar.gz
arn:aws:sagemaker:eu-west-1:662614605380:model/xgboost-capstone-2


In [23]:
mailout_test_LNR = mailout_test['LNR']
mailout_test = mailout_test.drop('LNR', axis=1)
mailout_test.to_csv(f"s3://{bucket}/XGBOOST_INPUT/final_test.csv",
                                                    header=False,index=False)

In [33]:
mailout_PCA.head()

Unnamed: 0,KBA13_KMH_140,D19_TECHNIK,FINANZ_MINIMALIST,KBA05_ANTG2,KBA13_SEG_MITTELKLASSE,SEMIO_KAEM,UNGLEICHENN_FLAG,KBA05_ALTER3,KBA05_SEG8,KBA13_BMW,...,ZABEOTYP_4,ZABEOTYP_5,ZABEOTYP_6,REGIOTYP_1.0,REGIOTYP_2.0,REGIOTYP_3.0,REGIOTYP_4.0,REGIOTYP_5.0,REGIOTYP_6.0,REGIOTYP_7.0
0,0.75,0.0,0.5,0.0,1.0,0.833333,0.0,0.5,0.666667,0.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.75,0.0,1.0,0.0,0.75,0.666667,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.857143,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.75,0.75,0.5,0.833333,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.5,0.0,0.75,0.166667,0.0,0.5,0.333333,0.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
mailout_test.head()

Unnamed: 0,D19_SOZIALES,FINANZ_ANLEGER,KBA05_SEG9,D19_VERSAND_ONLINE_DATUM,KBA05_KRSOBER,KBA13_CCM_3001,SOHO_KZ,ANZ_STATISTISCHE_HAUSHALTE,KBA13_KMH_0_140,KBA05_MOD3,...,FINANZTYP_2,FINANZTYP_3,FINANZTYP_4,FINANZTYP_5,FINANZTYP_6,CAMEO1_1,CAMEO1_2,CAMEO1_3,CAMEO1_4,CAMEO1_5
0,0.2,0.0,0.333333,0.375,1.0,1.0,0.0,0.005333,0.2,0.25,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.2,0.0,0.0,0.57266,0.5,0.0,0.0,0.056,0.8,0.75,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.57266,1.0,0.75,0.0,0.005333,0.6,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.25,0.0,0.875,0.5,0.0,0.0,0.002667,0.6,0.25,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.5,0.333333,0.375,1.0,0.0,0.0,0.002667,0.2,0.25,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [24]:
XGBOOST_INFERENCE_FOLDER_S3 = 'XGBOOST_INFERENCE'
transform_job_name = f"xgboost-inference-mailout-test-final-4"
INFERENCE_INPUT = f"s3://{bucket}/XGBOOST_INPUT/final_test.csv"
model_name = model_name

job_config = {
   "BatchStrategy": "MultiRecord",
   "MaxConcurrentTransforms": 0,
   "MaxPayloadInMB": 6,
   "ModelName": model_name,
    
   "TransformInput": { 
      "CompressionType": "None",
      "ContentType": "text/csv",
      "DataSource": { 
         "S3DataSource": { 
            "S3DataType": "S3Prefix",
            "S3Uri": INFERENCE_INPUT,
         }
      },
      "SplitType": "Line"
   },
   "TransformJobName": transform_job_name,
    
   "TransformOutput": { 
      "Accept": "text/csv",
      "AssembleWith": "Line",
      "S3OutputPath": f's3://{bucket}/{XGBOOST_INFERENCE_FOLDER_S3}'
   },
   "TransformResources": { 
      "InstanceCount": 1,
      "InstanceType": "ml.m4.xlarge",
   }
}

batch_transform_job = session.sagemaker_client.create_transform_job(**job_config)

session.logs_for_transform_job(transform_job_name, wait=True)

.......................[34m[2020-07-06 08:39:14 +0000] [16] [INFO] Starting gunicorn 19.10.0[0m
[34m[2020-07-06 08:39:14 +0000] [16] [INFO] Listening at: unix:/tmp/gunicorn.sock (16)[0m
[34m[2020-07-06 08:39:14 +0000] [16] [INFO] Using worker: gevent[0m
[35m[2020-07-06 08:39:14 +0000] [16] [INFO] Starting gunicorn 19.10.0[0m
[35m[2020-07-06 08:39:14 +0000] [16] [INFO] Listening at: unix:/tmp/gunicorn.sock (16)[0m
[35m[2020-07-06 08:39:14 +0000] [16] [INFO] Using worker: gevent[0m
[34m[2020-07-06 08:39:14 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2020-07-06 08:39:14 +0000] [24] [INFO] Booting worker with pid: 24[0m
[34m[2020-07-06 08:39:14 +0000] [25] [INFO] Booting worker with pid: 25[0m
[34m[2020-07-06 08:39:14 +0000] [29] [INFO] Booting worker with pid: 29[0m
[35m[2020-07-06 08:39:14 +0000] [23] [INFO] Booting worker with pid: 23[0m
[35m[2020-07-06 08:39:14 +0000] [24] [INFO] Booting worker with pid: 24[0m
[35m[2020-07-06 08:39:14 +0000] [25] [IN

In [25]:
# read back from inference
result = pd.read_csv(f's3://{bucket}/{XGBOOST_INFERENCE_FOLDER_S3}/final_test.csv.out', header=None)
result = pd.concat([mailout_test_LNR, result], axis=1)
result.columns=['LNR', 'RESPONSE']

In [26]:
result.to_csv('submission.csv', index=False)

In [27]:
result

Unnamed: 0,LNR,RESPONSE
0,1754,0.019192
1,1770,0.010830
2,1465,0.007136
3,1470,0.026201
4,1478,0.019192
...,...,...
42828,67615,0.007136
42829,67938,0.026201
42830,67942,0.006280
42831,67949,0.007136


# Experiments

In [35]:
test_col = set(mailout_test.columns)

In [36]:
train_col = set(mailout_PCA.columns)

In [37]:
len(test_col.intersection(train_col))

437