# IMPORT

In [1]:
!pip install seaborn==0.10.1
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/ec2-user/anaconda3/envs/amazonei_mxnet_p36/lib/python3.6/site-packages (0.23.1)


In [1]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import psutil
import re

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin

# magic word for producing visualizations in notebook
%matplotlib inline

# CUSTOM FUNCTIONS

In [2]:
from utils.custom_transformers import cleaning, naning

# DATA STORAGE TO S3

In [3]:
import sagemaker

# Session must be called to obtain a session object; without the parentheses
# `session` would merely be bound to the Session class itself.
session = sagemaker.session.Session()
# Name of the S3 bucket used for every dataset path below
bucket = 'auto-ml-exploration'

In [13]:
# Destination prefix on S3 that receives every local CSV extract
s3_dataset_path = f's3://{bucket}/dataset'

# Upload the local extracts one after another, in this fixed order
for local_csv in (
    'Udacity_AZDIAS_052018.csv',
    'Udacity_CUSTOMERS_052018.csv',
    'Udacity_MAILOUT_052018_TEST.csv',
    'Udacity_MAILOUT_052018_TRAIN.csv',
):
    sagemaker.s3.S3Uploader.upload(local_csv, s3_dataset_path)

# LOAD DATA

## Data location

In [5]:
# S3 URIs of the two CSV files read by the loading cells below
pop_dataset_loc, customers_dataset_loc = (
    f's3://{bucket}/dataset/Udacity_AZDIAS_052018.csv',
    f's3://{bucket}/dataset/Udacity_CUSTOMERS_052018.csv',
)

## Partial loading

In [6]:
# Partial load of the population file.
# Loading the complete file ran into a memory issue, so only every second line
# is read: the callable is truthy for odd line numbers (those are skipped),
# while line 0 - the header - is kept.
# Idea for later: convert some columns to categorical to reduce memory usage.
pop_df_partial = pd.read_csv(
    pop_dataset_loc,
    sep=';',
    skiprows=lambda line_no: line_no % 2,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Same every-second-line sampling for the customers file (odd line numbers are skipped)
customers_df_partial = pd.read_csv(
    customers_dataset_loc,
    sep=';',
    skiprows=lambda line_no: line_no % 2,
)

## Mini loading

In [None]:
# First 100 rows of each partial frame, for quick experiments.
pop_df_mini = pop_df_partial.iloc[0:100, :]
# Bind the customer slice to its own name so customers_df_partial keeps all of
# its rows and re-running this cell does not shrink it.
customers_df_mini = customers_df_partial.iloc[0:100, :]

## Full loading

In [None]:
# Full load of both files, passing explicit per-column dtypes.
# NOTE(review): cat_col is not defined in any cell shown here - confirm where it is built.
pop_df = pd.read_csv(pop_dataset_loc, sep=';', dtype=cat_col)

# The customers dtype mapping is a copy of cat_col plus three extra categorical entries
cat_col_customers = cat_col.copy()
for extra_col in ('CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP'):
    cat_col_customers[extra_col] = 'category'

customers_df = pd.read_csv(
    customers_dataset_loc,
    sep=';',
    dtype=cat_col_customers,
)

## Data selection

In [7]:
# Use the partially loaded population frame as the working dataset.
# This binds a second name to the same object; no copy is made.
pop_df_selected = pop_df_partial

In [4]:
# Alternative source for the working dataset: a frame stored in a local pickle file.
# NOTE(review): this rebinds pop_df_selected, replacing the assignment from
# pop_df_partial; which of the two is intended depends on execution order.
# Only unpickle files from a trusted source.
pop_df_selected = pd.read_pickle('pop_df_exploration.pkl')

In [None]:
# Use the partially loaded customers frame as the working customers dataset (same object, no copy).
customers_df_selected = customers_df_partial

# Clean Data

The following data cleaning is performed based on the insights gained from the data exploration notebook.

In [6]:
# First pass: run the custom `naning` transformer over every column.
all_columns = pop_df_selected.columns

clean_pipeline = make_pipeline(
    naning(attribute_filepath='DIAS Attributes - Values 2017.xlsx')
)

# A single entry that routes all columns through the pipeline
transformers = [('all', clean_pipeline, all_columns)]

combined_pipe = ColumnTransformer(transformers, remainder='drop')

In [7]:
# Fit the ColumnTransformer on the working frame and keep its transformed output.
transformed_data = combined_pipe.fit_transform(pop_df_selected)

88 replacements made


In [8]:
# Second pass: same layout as before, but with the custom `cleaning` transformer.
# Note that this rebinds combined_pipe to a new ColumnTransformer that is not
# fitted in this cell.
all_columns = pop_df_selected.columns

clean_pipeline = make_pipeline(
    cleaning(attribute_filepath='DIAS Attributes - Values 2017.xlsx')
)

# A single entry that routes all columns through the pipeline
transformers = [('all', clean_pipeline, all_columns)]

combined_pipe = ColumnTransformer(transformers, remainder='drop')

In [None]:
def get_feature_out(estimator, feature_in):
    """Return the output feature names of a single fitted transformer.

    Parameters
    ----------
    estimator : object
        A fitted transformer (or pipeline step).
    feature_in : sequence of str
        Names of the features fed into ``estimator``.

    Returns
    -------
    sequence of str
        * vectorizers: their own feature names, each prefixed with ``vec_``;
        * estimators exposing ``get_feature_names``: the result of
          ``get_feature_names(feature_in)``;
        * feature selectors: the entries of ``feature_in`` kept by
          ``get_support()``;
        * estimators exposing only ``get_feature_names_out`` (the API that
          replaces ``get_feature_names`` in newer scikit-learn releases): the
          result of that method;
        * anything else: ``feature_in`` unchanged.
    """
    if hasattr(estimator, 'get_feature_names'):
        if isinstance(estimator, _VectorizerMixin):
            # vectorizers derive names from their vocabulary, not from the input names
            return [f'vec_{f}' for f in estimator.get_feature_names()]
        return estimator.get_feature_names(feature_in)

    if isinstance(estimator, SelectorMixin):
        # boolean support mask picks the surviving input names
        return np.array(feature_in)[estimator.get_support()]

    if hasattr(estimator, 'get_feature_names_out'):
        # Fallback for scikit-learn versions where get_feature_names no longer
        # exists; checked last so behaviour on older versions is unchanged.
        if isinstance(estimator, _VectorizerMixin):
            return [f'vec_{f}' for f in estimator.get_feature_names_out()]
        return estimator.get_feature_names_out(feature_in)

    # estimator gives no naming information: assume names pass through unchanged
    return feature_in


def get_ct_feature_names(ct):
    """Collect the output feature names of a fitted ColumnTransformer.

    Walks ``ct.transformers_`` in order and concatenates the names produced by
    each entry:

    * entries whose transformer is the string ``'drop'`` emit no columns and
      therefore contribute no names;
    * a Pipeline is resolved step by step, feeding each step's output names
      into the next step via ``get_feature_out``;
    * any other named transformer is resolved with a single
      ``get_feature_out`` call;
    * a ``'passthrough'`` remainder is resolved by indexing the input column
      names recorded on ``ct`` at fit time with the remainder's column
      selection.

    Parameters
    ----------
    ct : fitted ColumnTransformer
        Must expose ``transformers_``.

    Returns
    -------
    list
        Output feature names, in transformer order.

    Raises
    ------
    TypeError
        If the remainder is ``'passthrough'`` but ``ct`` recorded no input
        column names (e.g. it was fitted on a plain array).
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        if isinstance(estimator, str) and estimator == 'drop':
            # dropped columns are absent from the output, so skip their names
            continue

        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif isinstance(estimator, str) and estimator == 'passthrough':
            # private attribute first, then the public attribute name as a fallback
            input_names = getattr(ct, '_feature_names_in', None)
            if input_names is None:
                input_names = getattr(ct, 'feature_names_in_', None)
            if input_names is None:
                raise TypeError(
                    "ColumnTransformer has no recorded input column names; "
                    "cannot name the 'passthrough' remainder columns"
                )
            output_features.extend(input_names[features])

    return output_features

In [None]:
# Re-attach column names to the transformed array
feature_names = get_ct_feature_names(combined_pipe)
reconstructed = pd.DataFrame(transformed_data, columns=feature_names)

In [None]:
# Display the reconstructed frame as the cell output
reconstructed

## Removing insignificant columns - again, after cleaning

In [30]:
# Run the column filter on the working frame with a threshold of 0.60.
# NOTE(review): remove_insignificant_columns is neither defined nor imported in
# the cells shown here - confirm its origin, what `thresh` measures, and whether
# it modifies the frame in place.
remove_insignificant_columns(pop_df_selected, thresh = 0.60)

[]

## Moving to scikit-learn: column names and index will be lost, since scikit-learn works with arrays

In [46]:
# Remember the working frame's column labels and row index
pop_columns, pop_index = pop_df_selected.columns, pop_df_selected.index

# customers_columns = customers_df_selected.columns
# customers_index = customers_df_selected.index


## Replacing NaN with most frequent value

In [43]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the most frequent value of its column.
imp_frequent = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputed_values = imp_frequent.fit_transform(pop_df_selected)

# fit_transform returns a bare array, which has no .columns and cannot be
# addressed by column name. Rebuild a DataFrame so pop_df_selected stays usable
# as a frame. SimpleImputer discards columns without any observed value (their
# fitted statistic is NaN), so only the surviving labels are re-attached.
kept_columns = pop_df_selected.columns[~pd.isna(imp_frequent.statistics_)]
pop_df_selected = pd.DataFrame(
    imputed_values,
    columns=kept_columns,
    index=pop_df_selected.index,
)

## Identifying categorical vs numeric for further post-treatment

In [52]:
# Split the columns into numeric ones and everything else.
# NOTE(review): identify_numeric and filename are not defined in the cells shown
# here - the original comment says the numeric list is based on the Excel file; confirm.
pop_df_numeric = identify_numeric(filename, pop_df_selected)

# Every column that is not reported as numeric is treated as categorical
pop_df_cat = []
for col in pop_df_selected.columns:
    if col not in pop_df_numeric:
        pop_df_cat.append(col)

In [58]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder

# One single-step pipeline per column family.
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than
# scaling each feature (column) - confirm this is intended as opposed to a
# per-feature scaler.
numeric_pipeline = make_pipeline(Normalizer())
# One-hot encodes the categorical columns with the encoder's default settings.
categorical_pipeline = make_pipeline(OneHotEncoder())

In [59]:
from sklearn.compose import ColumnTransformer

# Route the categorical columns through categorical_pipeline and the numeric
# columns through numeric_pipeline; the remainder option is left at its default.
ct = ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_pipeline, pop_df_cat),
        ('numeric_transformer', numeric_pipeline, pop_df_numeric),
    ]
)

In [60]:
# Fit the ColumnTransformer and rebind pop_df_selected to its output.
# NOTE(review): because the input name is overwritten, re-running this cell
# feeds the already-transformed output back into the transformer.
pop_df_selected = ct.fit_transform(pop_df_selected)

In [61]:
# Dimensions of the transformed output
pop_df_selected.shape

(363212, 1755)

In [62]:
# Display whatever object is currently bound to the name `copy`.
# NOTE(review): `copy` is neither assigned nor imported in the cells shown here -
# confirm what it refers to.
copy

Unnamed: 0_level_0,AKT_DAT_KL,ALTER_HH,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_TITEL,ARBEIT,BALLRAUM,CAMEO_INTL_2015,...,UMFELD_JUNG,UNGLEICHENN_FLAG,VERS_TYP,VHA,VHN,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,ALTERSKATEGORIE_GROB
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
910220,9.0,,11.0,0.0,0.0,2.0,0.0,3.0,6.0,51.0,...,3.0,1.0,2.0,0.0,4.0,3.0,9.0,4.0,5.0,1.0
910226,1.0,13.0,1.0,0.0,0.0,0.0,0.0,2.0,4.0,12.0,...,5.0,0.0,1.0,1.0,0.0,,9.0,7.0,3.0,4.0
910244,1.0,10.0,5.0,0.0,0.0,1.0,0.0,2.0,6.0,54.0,...,3.0,0.0,2.0,0.0,2.0,6.0,9.0,7.0,4.0,1.0
910261,1.0,14.0,6.0,0.0,0.0,1.0,0.0,2.0,2.0,14.0,...,4.0,0.0,1.0,0.0,2.0,5.0,9.0,1.0,1.0,1.0
645153,5.0,17.0,9.0,0.0,0.0,1.0,0.0,2.0,6.0,15.0,...,5.0,0.0,2.0,0.0,4.0,4.0,3.0,7.0,4.0,3.0
645169,,,,,,,,,,,...,,,,,,,,,3.0,2.0
612561,8.0,20.0,2.0,0.0,0.0,1.0,0.0,3.0,7.0,33.0,...,3.0,0.0,2.0,0.0,,6.0,3.0,5.0,5.0,1.0
612569,9.0,11.0,1.0,0.0,0.0,1.0,0.0,4.0,1.0,41.0,...,2.0,0.0,2.0,0.0,4.0,6.0,4.0,3.0,3.0,4.0
612577,,,,,,,,,,,...,,,,,,,,,3.0,2.0
612592,9.0,,1.0,0.0,0.0,2.0,0.0,4.0,6.0,34.0,...,4.0,0.0,2.0,0.0,4.0,5.0,6.0,1.0,2.0,3.0


In [54]:
# Drop four columns from the working frame.
# The result is rebound instead of using inplace=True, so any other name that
# still refers to the same frame (pop_df_selected is assigned from another
# frame without copying) is left untouched.
pop_df_selected = pop_df_selected.drop(
    ['ALTERSKATEGORIE_FEIN', 'EINGEFUEGT_AM', 'EINGEZOGENAM_HH_JAHR', 'VERDICHTUNGSRAUM'],
    axis=1,
)

In [51]:
# List the column names that contain the substring 'LEBEN'
[col for col in pop_df_selected.columns if 'LEBEN' in col]

['D19_LEBENSMITTEL', 'LP_LEBENSPHASE_FEIN']

## Part 1: Customer Segmentation Report

The main bulk of your analysis will come in this part of the project. Here, you should use unsupervised learning techniques to describe the relationship between the demographics of the company's existing customers and the general population of Germany. By the end of this part, you should be able to describe parts of the general population that are more likely to be part of the mail-order company's main customer base, and which parts of the general population are less so.

## Part 2: Supervised Learning Model

Now that you've found which parts of the population are more likely to be customers of the mail-order company, it's time to build a prediction model. Each of the rows in the "MAILOUT" data files represents an individual that was targeted for a mailout campaign. Ideally, we should be able to use the demographic information from each individual to decide whether or not it will be worth it to include that person in the campaign.

The "MAILOUT" data has been split into two approximately equal parts, each with almost 43 000 data rows. In this part, you can verify your model with the "TRAIN" partition, which includes a column, "RESPONSE", that states whether or not a person became a customer of the company following the campaign. In the next part, you'll need to create predictions on the "TEST" partition, where the "RESPONSE" column has been withheld.

In [None]:
# Load the MAILOUT training partition (semicolon-separated).
# NOTE(review): this relative path differs from the local file name and the S3
# prefix used for the same CSV in the upload cell - confirm the file exists here.
mailout_train = pd.read_csv('../../data/Term2/capstone/arvato_data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';')

## Part 3: Kaggle Competition

Now that you've created a model to predict which individuals are most likely to respond to a mailout campaign, it's time to test that model in competition through Kaggle. If you click on the link [here](http://www.kaggle.com/t/21e6d45d4c574c7fa2d868f0e8c83140), you'll be taken to the competition page where, if you have a Kaggle account, you can enter. If you're one of the top performers, you may have the chance to be contacted by a hiring manager from Arvato or Bertelsmann for an interview!

Your entry to the competition should be a CSV file with two columns. The first column should be a copy of "LNR", which acts as an ID number for each individual in the "TEST" partition. The second column, "RESPONSE", should be some measure of how likely each individual became a customer – this might not be a straightforward probability. As you should have found in Part 2, there is a large output class imbalance, where most individuals did not respond to the mailout. Thus, predicting individual classes and using accuracy does not seem to be an appropriate performance evaluation method. Instead, the competition will be using AUC to evaluate performance. The exact values of the "RESPONSE" column do not matter as much: only that the higher values try to capture as many of the actual customers as possible, early in the ROC curve sweep.

In [None]:
# Load the MAILOUT test partition (semicolon-separated).
# NOTE(review): this relative path differs from the local file name and the S3
# prefix used for the same CSV in the upload cell - confirm the file exists here.
mailout_test = pd.read_csv('../../data/Term2/capstone/arvato_data/Udacity_MAILOUT_052018_TEST.csv', sep=';')

# Complete data pipeline

# EXPERIMENTATION

In [8]:
# Inspect a single column of the working frame
pop_df_selected['D19_BANKEN_DATUM']

0         10
1         10
2         10
3         10
4          8
5         10
6          8
7         10
8         10
9         10
10        10
11         1
12        10
13        10
14         7
15        10
16         8
17        10
18        10
19         9
20         4
21        10
22        10
23        10
24        10
25        10
26        10
27        10
28        10
29        10
          ..
363182    10
363183    10
363184    10
363185    10
363186    10
363187     8
363188    10
363189    10
363190    10
363191     8
363192     5
363193     9
363194    10
363195    10
363196     5
363197    10
363198    10
363199    10
363200    10
363201     8
363202    10
363203    10
363204     1
363205    10
363206    10
363207    10
363208     2
363209    10
363210    10
363211    10
Name: D19_BANKEN_DATUM, Length: 363212, dtype: int64