# IMPORT

In [3]:
!pip install seaborn==0.10.1
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/ec2-user/anaconda3/envs/amazonei_mxnet_p36/lib/python3.6/site-packages (0.23.1)


In [4]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import psutil
import re

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin

# magic word for producing visualizations in notebook
%matplotlib inline

# CUSTOM FUNCTIONS

In [5]:
from utils.custom_transformers import cleaning

# DATA STORAGE TO S3

In [6]:
import sagemaker

# Create a Session instance; binding the bare class (no parentheses) would make
# every later instance-method call on `session` fail for lack of `self`.
session = sagemaker.session.Session()
# Name of the S3 bucket that holds the datasets used below.
bucket = 'auto-ml-exploration'

# LOAD DATA

## Data location

In [7]:
# S3 URI of the general-population CSV, built from the bucket name defined above
pop_dataset_loc = f's3://{bucket}/dataset/Udacity_AZDIAS_052018.csv'

## Partial loading

In [8]:
# Partial load: reading the complete file caused a memory issue, so only one
# line out of two is read. The skiprows callable is truthy for odd line
# indices, which are skipped; line 0 (the header) and even lines are kept.
# TODO: check whether some columns can be converted to categorical to reduce
# memory usage.
pop_df_partial = pd.read_csv(
    pop_dataset_loc,
    sep=';',
    skiprows=lambda line_idx: line_idx % 2,
)

  interactivity=interactivity, compiler=compiler, result=result)


## Data selection

In [9]:
# No subsetting is applied here: pop_df_selected is a second name for the same
# DataFrame object as pop_df_partial (no copy is made).
pop_df_selected = pop_df_partial

# Clean Data

The following data cleaning is based on the insights gained from the data exploration notebook.

In [10]:
# The cleaning step is applied to every column of the selected frame.
all_columns = pop_df_selected.columns

clean_pipeline = make_pipeline(
    cleaning(attribute_filepath='DIAS Attributes - Values 2017.xlsx'),
)

# A single transformer entry that is handed all columns; anything not listed
# in an entry would be discarded because of remainder='drop'.
transformers = [('all', clean_pipeline, all_columns)]

combined_pipe = ColumnTransformer(transformers=transformers, remainder='drop')

In [11]:
# Fit the column transformer on the selected population data and transform it.
# NOTE(review): the output recorded for this cell ends with
# "TypeError: unhashable type: 'list'", so transformed_data was not assigned in
# that run; the failure presumably originates inside the `cleaning` step - confirm.
transformed_data = combined_pipe.fit_transform(pop_df_selected)

columns
ALTER_KIND1
ALTER_KIND2
ALTER_KIND3
ALTER_KIND4
EXTSEL992
KK_KUNDENTYP
will be dropped because theycontain a number of nan above 60.0%
columns
PRAEGENDE_JUGENDJAHRE
ANZ_STATISTISCHE_HAUSHALTE
LP_LEBENSPHASE_GROB
FINANZ_SPARER
FINANZ_SPARER
CJT_TYP_5
CJT_TYP_5
D19_BANKEN_ONLINE_QUOTE_12
D19_BANKEN_ONLINE_DATUM
D19_VERSAND_ANZ_24
ONLINE_AFFINITAET
VK_DHT4A
D19_VERSAND_OFFLINE_DATUM
D19_VERSAND_ONLINE_DATUM
D19_VERSAND_ONLINE_QUOTE_12
D19_VERSAND_DATUM
D19_VERSAND_ONLINE_DATUM
D19_TELKO_ANZ_24
D19_VERSAND_ONLINE_QUOTE_12
D19_VERSAND_ONLINE_QUOTE_12
D19_VERSAND_ONLINE_DATUM
D19_VERSI_ANZ_24
STRUKTURTYP
FINANZ_SPARER
LP_STATUS_GROB
FINANZ_UNAUFFAELLIGER
GEBAEUDETYP_RASTER
VERS_TYP
KBA05_MAXAH
KBA05_SEG6
MOBI_REGIO
KBA05_KRSAQUOT
KBA05_SEG2
KBA05_MOTOR
KBA05_SEG6
MOBI_REGIO
KBA05_KRSHERST1
KBA05_KRSHERST2
KBA05_KRSHERST3
KBA05_SEG6
KBA05_SEG6
KBA05_SEG9
KBA05_ZUL4
KBA05_SEG2
KBA05_SEG5
KBA05_SEG6
KBA05_SEG6
KBA05_SEG5
KBA05_SEG4
KBA05_SEG3
KBA05_SEG2
KBA05_SEG9
KBA05_SEG6
KBA05_SEG6


TypeError: unhashable type: 'list'

In [None]:
def get_feature_out(estimator, feature_in):
    """Return the feature names that come out of a single transformer step.

    Parameters
    ----------
    estimator : the step to inspect
    feature_in : names of the features fed into that step

    Returns
    -------
    - a list of 'vec_'-prefixed names when the step is a text vectorizer,
    - ``estimator.get_feature_names(feature_in)`` for any other step that
      exposes ``get_feature_names``,
    - the entries of ``feature_in`` selected by ``get_support()`` (as a numpy
      array) when the step is a feature selector,
    - ``feature_in`` unchanged in every other case.
    """
    if not hasattr(estimator, 'get_feature_names'):
        if isinstance(estimator, SelectorMixin):
            # keep only the input names flagged by the selector's support mask
            names_in = np.array(feature_in)
            return names_in[estimator.get_support()]
        # the step exposes no naming information: names pass through untouched
        return feature_in

    if isinstance(estimator, _VectorizerMixin):
        # vectorizers build their own vocabulary; the input names are not used
        return [f'vec_{name}' for name in estimator.get_feature_names()]

    return estimator.get_feature_names(feature_in)


def get_ct_feature_names(ct):
    """Collect the output feature names of a ColumnTransformer.

    Walks ``ct.transformers_`` in order. For a Pipeline entry the names are
    threaded through every step with get_feature_out; for any other entry
    get_feature_out is called once on the entry's columns.

    Entries whose transformer is the string 'drop' contribute no names: the
    ColumnTransformer emits no columns for them, so listing their input
    columns would misalign the names with the transformed data.

    For remainder='passthrough' the names are looked up in the private
    ``ct._feature_names_in`` attribute.
    NOTE(review): presumably that attribute only holds names when the
    transformer was fitted on a DataFrame - confirm for the installed
    scikit-learn version.
    A remainder that is itself an estimator contributes no names.

    Parameters
    ----------
    ct : ColumnTransformer exposing ``transformers_``

    Returns
    -------
    list of output feature names, in the order of ``ct.transformers_``
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        if isinstance(estimator, str) and estimator == 'drop':
            # dropped columns never reach the output (also covers remainder='drop')
            continue

        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                # feed each step's output names into the next step
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif isinstance(estimator, str) and estimator == 'passthrough':
            # remainder columns are stored as indices; map them back to names
            output_features.extend(ct._feature_names_in[features])

    return output_features

In [None]:
# Wrap the transformed array in a DataFrame again, labelling its columns with
# the names derived from the fitted column transformer.
reconstructed = pd.DataFrame(
    data=transformed_data,
    columns=get_ct_feature_names(combined_pipe),
)

In [None]:
# Show the DataFrame rebuilt from the transformed data as the cell output.
reconstructed

# EXPERIMENTATION

In [16]:
def split_cameo(df, column):
    """
    Split a two-digit code column into two single-digit columns.

    Every value of `column` is read as a number; its tens digit is written to
    "CAMEO1" and its units digit to "CAMEO2". Values that cannot be split
    (missing, non-numeric, or outside 10..99) yield NaN in both new columns
    instead of aborting the whole operation.

    Parameters:
    -----------
    df (pandas.DataFrame) : Dataframe holding the column to split (left unmodified)
    column (str) : name of column to be split

    Returns:
    --------
    pandas.DataFrame : new frame without `column`, with "CAMEO1" and "CAMEO2"
        appended. Both are int64 when every row could be split, otherwise
        float with NaN for the rows that could not.

    Raises:
    -------
    KeyError : if `column` is not present in df
    """
    # Coerce instead of raising so that missing values and non-numeric
    # placeholders become NaN rather than crashing on the first bad row.
    codes = pd.to_numeric(df[column], errors='coerce').astype('float64')
    codes = np.trunc(codes)  # drop any fractional part, e.g. 51.0 -> 51

    # Only 10..99 can be split into exactly two digits.
    codes = codes.where(codes.between(10, 99))

    digits = pd.DataFrame(
        {"CAMEO1": (codes // 10).values, "CAMEO2": (codes % 10).values},
        index=df.index,
    )
    if not codes.isna().any():
        # nothing is missing, so plain integer columns can be returned
        digits = digits.astype('int64')

    # drop() returns a new frame, so the caller's DataFrame is not mutated
    return df.drop(column, axis=1).join(digits)

In [15]:
# Replace CAMEO_INTL_2015 in the selected population frame with its split columns.
# NOTE(review): this cell rebinds pop_df_selected from itself and split_cameo drops
# the source column, so a second run will presumably fail on the missing column.
# The recorded execution counts also show this cell (15) ran before the cell that
# defines split_cameo (16) - confirm the notebook survives Restart & Run All.
pop_df_selected = split_cameo(pop_df_selected, 'CAMEO_INTL_2015')  

In [12]:
# Apply the same split to the frame rebuilt from the transformed data; the result
# is not assigned to a name, so it is only shown as the cell output.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'split_cameo' is not defined" - the defining cell must run first.
split_cameo(reconstructed, 'CAMEO_INTL_2015')  

NameError: name 'split_cameo' is not defined