# Preprocess Identity file and encode it
This note book rewrite the notebook "step_2_preprocess_identity" and add encodding.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import preprocessing
from tqdm import tqdm

#  Read Dataset

In [2]:
train_identity = pd.read_csv('./datasets/train_identity.csv')

# Substute Label Features
1. Split/Extract Features
- Drop original features

In [3]:
import re

def get_os(info):
    version = get_os_version(info)
    
    if version is np.nan or version is None:
        return info
    
    return info.strip(version).strip()


def get_os_version(info):
    
    results = re.findall('\d+\.\d\.*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+\_\d+\_*\d*', info)
    if len(results) > 0:
        return results[0]
    
    results = re.findall('\d+', info)
    if len(results) > 0:
        return results[0]
    
    return np.nan


def split_browser_info(info):
    
    if info is np.nan:
        return pd.Series([np.nan, np.nan, np.nan])
    
    known_developers = ['android browser',
                       'chrome',
                       'edge',
                        'firefox',
                        'google search application',
                        'google',
                        'Generic/Android',
                       'ie',
                        'line',
                        'Microsoft',
                        'opera',
                        'samsung',
                       'safari',
                        'ZTE/Blade',
                       'other']
    
    developer = np.nan
    version = np.nan
    
    # Grep version
    results = re.findall('\d+\.\d+', info)
    if len(results) > 0:
        str_version = results[0]
        version = float(str_version)
        info = info.replace(str_version, '')
        
    # Grep Developer
    for d in known_developers:
        if d in info:
            developer = d
            info = info.replace(d, '')
    
    # Remove 'for'
    if 'for' in info:
        info = info.replace('for', '')
    
    # remove consecutive whitespaces and keep only 1 whitespace.
    info = ' '.join(info.split())
    if len(info.strip()) == 0:
        info = np.nan
    
    if developer is np.nan:
        developer = info
        info = np.nan
        
    return pd.Series([developer, version, info])


def get_manufacture(device_info, return_nan=True):
    
    if device_info is None:
        return device_info
    
    starts_dict = {'0P': 'HTC',
                   '2P': 'HTC',
                   '40': 'ALCATEL',
                   '50': 'ALCATEL',
                   '60': 'ALCATEL',
                   '70': 'ALCATEL',
                   '80': 'ALCATEL',
                   '90': 'ALCATEL',
                   'Android': 'os-Android', #############
                   'ASUS': 'ASUS',
                   'Aquaris': 'Aquaris',
                   'ALCATEL': 'ALCATEL',
                   'AX': 'Bmobile', # Mexico
                   'B1-': 'Acer',
                   'B3-': 'Acer',
                   'BB': 'BlackBerry',
                   'BLADE': 'ZTE',
                   'BLN': 'HUAWEI',
                   'BLU': 'BLU', # American
                   'BND': 'HUAWEI',
                   'Build/': 'factory-image', ###############
                   'BV': 'Blackview',
                   'C1': 'sony',
                   'C2': 'sony',
                   'C6': 'sony',
                   'D2': 'sony',
                   'D5': 'sony',
                   'D6': 'sony',
                   'E2': 'sony',
                   'E501': 'Hyundai',
                   'E53': 'sony',
                   'E55': 'sony',
                   'E56': 'sony',
                   'E58': 'sony',
                   'E6': 'sony',
                   'E8': 'sony',
                   'en-': 'encoding',  ####################
                   'es-': 'encoding',  ####################
                   'F3': 'sony',
                   'F5': 'sony',
                   'F80': 'F2-mobile',
                   'F81': 'sony',
                   'F83': 'sony',
                   'H1': 'sony',
                   'H3': 'sony',
                   'H5': 'sony',
                   'IdeaTab': 'Lenovo',
                   'G255': 'Hyundai',
                   'G527': 'HUAWEI',
                   'G620': 'HUAWEI',
                   'G630': 'HUAWEI',
                   'G814': 'sony',
                   'G8341': 'sony',
                   'G3': 'sony',
                   'GT-': 'samsung',
                   'HTC': 'HTC',
                   'HUAWEI': 'HUAWEI',
                   'Hisense': 'Hisense',
                   'Ilium': 'Lanix', # Italian Company
                   'iOS': 'os-ios', ###################
                   'iPhone': 'iPhone', #######################
                   'iris': 'Lava',
                   'K1': 'koobee',
                   'K8': 'koobee',
                   'K9': 'koobee',
                   'Linux': 'os-linux', ######################
                   'KF': 'Amazon',  # Kindle Fire
                   'Lenovo': 'Lenovo',
                   'LG': 'LG',
                   'M431': 'Morphe',
                   'M4': 'M4',
                   'MacOS': 'MacOS',
                   'Mi ': 'xiaomi',
                   'Microsoft': 'Microsoft',  #############
                   'Moto': 'Moto',
                   'Nexus': 'google',
                   'ONE': 'ONEPLUS',
                   'P0': 'Asus',
                   'P4': 'Polaroid',
                   'P5': 'Polaroid',
                   'Pixel': 'google',
                   'Q': 'verizon',
                   'Redmi': 'xiaomi',
                   'RCT': 'RCA',
                   'rv:': 'firefox user-agent', ######################
                   'SAMSUNG': 'samsung',
                   'SCH-': 'samsung',
                   'SGH-': 'samsung',
                   'SGP': 'sony',
                   'SLA': 'HUAWEI',
                   'SPH-': 'samsung',
                   'SM-': 'samsung',
                   'STV100': 'BlackBerry',
                   'TA-': 'karbonn',
                   'verykool': 'verykool',
                   'VK': 'LG',
                   'VS': 'LG',
                   'Win': 'Windows',
                   'XT1': 'Motorola',
                   'Z410': 'Acer',
                   'Z5': 'ZTE',
                   'Z7': 'ZTE',
                   'Z8': 'ZTE',
                   'Z9 PLUS': 'QMobile',
                   'Z95': 'ZTE',
                   'Z96': 'ZTE',
                   'Z97': 'ZTE',
                   'Z98': 'ZTE',
                   'ZA': 'Zonda',
                   'ZTE': 'ZTE'}
    
    substring_dict = {'HUAWEI': 'HUAWEI',
                     'Build': 'substring-build'
                     }
    
    for prefix, manufacture in starts_dict.items():
        if device_info.lower().startswith(prefix.lower()):
            return manufacture
        
        
    for substring, manufacture in substring_dict.items():
        if substring.lower() in device_info.lower():
            return manufacture

    if return_nan:
        return
    else:
        return device_info
    
    
def insert_feature(feature_org, feature_new, extractor, df):
    """
    Args:
        feature_new (string):
        feature_org (string):
        df (pandas.DataFrame):
    """
    
    


extractors = [
              ['id_30', 'id_30_os', get_os],
              ['id_30', 'id_30_os_version', get_os_version],
              ['id_31', 'id_31_developer', lambda x: split_browser_info(x)[0]],
              ['id_31', 'id_31_version', lambda x: split_browser_info(x)[1]],
              ['id_31', 'id_31_annotation', lambda x: split_browser_info(x)[2]],
              ['id_33', 'id_33_width', lambda s: float(s.split('x')[0])],
              ['id_33', 'id_33_height', lambda s: float(s.split('x')[1])],
              ['DeviceInfo', 'DeviceInfo_map', lambda x: get_manufacture(x, return_nan=False)]
             ]

def insert_features(extractors, df):
    """
    Extract features of id_30, id_31, id_33, id_34.
    """
    feature_list = []
    
    for feature_org, feature_new, extractor in extractors:
        print('Extract features: {}'.format(feature_org))
        
        feature_list.append(feature_org)
        
        # get index of original feature
        index_feature_org = df.columns.get_loc(feature_org)

        # insert feature
        %time df.insert(index_feature_org+1, \
                    feature_new, \
                     df[feature_org].map(extractor, na_action='ignore'))
    
    
    return feature_list


feature_list = insert_features(extractors, train_identity)

Extract features: id_30
CPU times: user 365 ms, sys: 0 ns, total: 365 ms
Wall time: 364 ms
Extract features: id_30
CPU times: user 310 ms, sys: 0 ns, total: 310 ms
Wall time: 309 ms
Extract features: id_31
CPU times: user 17 s, sys: 19.8 ms, total: 17.1 s
Wall time: 17 s
Extract features: id_31
CPU times: user 17 s, sys: 12 ms, total: 17 s
Wall time: 17 s
Extract features: id_31
CPU times: user 17 s, sys: 20 ms, total: 17 s
Wall time: 17 s
Extract features: id_33
CPU times: user 57.5 ms, sys: 3.99 ms, total: 61.5 ms
Wall time: 61.5 ms
Extract features: id_33
CPU times: user 62.1 ms, sys: 0 ns, total: 62.1 ms
Wall time: 62.1 ms
Extract features: DeviceInfo
CPU times: user 3.23 s, sys: 0 ns, total: 3.23 s
Wall time: 3.23 s


In [4]:
feature = 'DeviceInfo'
index_col = train_identity.columns.get_loc(feature)
print(train_identity.dtypes[index_col: index_col+4])
train_identity.iloc[:5, index_col: index_col+4]

DeviceInfo        object
DeviceInfo_map    object
dtype: object


Unnamed: 0,DeviceInfo,DeviceInfo_map
0,SAMSUNG SM-G892A Build/NRD90M,samsung
1,iOS Device,os-ios
2,Windows,Windows
3,,
4,MacOS,MacOS


# Drop Original Features

In [5]:
feature_list

['id_30', 'id_30', 'id_31', 'id_31', 'id_31', 'id_33', 'id_33', 'DeviceInfo']

In [6]:

# def drop_features(feature_list, df):
#     df = df.drop(columns=feature_list)
    
    
# drop_features(feature_list, train_identity)

train_identity = train_identity.drop(columns=feature_list)

In [7]:
train_identity.columns

Index(['TransactionID', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06',
       'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14',
       'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22',
       'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
       'id_30_os_version', 'id_30_os', 'id_31_annotation', 'id_31_version',
       'id_31_developer', 'id_32', 'id_33_height', 'id_33_width', 'id_34',
       'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo_map'],
      dtype='object')

# Encode Labels
- Encode All Features with Object Type

In [8]:
def get_string_features(df):
    """
    Get features with numpy.dtype of 'object'
    """
    string_features = []
    for feature in df.columns:
        if df[feature].dtype == np.dtype('object'):
            string_features.append(feature)
            
    return string_features

    
string_features = get_string_features(train_identity)

print(string_features)


['id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30_os_version', 'id_30_os', 'id_31_annotation', 'id_31_developer', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo_map']


In [9]:
def transform_label(label, encoder):
    if label in encoder.classes_:
        return encoder.transform([label])[0]
    else:
        return -1
    

def encode_features(string_features, df, encoder_dict=None):
    """
    Encode features and insert back to datafame.
    """
    if encoder_dict is None:
        encoder_dict = {}
        create_encoders = True
    else:
        create_encoders = False
        
    
    for feature in tqdm(string_features):
        print('Encode {}'.format(feature))
        # Get index for inserting encoded feature
        index_col = df.columns.get_loc(feature)
           
        if create_encoders:
            # Encode Feature and insert
            encoder = preprocessing.LabelEncoder()
            encoder.fit(df[feature].dropna().unique())
            
            # add encoder to dictionary
            encoder_dict[feature] = encoder
        else:
            encoder = encoder_dict[feature]
            
        
        # Transform
        df.insert(index_col+1,
                 '{}_encoded'.format(feature),
                  df[feature].map(lambda x: transform_label(x, encoder),
                                                  na_action='ignore'))
        
        
    return encoder_dict
        

%time encoder_dict = encode_features(string_features, train_identity)
# encoder_dict = encode_features(['id_15'], train_identity)

  0%|          | 0/18 [00:00<?, ?it/s]

Encode id_12


  6%|▌         | 1/18 [00:08<02:23,  8.42s/it]

Encode id_15


 11%|█         | 2/18 [00:17<02:16,  8.52s/it]

Encode id_16


 17%|█▋        | 3/18 [00:24<02:03,  8.24s/it]

Encode id_23


 22%|██▏       | 4/18 [00:25<01:22,  5.87s/it]

Encode id_27


 28%|██▊       | 5/18 [00:25<00:54,  4.21s/it]

Encode id_28


 33%|███▎      | 6/18 [00:33<01:05,  5.44s/it]

Encode id_29


 39%|███▉      | 7/18 [00:42<01:09,  6.30s/it]

Encode id_30_os_version


 44%|████▍     | 8/18 [01:01<01:43, 10.32s/it]

Encode id_30_os


 50%|█████     | 9/18 [01:08<01:23,  9.23s/it]

Encode id_31_annotation


 56%|█████▌    | 10/18 [01:15<01:08,  8.62s/it]

Encode id_31_developer


 61%|██████    | 11/18 [01:41<01:36, 13.78s/it]

Encode id_34


 67%|██████▋   | 12/18 [01:46<01:07, 11.18s/it]

Encode id_35


 72%|███████▏  | 13/18 [01:54<00:51, 10.27s/it]

Encode id_36


 78%|███████▊  | 14/18 [02:02<00:38,  9.64s/it]

Encode id_37


 83%|████████▎ | 15/18 [02:10<00:27,  9.18s/it]

Encode id_38


 89%|████████▉ | 16/18 [02:19<00:17,  8.86s/it]

Encode DeviceType


 94%|█████████▍| 17/18 [02:27<00:08,  8.67s/it]

Encode DeviceInfo_map


100%|██████████| 18/18 [04:02<00:00, 34.68s/it]

CPU times: user 4min 2s, sys: 75.9 ms, total: 4min 2s
Wall time: 4min 2s





In [10]:
feature = 'id_15'
index = train_identity.columns.get_loc(feature)
train_identity.iloc[:10, index: index+4]

Unnamed: 0,id_15,id_15_encoded,id_16,id_16_encoded
0,New,1.0,NotFound,1.0
1,New,1.0,NotFound,1.0
2,Found,0.0,Found,0.0
3,New,1.0,NotFound,1.0
4,Found,0.0,Found,0.0
5,Found,0.0,Found,0.0
6,,,,
7,Found,0.0,Found,0.0
8,Found,0.0,Found,0.0
9,New,1.0,NotFound,1.0


# Drop Original Features

In [11]:
string_features

['id_12',
 'id_15',
 'id_16',
 'id_23',
 'id_27',
 'id_28',
 'id_29',
 'id_30_os_version',
 'id_30_os',
 'id_31_annotation',
 'id_31_developer',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo_map']

In [12]:
train_identity = train_identity.drop(columns=string_features)

In [13]:
train_identity.columns

Index(['TransactionID', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06',
       'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12_encoded', 'id_13',
       'id_14', 'id_15_encoded', 'id_16_encoded', 'id_17', 'id_18', 'id_19',
       'id_20', 'id_21', 'id_22', 'id_23_encoded', 'id_24', 'id_25', 'id_26',
       'id_27_encoded', 'id_28_encoded', 'id_29_encoded',
       'id_30_os_version_encoded', 'id_30_os_encoded',
       'id_31_annotation_encoded', 'id_31_version', 'id_31_developer_encoded',
       'id_32', 'id_33_height', 'id_33_width', 'id_34_encoded',
       'id_35_encoded', 'id_36_encoded', 'id_37_encoded', 'id_38_encoded',
       'DeviceType_encoded', 'DeviceInfo_map_encoded'],
      dtype='object')

# Save Encoder

In [14]:
import pickle

In [15]:
filename_encoder = './models/encoder_identity.pkl'

with open(filename_encoder, 'wb') as f:
    pickle.dump(encoder_dict, f)

# Load Encoder

In [18]:
with open(filename_encoder, 'rb') as f:
    encoder_dict = pickle.load(f)

# Read Dataset: test_identity.csv

In [19]:
test_identity = pd.read_csv('./datasets/test_identity.csv')

### Extract Features and Insert Back

In [20]:
feature_list = insert_features(extractors, test_identity)

Extract features: id_30
CPU times: user 332 ms, sys: 4 ms, total: 336 ms
Wall time: 338 ms
Extract features: id_30
CPU times: user 282 ms, sys: 0 ns, total: 282 ms
Wall time: 281 ms
Extract features: id_31
CPU times: user 16.7 s, sys: 12 ms, total: 16.7 s
Wall time: 16.7 s
Extract features: id_31
CPU times: user 16.7 s, sys: 22.7 ms, total: 16.7 s
Wall time: 16.7 s
Extract features: id_31
CPU times: user 16.6 s, sys: 16 ms, total: 16.6 s
Wall time: 16.6 s
Extract features: id_33
CPU times: user 60 ms, sys: 0 ns, total: 60 ms
Wall time: 60 ms
Extract features: id_33
CPU times: user 59.7 ms, sys: 0 ns, total: 59.7 ms
Wall time: 59.7 ms
Extract features: DeviceInfo
CPU times: user 3.12 s, sys: 0 ns, total: 3.12 s
Wall time: 3.12 s


### Drop Features

In [21]:
test_identity = test_identity.drop(columns=feature_list)

### Encode Identity Features

In [22]:
string_features_identity = get_string_features(test_identity)
%time encoders = encode_features(string_features_identity, test_identity, encoder_dict)

  0%|          | 0/18 [00:00<?, ?it/s]

Encode id_12


  6%|▌         | 1/18 [00:08<02:20,  8.28s/it]

Encode id_15


 11%|█         | 2/18 [00:16<02:13,  8.32s/it]

Encode id_16


 17%|█▋        | 3/18 [00:24<02:00,  8.03s/it]

Encode id_23


 22%|██▏       | 4/18 [00:24<01:20,  5.72s/it]

Encode id_27


 28%|██▊       | 5/18 [00:24<00:53,  4.10s/it]

Encode id_28


 33%|███▎      | 6/18 [00:32<01:03,  5.26s/it]

Encode id_29


 39%|███▉      | 7/18 [00:40<01:06,  6.07s/it]

Encode id_30_os_version


 44%|████▍     | 8/18 [00:54<01:23,  8.36s/it]

Encode id_30_os


 50%|█████     | 9/18 [01:00<01:08,  7.67s/it]

Encode id_31_annotation


 56%|█████▌    | 10/18 [01:06<00:57,  7.21s/it]

Encode id_31_developer


 61%|██████    | 11/18 [01:31<01:28, 12.65s/it]

Encode id_34


 67%|██████▋   | 12/18 [01:36<01:01, 10.27s/it]

Encode id_35


 72%|███████▏  | 13/18 [01:44<00:47,  9.55s/it]

Encode id_36


 78%|███████▊  | 14/18 [01:52<00:36,  9.04s/it]

Encode id_37


 83%|████████▎ | 15/18 [02:00<00:26,  8.69s/it]

Encode id_38


 89%|████████▉ | 16/18 [02:08<00:16,  8.44s/it]

Encode DeviceType


 94%|█████████▍| 17/18 [02:16<00:08,  8.32s/it]

Encode DeviceInfo_map


100%|██████████| 18/18 [03:46<00:00, 33.04s/it]

CPU times: user 3min 46s, sys: 136 ms, total: 3min 46s
Wall time: 3min 46s





In [24]:
encoders == encoder_dict

True