In [10]:
__author__ = 'Evgeny BAZAROV'

from math import radians

import numpy as np
import pandas as pd

# Show up to 99 columns when a wide frame is displayed.
pd.set_option("display.max_columns", 99)

# Directory holding the input HDF files. Paths below are built as
# FOLDER + "<file name>", so the trailing slash is required.
FOLDER = "/home/evgeny/kaggle/input/"

def haversine(pddata, lon1='', lat1='', lon2='', lat2=''):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees), row by row.

    Parameters
    ----------
    pddata : pandas.DataFrame
        Frame holding the four coordinate columns.
    lon1, lat1 : str
        Names of the longitude / latitude columns of the first point.
    lon2, lat2 : str
        Names of the longitude / latitude columns of the second point.

    Returns
    -------
    numpy.ndarray
        Distance in kilometers for each row, in the row order of ``pddata``.
    """
    # convert decimal degrees to radians (vectorized instead of a per-row
    # Python call through Series.apply)
    lon1 = np.radians(pddata[lon1].astype(float).values)
    lat1 = np.radians(pddata[lat1].astype(float).values)
    lon2 = np.radians(pddata[lon2].astype(float).values)
    lat2 = np.radians(pddata[lat2].astype(float).values)

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would turn arcsin(sqrt(a)) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [2]:
print("Loading...")
print("----- LOAD train_merged-part1")
pdtrain1 = pd.read_hdf(FOLDER + "train_merged-part1.h")
print("----- LOAD train_merged-part2")
pdtrain2 = pd.read_hdf(FOLDER + "train_merged-part2.h")
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent. Row labels are kept as loaded (no ignore_index),
# which is what append did.
pdtrain = pd.concat([pdtrain1, pdtrain2])
print("----- LOAD test_merged")
pdtest = pd.read_hdf(FOLDER + "test_merged.h")
# Stack train and test rows into one frame so features are built in one pass.
# Columns present in only one of the two frames are kept (outer join) and
# hold NaN for the rows of the other frame.
pd_data = pd.concat([pdtrain, pdtest])

# The two partial train frames are no longer needed once concatenated.
del pdtrain1
del pdtrain2

print("DONE")

In [5]:
# Show the column labels of the combined train + test frame.
pd_data.columns

Index(['attrsJSON_1', 'attrsJSON_2', 'categoryID_1', 'categoryID_2',
       'description_1', 'description_2', 'generationMethod', 'id',
       'images_array_1', 'images_array_2', 'isDuplicate', 'itemID_1',
       'itemID_2', 'lat_1', 'lat_2', 'locationID_1', 'locationID_2', 'lon_1',
       'lon_2', 'metroID_1', 'metroID_2', 'parentCategoryID_1',
       'parentCategoryID_2', 'price_1', 'price_2', 'regionID_1', 'regionID_2',
       'title_1', 'title_2'],
      dtype='object')

id                         0
itemID_1                   0
itemID_2                   0
categoryID_1               0
title_1                    2
description_1             25
images_array_1        117733
attrsJSON_1            38490
price_1               129371
locationID_1               0
metroID_1             740808
lat_1                      0
lon_1                      0
parentCategoryID_1         0
regionID_1                 0
categoryID_2               0
title_2                    0
description_2             24
images_array_2        117974
attrsJSON_2            38490
price_2               129700
locationID_2               0
metroID_2             740115
lat_2                      0
lon_2                      0
parentCategoryID_2         0
regionID_2                 0
dtype: int64

In [3]:
print("Replacing NaN")
# Columns whose missing values are replaced by the empty string, so that the
# len() / split() calls in the feature cell work on every row.
TEXT_COLUMNS = [
    'title_1', 'title_2',
    'description_1', 'description_2',
    'attrsJSON_1', 'attrsJSON_2',
    'images_array_1', 'images_array_2',
]
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# `pd_data[column]` acts on an intermediate object and is not guaranteed to
# update pd_data (it does not under pandas Copy-on-Write).
for column in TEXT_COLUMNS:
    pd_data[column] = pd_data[column].fillna("")

In [4]:
# feature dataframe
# Starts empty; the feature columns are assigned to it in the following cell.
pd_features = pd.DataFrame()

In [6]:
def _count_images(images_array):
    """Return the number of comma-separated entries in ``images_array``.

    The empty string (used as the fill value for items without images)
    counts as zero; ``"".split(',')`` on its own would report one entry.
    """
    return len(images_array.split(',')) if images_array else 0


# NOTE(review): the precomputed feature files below are named feat_train_* but
# are assigned to the combined train + test frame; the assignment aligns on the
# index -- confirm the files cover the test rows as well.

# titles
pd_features['title_diff_len'] = np.abs(pd_data['title_2'].apply(len) - pd_data['title_1'].apply(len))
pd_features['title_dlevenshtein'] = pd.read_hdf(FOLDER + "feat_train_title_demarauleven.h")
pd_features['title_jarowinkler'] = pd.read_hdf(FOLDER + "feat_train_title_jarowinkler.h")
pd_features['title_num_same'] = pd.read_hdf(FOLDER + "feat_train_title_num_same.h")

# description
pd_features['description_diff_len'] = np.abs(pd_data['description_2'].apply(len) - pd_data['description_1'].apply(len))
pd_features['description_dlevenshtein'] = pd.read_hdf(FOLDER + "feat_train_description_demarauleven.h")
pd_features['description_jarowinkler'] = pd.read_hdf(FOLDER + "feat_train_description_jarowinkler.h")

# price
# Relative price gap: |p1 - p2| / (|p1| + |p2|).
price_gap = np.abs(pd_data['price_1'] - pd_data['price_2'])
price_scale = np.abs(pd_data['price_1']) + np.abs(pd_data['price_2'])
# NaN results are replaced by the sentinel -9999. The fill is applied to the
# computed Series before assignment; a chained
# `pd_features['price_diff'].fillna(..., inplace=True)` is not guaranteed to
# write through to the frame.
# NOTE(review): two prices of exactly 0 give 0/0 = NaN and therefore also get
# the sentinel, same as a missing price -- confirm this is intended.
pd_features['price_diff'] = (price_gap * 1. / price_scale).fillna(-9999)

# attrJson
pd_features['attrsJSON_diff_len'] = np.abs(pd_data['attrsJSON_1'].apply(len) - pd_data['attrsJSON_2'].apply(len))
pd_features['attrsJSON_dlevenshtein'] = pd.read_hdf(FOLDER + "feat_train_attrsJSON_demarauleven.h")
pd_features['attrsJSON_jarowinkler'] = pd.read_hdf(FOLDER + "feat_train_attrsJSON_jarowinkler.h")

# images_array
# Absolute difference in the number of images; an empty images string counts
# as zero images.
pd_features['images_diff_number'] = np.abs(
    pd_data['images_array_1'].apply(_count_images) - pd_data['images_array_2'].apply(_count_images)
)

# geographic features
# 1 when both ads share the same ID, else 0. NaN never compares equal, so
# rows where an ID is missing on either side get 0.
pd_features['metroID_same'] = 1 * (pd_data['metroID_1'] == pd_data['metroID_2'])
pd_features['locationID_same'] = 1 * (pd_data['locationID_1'] == pd_data['locationID_2'])
pd_features['regionID_same'] = 1 * (pd_data['regionID_1'] == pd_data['regionID_2'])
pd_features['haversine'] = haversine(pd_data, lon1='lon_1', lat1='lat_1', lon2='lon_2', lat2='lat_2')
# Standardize the distance (zero mean, unit std) over all rows of pd_data.
pd_features['haversine'] = (pd_features['haversine'] - pd_features['haversine'].mean()) / pd_features['haversine'].std()

pd_features['categoryID_same'] = 1 * (pd_data['categoryID_1'] == pd_data['categoryID_2'])
#pd_features['parentCategoryID_same'] = 1 * (pd_data['parentCategoryID_1'] == pd_data['parentCategoryID_2'])

# Carry over the target and the row id unchanged.
pd_features['isDuplicate'] = pd_data['isDuplicate']
pd_features['id'] = pd_data['id']

In [7]:
# Preview the first rows of the assembled feature frame.
pd_features.head()

Unnamed: 0,title_diff_len,title_dlevenshtein,title_jarowinkler,title_num_same,description_diff_len,description_dlevenshtein,description_jarowinkler,price_diff,attrsJSON_diff_len,attrsJSON_dlevenshtein,attrsJSON_jarowinkler,images_diff_number,metroID_same,locationID_same,regionID_same,haversine,categoryID_same,isDuplicate,id
0,0,0.0,1.0,1,0,0.0,1.0,0.0,0,0.0,1.0,1,0,1,1,-0.205083,1,1.0,
1,13,0.666667,0.396825,1,116,0.666667,0.396825,0.2,0,0.666667,0.396825,1,0,1,1,-0.205083,1,0.0,
2,15,0.695652,0.3907,1,11,0.695652,0.3907,0.090909,0,0.695652,0.3907,1,0,1,1,-0.205083,1,0.0,
3,0,0.0,1.0,1,204,0.0,1.0,0.068826,0,0.0,1.0,5,0,1,1,-0.205083,1,1.0,
4,0,0.0,1.0,1,91,0.0,1.0,0.068826,717,0.0,1.0,6,0,1,1,-0.205083,1,1.0,


In [17]:
# (rows, columns) of the feature frame.
pd_features.shape

(4035592, 19)

In [18]:
# The second argument of to_hdf is the HDF5 *key* (the name of the dataset
# inside the file), not the file mode, so the frame is stored under key 'w'.
# Pass it by keyword: positional use is deprecated in recent pandas and reads
# like a mode flag. The key is kept as 'w' so existing readers still work.
pd_features.to_hdf("D1_20may.p", key='w')