# start

In [33]:
import gc
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import sys

###Add https://www.kaggle.com/anttip/wordbatch to your kernel Data Sources,
###until Kaggle admins fix the wordbatch pip package installation
###sys.path.insert(0, '../input/wordbatch/wordbatch/')
import wordbatch

from wordbatch.extractors import WordBag, WordHash
from wordbatch.models import FTRL, FM_FTRL

from nltk.corpus import stopwords
import re

NUM_BRANDS = 4500
NUM_CATEGORIES = 1200

# develop = False
develop = True

## basic funcs 

In [34]:
def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y0), 2)))


def split_cat(text):
    try:
        return text.split("/")
    except:
        return ("No Label", "No Label", "No Label")


def handle_missing_inplace(dataset):
    dataset['general_cat'].fillna(value='missing', inplace=True)
    dataset['subcat_1'].fillna(value='missing', inplace=True)
    dataset['subcat_2'].fillna(value='missing', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    dataset['item_description'].fillna(value='missing', inplace=True)


def cutting(dataset):
    pop_brand = dataset['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
    pop_category1 = dataset['general_cat'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_CATEGORIES]
    pop_category2 = dataset['subcat_1'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_CATEGORIES]
    pop_category3 = dataset['subcat_2'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_CATEGORIES]
    dataset.loc[~dataset['general_cat'].isin(pop_category1), 'general_cat'] = 'missing'
    dataset.loc[~dataset['subcat_1'].isin(pop_category2), 'subcat_1'] = 'missing'
    dataset.loc[~dataset['subcat_2'].isin(pop_category3), 'subcat_2'] = 'missing'


def to_categorical(dataset):
    dataset['general_cat'] = dataset['general_cat'].astype('category')
    dataset['subcat_1'] = dataset['subcat_1'].astype('category')
    dataset['subcat_2'] = dataset['subcat_2'].astype('category')
    dataset['item_condition_id'] = dataset['item_condition_id'].astype('category')


# Define helpers for text normalization
stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')


def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])

In [35]:
normalize_text("neon pink and clear tech 21 iphone 6 case! used for about a month, but shows no major signs of wear! **i have two of these exact ones available if interested** i also have two purple ones, if interested also just let me know!")

u'neon pink clear tech 21 iphone case used month shows major signs wear two exact ones available interested also two purple ones interested also let know'

## features extract 

### basic fearture 

In [36]:
start_time = time.time()
from time import gmtime, strftime
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# if 1 == 1:
###train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
###test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

train = pd.read_table('../input/train.tsv', engine='c')
test = pd.read_table('../input/test.tsv', engine='c')

print('[{}] Finished to load data'.format(time.time() - start_time))
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
nrow_test = train.shape[0]  # -dftt.shape[0]
dftt = train[(train.price < 1.0)]
train = train.drop(train[(train.price < 1.0)].index)
del dftt['price']
nrow_train = train.shape[0]
# print(nrow_train, nrow_test)
y = np.log1p(train["price"])
merge = pd.concat([train, dftt, test])
submission = test[['test_id']]

del train
del test
gc.collect()

merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
    zip(*merge['category_name'].apply(lambda x: split_cat(x)))
merge.drop('category_name', axis=1, inplace=True)
print('[{}] Split categories completed.'.format(time.time() - start_time))

handle_missing_inplace(merge)
print('[{}] Handle missing completed.'.format(time.time() - start_time))

cutting(merge)
print('[{}] Cut completed.'.format(time.time() - start_time))

to_categorical(merge)
print('[{}] Convert categorical completed'.format(time.time() - start_time))

2018-02-14 14:56:53
[7.79165887833] Finished to load data
('Train shape: ', (1482535, 8))
('Test shape: ', (693359, 7))
[15.6914129257] Split categories completed.
[16.5526938438] Handle missing completed.
[19.1612000465] Cut completed.
[20.6273288727] Convert categorical completed


In [37]:
merge.head()

Unnamed: 0,brand_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2
0,missing,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,Tops,T-shirts
1,Razer,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,Computers & Tablets,Components & Parts
2,Target,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,Tops & Blouses,Blouse
3,missing,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,Home,Home Décor,Home Décor Accents
4,missing,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,Women,Jewelry,Necklaces


In [None]:
merge

### nlp feature

In [45]:
tfidf = TfidfVectorizer( stop_words='english')
t="""Two Travellers, walking in the noonday sun, sought the shade of a widespreading tree to rest. As they lay looking up among the pleasant leaves, they saw that it was a Plane Tree.

"How useless is the Plane!" said one of them. "It bears no fruit whatever, and only serves to litter the ground with leaves."

"Ungrateful creatures!" said a voice from the Plane Tree. "You lie here in my cooling shade, and yet you say I am useless! Thus ungratefully, O Jupiter, do men receive their blessings!"

Our best blessings are often the least appreciated."""

tfs = tfidf.fit_transform(t.split(" "))
str = 'tree cat travellers fruit jupiter'
response = tfidf.transform([str])
feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print feature_names[col], ' - ', response[0, col]
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]

tree  -  0.44350971281100476
travellers  -  0.5174614751013837
jupiter  -  0.5174614751013837
fruit  -  0.5174614751013837


NameError: name 'feature_array' is not defined

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit_transform(merge['item_description'])

<2175894x197497 sparse matrix of type '<type 'numpy.float64'>'
	with 31331442 stored elements in Compressed Sparse Row format>

In [5]:
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
wb.dictionary_freeze= True
X_name = wb.fit_transform(merge['name'])
del(wb)
X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

wb = CountVectorizer()
X_category1 = wb.fit_transform(merge['general_cat'])
X_category2 = wb.fit_transform(merge['subcat_1'])
X_category3 = wb.fit_transform(merge['subcat_2'])
print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))




Normalize text
Extract wordbags
[178.317242146] Vectorize `name` completed.
[212.42111516] Count vectorize `categories` completed.


In [6]:
# wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
# wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0,1.0],
#                                                               "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
#                                                               "idf": 50})
#                          , procs=8)
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                                  "idf": None})
                         , procs=8)
wb.dictionary_freeze= True
X_description = wb.fit_transform(merge['item_description'])
del(wb)
X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 3), dtype=bool)]
print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                      sparse=True).values)
print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
      X_name.shape)

Normalize text
Extract wordbags
[429.900854111] Vectorize `item_description` completed.
[579.087362051] Label binarize `brand_name` completed.
[584.397417068] Get dummies on `item_condition_id` and `shipping` completed.
((2175894, 6), (2175894, 2040339), (2175894, 4501), (2175894, 14), (2175894, 143), (2175894, 977), (2175894, 518467))


In [95]:
# wb_one = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0,1.0,1.0],
#                                                               "hash_size": 2 ** 28, "norm": None, "tf": 'binary',
#                                                               "idf": None,
#                                                               }), procs=8)
wb_one = wordbatch.WordBatch(normalize_text, extractor=(WordHash, {"decode_error":'ignore', "n_features":2 ** 25,
                                             "non_negative":False, "ngram_range":(1,3), "norm":'l2'}), procs=8)

wb_one.dictionary_freeze= True
X_name_one = wb_one.fit_transform(merge['name'])
print X_name_one.shape
X_name_one = X_name_one[:, np.array(np.clip(X_name_one.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print X_name_one.shape
print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
X_description_one = wb_one.fit_transform(merge['item_description'])
del(wb_one)
print X_description_one.shape
X_description_one = X_description_one[:, np.array(np.clip(X_description_one.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print X_description_one.shape

Normalize text
Extract wordhashes
(2175894, 33554432)
(2175894, 1058234)
[45826.9948261] Vectorize `name` completed.
Normalize text
Extract wordhashes
(2175894, 33554432)
(2175894, 6509517)


In [96]:
lb = LabelBinarizer(sparse_output=True)
X_des_length = lb.fit_transform(merge['item_description'].apply(lambda x:len(normalize_text(x))/30))
print('[{}] Label binarize `X_des_length` completed.'.format(time.time() - start_time))

[45982.333497] Label binarize `X_des_length` completed.


In [97]:
X_name_one

<2175894x1058234 sparse matrix of type '<type 'numpy.float64'>'
	with 17154208 stored elements in Compressed Sparse Row format>

In [98]:
sparse_merge = hstack((X_dummies, 
                       X_description,
                       X_brand, X_category1, X_category2,
                       X_category3, 
                       X_name,
                       X_name_one,
                       X_description_one,
                       X_des_length
                      )).tocsr()

print('[{}] Create sparse merge completed'.format(time.time() - start_time))

#    pd.to_pickle((sparse_merge, y), "xy.pkl")
# else:
#    nrow_train, nrow_test= 1481661, 1482535
#    sparse_merge, y = pd.read_pickle("xy.pkl")

# Remove features with document frequency <=1
print(sparse_merge.shape)
mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 3), dtype=bool)
sparse_merge = sparse_merge[:, mask]
print('[{}] Create sparse merge completed'.format(time.time() - start_time))

[46015.333951] Create sparse merge completed
(2175894, 10132231)
[46115.6291699] Create sparse merge completed


In [99]:
#one word for name ;one word for item desc

In [100]:
type(sparse_merge)

scipy.sparse.csr.csr_matrix

### cv build

In [101]:
print(sparse_merge.shape)
mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
sparse_merge = sparse_merge[:, mask]
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)

gc.collect()
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, train_size =0.1,test_size=0.025, random_state=100)
    print train_X.shape,valid_X.shape,train_y.shape,valid_y.shape

(2175894, 10131715)
(2175894, 10131715)
(148166, 10131715) (37042, 10131715) (148166,) (37042,)


### ftrl

In [None]:

model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=30, inv_link="identity", threads=1)

model.fit(train_X, train_y)
print('[{}] Train FTRL completed'.format(time.time() - start_time))
if develop:
    preds = model.predict(X=valid_X)
    print("FTRL dev RMSLE:",rmsle(np.expm1(valid_y), np.expm1(preds)))

predsF = model.predict(X_test)
print('[{}] Predict FTRL completed'.format(time.time() - start_time))

[46256.897579] Train FTRL completed
('FTRL dev RMSLE:', 0.47863714593804524)
[46260.970403] Predict FTRL completed


### fm_ftrl

In [None]:
model = FM_FTRL(alpha=0.03, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4)

model.fit(train_X, train_y)
print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
if develop:
    preds = model.predict(X=valid_X)
    print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

predsFM = model.predict(X_test)
print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

### lgbm

In [14]:
params = {
        'learning_rate': 0.3,
        'application': 'regression',
        'max_depth': 10,
        'num_leaves': 30,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
#         'bagging_fraction': 0.8,
#         'bagging_freq': 5,
        'feature_fraction': 0.9,
        'nthread': 4,
        'min_data_in_leaf': 20,
#         'max_bin': 10000
    }

# # Remove features with document frequency <=100
# print(sparse_merge.shape)
# mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool)
# sparse_merge = sparse_merge[:, mask]
# X = sparse_merge[:nrow_train]
# X_test = sparse_merge[nrow_test:]
# print(sparse_merge.shape)

# train_X, train_y = X, y
# if develop:
#     train_X, valid_X, train_y, valid_y = train_test_split(X, y,train_size=0.2,test_size=0.05, random_state=100)

d_train = lgb.Dataset(train_X, label=train_y)
watchlist = [d_train]
if develop:
    d_valid = lgb.Dataset(valid_X, label=valid_y)
    watchlist = [d_train, d_valid]

model = lgb.train(params, train_set=d_train, num_boost_round=500, valid_sets=watchlist, \
                  early_stopping_rounds=100, verbose_eval=30)

if develop:
    preds = model.predict(valid_X)
    print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

predsL = model.predict(X_test)

print('[{}] Predict LGB completed.'.format(time.time() - start_time))

Training until validation scores don't improve for 100 rounds.
[30]	training's rmse: 0.55081	valid_1's rmse: 0.558343
[60]	training's rmse: 0.520466	valid_1's rmse: 0.533792
[90]	training's rmse: 0.503777	valid_1's rmse: 0.522356
[120]	training's rmse: 0.492175	valid_1's rmse: 0.515612
[150]	training's rmse: 0.483006	valid_1's rmse: 0.510733
[180]	training's rmse: 0.474618	valid_1's rmse: 0.506801
[210]	training's rmse: 0.468332	valid_1's rmse: 0.50381
[240]	training's rmse: 0.462052	valid_1's rmse: 0.501793
[270]	training's rmse: 0.456878	valid_1's rmse: 0.500006
[300]	training's rmse: 0.451802	valid_1's rmse: 0.498694
[330]	training's rmse: 0.446665	valid_1's rmse: 0.496706
[360]	training's rmse: 0.442946	valid_1's rmse: 0.495952
[390]	training's rmse: 0.438668	valid_1's rmse: 0.494638
[420]	training's rmse: 0.435109	valid_1's rmse: 0.493974
[450]	training's rmse: 0.432134	valid_1's rmse: 0.493154
[480]	training's rmse: 0.429051	valid_1's rmse: 0.492556
('LGB dev RMSLE:', 0.492130622

In [None]:
0.4560898110533874

## submit

In [None]:
preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

submission['price'] = np.expm1(preds)
submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)