In [1]:
import numpy as np
import pandas as pd


import time
import xgboost as xgb
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
#from sklearn import pipeline, model_selection
from sklearn import pipeline, grid_search
#from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer

import re

import random
# Seeds Python's built-in `random` module only; numpy is not seeded here, and the
# estimators further down receive their own `random_state` values.
random.seed(2017)

# Let long text cells be displayed up to 1000 characters wide before truncation.
pd.set_option("display.max_colwidth", 1000)

In [None]:
#import data_load
#reload(data_load)
#data_load.clean_text()

In [None]:
# NOTE(review): `data_load` is not imported anywhere in the cells visible here (the
# import in the previous cell is commented out), so on a fresh kernel this line
# raises NameError unless the module is imported first.
products = data_load.get_clean_text()

In [None]:
# Size of each product's `attributes` container.
products['attributes_len'] = products['attributes'].map(len)

In [None]:
# Brand name read from the attribute mapping; products without the key get the literal 'none'.
products['brand'] = products['attributes'].map(
    lambda attrs: attrs.get('mfg brand name', 'none')
)
def b1g(s):
    """One-hot encode a brand name against a fixed list of seven brand values.

    Args:
        s: brand string to encode.

    Returns:
        A list of seven floats. The slot whose brand equals ``s`` is 1.0 and all
        others are 0.0; if ``s`` matches none of them every slot is 0.0.
        Slot order: 'none', 'unbranded', 'hampton bay', 'kohler', 'everbilt',
        'home decorators collection', 'ge'.
    """
    known_brands = (
        'none',
        'unbranded',
        'hampton bay',
        'kohler',
        'everbilt',
        'home decorators collection',
        'ge',
    )
    return [1.0 if s == brand else 0.0 for brand in known_brands]

# Expand the seven-slot brand encoding from b1g into named indicator columns and
# append them to `products` (column order follows b1g's slot order).
brand_flag_columns = [
    'brand_none',
    'brand_unbranded',
    'brand_hampton',
    'brand_kohler',
    'brand_ever',
    'brand_home',
    'brand_ge',
]
df_brand_features = products['brand'].apply(lambda brand: pd.Series(b1g(brand)))
df_brand_features.columns = brand_flag_columns
products = pd.concat([products, df_brand_features], axis=1)

In [None]:
def f1g(q):
    """Return 1.0 when the marker substring 'sxdli' occurs anywhere in ``q``, else 0.0."""
    return 1.0 if 'sxdli' in q else 0.0
# For every product, map each query key to its 'sxdli' marker flag (see f1g).
products['corrected'] = products['queries'].map(
    lambda queries: {key: f1g(text) for key, text in queries.items()}
)

def f1(q):
    """Drop the last six characters of ``q`` when it contains the marker 'sxdli'.

    Strings without the marker are returned unchanged.
    NOTE(review): the cut is a fixed six characters regardless of where the marker
    sits — presumably the marker is always a trailing ' sxdli'; confirm upstream.
    """
    if 'sxdli' not in q:
        return q
    return q[:-6]
# Overwrite the query texts with their marker-stripped form (see f1). This replaces the
# strings that the 'corrected' flags are computed from, so it has to run after that step.
products['queries'] = products['queries'].map(
    lambda queries: {key: f1(text) for key, text in queries.items()}
)

In [None]:
# Matches any ASCII digit. Despite the name, a match means the token DOES contain a digit.
no_number = re.compile(r"[0-9]")


def _word_count(text):
    """Number of whitespace-separated tokens in ``text``."""
    return len(text.split())


def _digitless_word_count(text):
    """Number of whitespace-separated tokens in ``text`` that contain no digit."""
    return len([token for token in text.split() if not no_number.search(token)])


def _per_query(measure):
    """Apply ``measure`` to every value of a query dict, flooring each result at 1."""
    return lambda kvs: {k: max(1, measure(v)) for k, v in kvs.items()}


# length in words
products['queries_len'] = products['queries'].map(_per_query(_word_count))
products['product_title_len'] = products['product_title'].map(_word_count)
products['product_description_len'] = products['product_description'].map(_word_count)
products['brand_len'] = products['brand'].map(_word_count)

# length in words, that have no digits
products['queries_wlen'] = products['queries'].map(_per_query(_digitless_word_count))
products['product_title_wlen'] = products['product_title'].map(_digitless_word_count)
products['product_description_wlen'] = products['product_description'].map(_digitless_word_count)
products['brand_wlen'] = products['brand'].map(_digitless_word_count)

# length in letters (characters)
products['queries_let'] = products['queries'].map(_per_query(len))
products['product_title_let'] = products['product_title'].map(len)
products['product_description_let'] = products['product_description'].map(len)
products['brand_let'] = products['brand'].map(len)

In [None]:
import re
# Matches any ASCII digit; used to tell "pure word" tokens from tokens with numbers.
has_digit = re.compile(r"[0-9]")


def str_common_word(str1, str2):
    """Measure how many tokens of ``str1`` occur as substrings of ``str2``.

    Only tokens longer than one character are considered, and matching is a plain
    substring test against the whole of ``str2`` (not token-to-token equality).

    Args:
        str1: query text; split on whitespace into tokens.
        str2: text to search in.

    Returns:
        A six-element list:
        [0] number of matching tokens (repeats counted),
        [1] that number divided by the token count of ``str1`` (0.0 for no tokens),
        [2] total length of the distinct matching tokens,
        [3] number of matching tokens that contain no digit (repeats counted),
        [4] total length of the distinct digit-free matching tokens,
        [5] position, counted from the end of ``str2``'s tokens, of the first token
            equal to the last token of ``str1``; stays at the default 15 when no
            digit-free token matched or no such token exists.
    """
    query_tokens = str1.split()
    hits = [tok for tok in query_tokens if len(tok) > 1 and tok in str2]
    word_hits = [tok for tok in hits if not has_digit.search(tok)]

    hit_chars = sum(len(tok) for tok in set(hits))
    word_hit_chars = sum(len(tok) for tok in set(word_hits))

    position_from_end = 15
    if word_hits:
        last_query_token = query_tokens[-1]
        for offset, token in enumerate(reversed(str2.split())):
            if token == last_query_token:
                position_from_end = offset
                break

    return [
        len(hits),
        len(hits) * 1.0 / max(1.0, len(query_tokens)),
        hit_chars,
        len(word_hits),
        word_hit_chars,
        position_from_end,
    ]

def str_whole_word(str1, str2, i_):
    """Count non-overlapping occurrences of ``str1`` in ``str2`` from index ``i_`` on.

    Despite the name, this is a plain substring search with no word-boundary check.

    Args:
        str1: substring to look for; an empty string yields ``[0, 0]``.
        str2: text to search in.
        i_: index in ``str2`` at which the search starts.

    Returns:
        ``[count, total_len]`` where ``total_len`` is ``count * len(str1)``.
    """
    needle_len = len(str1)
    if needle_len < 1:
        return [0, 0]
    # str.count scans left to right without overlaps, like a find-and-advance loop.
    occurrences = str2.count(str1, i_)
    return [occurrences, occurrences * needle_len]

def bigram_match(str1, str2):
    """Count adjacent token pairs of ``str1`` that also appear adjacent in ``str2``.

    Both strings are split on whitespace.

    Returns:
        ``[count, matched_len]``: ``count`` is the number of bigrams of ``str1``
        found in ``str2`` (repeats counted); ``matched_len`` is the summed length
        of both tokens over the distinct matching bigrams.
    """
    query_tokens = str1.split()
    text_tokens = str2.split()
    text_bigrams = set(zip(text_tokens, text_tokens[1:]))

    hits = [pair for pair in zip(query_tokens, query_tokens[1:]) if pair in text_bigrams]
    matched_len = sum(len(first) + len(second) for first, second in set(hits))
    return [len(hits), matched_len]

def f_query_in(r):
    """Build text-overlap features for every query of one product row.

    Each query is compared against four texts in this order: product title,
    product description, the attributes flattened to "key value key value ...",
    and the brand. For each text the outputs of str_whole_word (from index 0),
    str_common_word and bigram_match are concatenated in that order.

    Args:
        r: a product row supporting ``r[...]`` access to 'product_title',
           'product_description', 'attributes', 'brand' and 'queries'.

    Returns:
        dict mapping each key of ``r['queries']`` to its flat feature list.
    """
    title = r['product_title']
    description = r['product_description']
    attrs = ' '.join([k + ' ' + v for k, v in r['attributes'].items()])
    brand = r['brand']
    fields = [title, description, attrs, brand]

    res = {}
    for key, query in r['queries'].items():
        feats = []
        for text in fields:
            feats.extend(str_whole_word(query, text, 0))
            feats.extend(str_common_word(query, text))
            feats.extend(bigram_match(query, text))
        res[key] = feats
    return res

# Row-wise: store, per product, the dict {query key: overlap-feature list} built by f_query_in.
products['query_in_product_features'] = products.apply(f_query_in, axis=1)

In [None]:
from collections import defaultdict

# Per-query columns, each holding a dict keyed by query; merged in exactly this order.
query_features = ['corrected', 'queries_len', 'queries_let', 'queries_wlen', 'query_in_product_features']


def combine_feats(r):
    """Merge the per-query dicts of one row into a single feature list per query.

    For every column named in ``query_features`` (in order), the value stored for
    a query key is appended to that key's list: list values are spliced in,
    anything else is appended as one element.

    Args:
        r: row supporting ``r[column]`` for each name in ``query_features``.

    Returns:
        ``defaultdict(list)`` mapping query key -> concatenated feature list.
    """
    feats = defaultdict(list)
    for column in query_features:
        for key, value in r[column].items():
            feats[key].extend(value if type(value) == list else [value])
    return feats

# Row-wise merge of the per-query dict columns into one {query key: feature list} per product.
combined_query_features = products[query_features].apply(combine_feats, axis=1)

In [None]:
# Attach the merged per-query features, then remove the source columns they were built from.
# NOTE(review): the drop is in place, so this cell cannot be re-run without rebuilding
# `products` (the second run would not find the columns to drop).
products['combined_query_features'] = combined_query_features
products.drop(query_features, axis=1, inplace=True)

In [None]:
# Product-level columns (brand indicators and length statistics) that are copied
# unchanged into every exploded per-query row; the order here fixes their column order.
direct_features = [
"brand_none",
"brand_unbranded",
"brand_hampton",
"brand_kohler",
"brand_ever",
"brand_home",
"brand_ge",
"attributes_len",
"product_title_len",
"product_description_len",
"brand_len",
"product_title_let",
"product_description_let",
"brand_let",
"product_title_wlen",
"product_description_wlen",
"brand_wlen"]

In [None]:
#pd.to_pickle(products, 'RAW_FEATURES')
#products = pd.read_pickle('RAW_FEATURES')

In [None]:
# Sanity check: list the columns currently present on `products`.
products.columns

In [None]:
# Product-level identifier/text columns copied into every exploded row.
product_columns = ['product_uid', 'product_title', 'product_description', 'brand']


def explode(r, rs):
    """Append one flat row per query of product row ``r`` to the list ``rs``.

    Each appended row is laid out as:
    [query key, query text] + product_columns values + [flattened attributes]
    + direct_features values + that query's combined feature list.

    Args:
        r: product row supporting ``r[...]`` access for 'attributes', 'queries',
           'combined_query_features' and every name in ``product_columns`` and
           ``direct_features``.
        rs: list that receives the new rows (mutated in place).

    Returns:
        None.
    """
    attrs = ' '.join([k + ' ' + v for k, v in r['attributes'].items()])
    for query_id, query_text in r['queries'].items():
        row = [query_id, query_text]
        row.extend(r[column] for column in product_columns)
        row.append(attrs)
        row.extend(r[column] for column in direct_features)
        row.extend(r['combined_query_features'][query_id])
        rs.append(row)
        
# Flatten `products` (one row per product) into one row per (product, query) pair.
rows = []
# Use an explicit loop instead of DataFrame.apply: explode() works purely by side effect
# (appending to `rows`), and apply() makes no promise to call the function exactly once
# per row -- older pandas releases evaluate the first row twice, which would duplicate
# that product's queries.
# reset_index() is kept from the original call; presumably it exposes the index as a
# regular column that explode() reads -- verify against data_load.
for _, product_row in products.reset_index().iterrows():
    explode(product_row, rows)

df_combined_query_features = pd.DataFrame(rows)

# Name the trailing per-query feature columns c0..cN-1. N is derived from the actual row
# width (44 with the current feature set) instead of being hard-coded.
fixed_columns = ['id', 'query'] + product_columns + ['attrs'] + direct_features
n_query_features = df_combined_query_features.shape[1] - len(fixed_columns)
df_combined_query_features.columns = fixed_columns + ['c%d' % i for i in range(n_query_features)]

In [None]:
# Index the exploded rows by query id and persist them for the modelling cells below.
# set_index returns a new frame, so `df_combined_query_features` keeps its 'id' column
# and this cell can be re-run (the in-place variant failed on a second run because
# 'id' had already been moved into the index of the aliased frame).
queries = df_combined_query_features.set_index('id')
pd.to_pickle(queries, 'FEATURES_WITH_TEXT_1')

In [None]:
# Label-encode the brand: every distinct brand gets an integer code starting at 1,
# numbered in the order pd.unique returns them.
df_brand = pd.unique(products.brand.ravel())
d = {brand: code for code, brand in enumerate(df_brand, 1)}
products['brand_feature'] = products['brand'].map(lambda brand: d[brand])

In [None]:
# Preview only the first rows; rendering the whole frame (with a 1000-character column
# width) bloats the notebook output.
products.head()

In [None]:
# Load the train/test frames, split the target off the training frame, and remove the
# 'relevance' column from both so they no longer carry it.
idx_train = pd.read_pickle('LABELS_TRAIN.df')
idx_test = pd.read_pickle('LABELS_TEST.df')

# pop() returns the column and deletes it from idx_train in one step.
label_train = idx_train.pop('relevance')
del idx_test['relevance']

In [None]:
# Join the pickled per-query features onto the train/test frames by index.
queries = pd.read_pickle('FEATURES_WITH_TEXT_1')
df_train = idx_train.merge(queries, left_index=True, right_index=True)
df_test = idx_test.merge(queries, left_index=True, right_index=True)

# `label_train` was split off before this inner join. If the join drops, duplicates or
# reorders rows, features and labels silently stop lining up, so fail loudly instead.
assert df_train.index.equals(label_train.index), \
    "df_train rows are no longer aligned with label_train after the merge"
assert len(df_test) == len(idx_test), \
    "the merge changed the number of test rows"

In [None]:
from sklearn.ensemble import RandomForestRegressor
#from sklearn import pipeline, model_selection
from sklearn import pipeline, grid_search
#from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer


def fmean_squared_error(ground_truth, predictions):
    """Root mean squared error: square root of sklearn's ``mean_squared_error``."""
    return mean_squared_error(ground_truth, predictions) ** 0.5

def fmse(ground_truth, predictions):
    """Plain mean squared error (no square root); thin wrapper over ``mean_squared_error``."""
    mse = mean_squared_error(ground_truth, predictions)
    return mse

#RMSE  = make_scorer(fmse, greater_is_better=False)
# Scorer built on fmean_squared_error; greater_is_better=False marks it as a loss
# (lower is better) for scikit-learn's model selection.
RMSE  = make_scorer(fmean_squared_error, greater_is_better=False)

# Shared unigram tf-idf vectorizer and 10-component SVD. They are only referenced by the
# commented-out text branches of the pipeline further down.
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
tsvd = TruncatedSVD(n_components=10, random_state = 2016)
# Random forest used by the cross-validation loop later in the notebook.
randomForestRegressor = RandomForestRegressor(n_estimators = 500, min_samples_leaf=3, n_jobs = -1, random_state = 5017, verbose = 1)

In [None]:
#query	product_uid	product_title	product_description	brand	attrs
class cust_regression_vals(BaseEstimator, TransformerMixin):
    """Stateless transformer that strips the text/identifier columns from a frame.

    The remaining columns are returned as the frame's underlying ``.values`` array.
    """

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, hd_searches):
        """Drop the six text/identifier columns and return the rest as an array.

        Args:
            hd_searches: DataFrame containing at least the dropped columns.
        """
        text_columns = [
            'query', 'product_uid', 'product_title',
            'product_description', 'brand', 'attrs',
        ]
        return hd_searches.drop(text_columns, axis=1).values

class cust_txt_col(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects one column by key and converts it to strings."""

    def __init__(self, key):
        # Key (column name) looked up in the data passed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` with ``str`` applied to every element."""
        selected = data_dict[self.key]
        return selected.apply(str)

In [None]:
# Preview only the first rows; the frame holds long text columns and is costly to render whole.
df_train.head()

In [None]:
# Feature-extraction pipeline. Only the 'cst' branch (cust_regression_vals) is active;
# the tf-idf/SVD text branches and the final 'rfr' regressor step are commented out, so
# `clf` as defined here only transforms and contains no regressor.
clf = pipeline.Pipeline([
        ('union', FeatureUnion(
                    transformer_list = [
                        ('cst',  cust_regression_vals()),  
                    
#                         ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='query')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
#                         ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
#                         ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
#                         ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)])),
#                         ('txt5', pipeline.Pipeline([('s5', cust_txt_col(key='attrs')), ('tfidf5', tfidf), ('tsvd5', tsvd)]))
                    
#                         ('txt1', pipeline.Pipeline([ ('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf)  ])),
#                         ('txt2', pipeline.Pipeline([ ('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf)  ])),
#                         ('txt3', pipeline.Pipeline([ ('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf) ])),
#                         ('txt4', pipeline.Pipeline([ ('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf) ]))
                    
#                         ('brandf', pipeline.Pipeline([ ('s5', cust_txt_col(key='brand_feature')), ('ohenc', ohenc)  ])),
                        ],
                    # Per-branch scaling; only the active 'cst' branch has a weight.
                    transformer_weights = {
                        'cst': 1.0,
#                         'txt1': 0.5,
#                         'txt2': 0.25,
#                         'txt3': 0.5,
#                         'txt4': 0.5,
#                         'txt5': 0.5
#                         'brandf': 1.0
                        },
                n_jobs = -1
                ))
#         , 
#         ('rfr', randomForestRegressor)
    ])

#clf.set_params(rfr__max_features=10, rfr__max_depth=20)
#clf.fit(X_train, y_train)
# X_train

In [None]:
# Run the feature pipeline and persist the resulting matrices.
# `a` holds the training features and `b` the test features; later cells reuse both
# names (np.save appends the '.npy' extension that those cells load from).
a = clf.fit_transform(df_train)
np.save('FEATURES_1z_TRAIN', a)
b = clf.transform(df_test)
np.save('FEATURES_1z_TEST', b)

In [None]:
start_time = time.time()

# `clf` only contains the feature union (its 'rfr' step is commented out), so the
# `rfr__*` grid keys had no step to address and GridSearchCV rejected them. Wrap the
# feature pipeline and the random forest in one pipeline that does have an 'rfr' step.
rf_pipeline = pipeline.Pipeline([('features', clf), ('rfr', randomForestRegressor)])

param_grid = {'rfr__max_features': [2], 'rfr__max_depth': [30]}
model = grid_search.GridSearchCV(estimator = rf_pipeline, param_grid = param_grid, n_jobs = -1, cv = 5, verbose = 20, scoring=RMSE)
# Fit on the merged training frame and its labels. X_train / y_train are only created by
# later cells and are numpy arrays, which the DataFrame-based 'cst' transformer cannot drop
# columns from.
model.fit(df_train, label_train)

print("--- Training: %s minutes ---" % round(((time.time() - start_time)/60),2))

print("Best parameters found by grid search:")
print(model.best_params_)
# RMSE is registered with greater_is_better=False, so the reported score is the negated RMSE.
print("Best CV score:")
print(model.best_score_)

In [None]:
# Corpus for the shared vocabulary: every training title, then every description, then
# every flattened attribute string (queries, brands and the test set are not included).
ALL_TEXT = (
    list(df_train['product_title'])
    + list(df_train['product_description'])
    + list(df_train['attrs'])
)

# Unigram tf-idf over that corpus, ignoring terms that occur in fewer than 20 documents.
tfidf_common = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', min_df=20)
tfidf_common.fit(ALL_TEXT)

In [None]:
# Project each text column onto the shared tf-idf vocabulary.
# p1..p5 are the train matrices and p1t..p5t the test matrices, in this column order.
text_columns = ['query', 'product_title', 'product_description', 'brand', 'attrs']

p1, p2, p3, p4, p5 = [tfidf_common.transform(df_train[column]) for column in text_columns]

p1t, p2t, p3t, p4t, p5t = [tfidf_common.transform(df_test[column]) for column in text_columns]

In [None]:
from scipy import sparse
# Fit one 100-component SVD on the stacked description, title and attribute tf-idf rows
# of both train and test; the query and brand matrices are not part of the fit.
descrs = sparse.vstack([p3, p3t, p2, p2t, p5, p5t])
tsvd_common = TruncatedSVD(n_components=100, random_state = 2016)
tsvd_common.fit(descrs)

In [None]:
# Reduce every tf-idf matrix with the shared SVD.
# tpN corresponds to pN (train) and tpNt to pNt (test).
tp1, tp2, tp3, tp4, tp5 = [tsvd_common.transform(matrix) for matrix in (p1, p2, p3, p4, p5)]

tp1t, tp2t, tp3t, tp4t, tp5t = [tsvd_common.transform(matrix) for matrix in (p1t, p2t, p3t, p4t, p5t)]

In [None]:
# Final dense matrices: the five SVD blocks followed by every remaining column of the
# merged frame once the text/identifier columns are dropped.
dropped_text_columns = ['query', 'product_uid', 'product_title', 'product_description', 'brand', 'attrs']

train_rest = df_train.drop(dropped_text_columns, axis=1)
test_rest = df_test.drop(dropped_text_columns, axis=1)

TRAIN = np.hstack([tp1, tp2, tp3, tp4, tp5, train_rest])
TEST = np.hstack([tp1t, tp2t, tp3t, tp4t, tp5t, test_rest])

In [None]:
# Variant that uses the raw tf-idf matrices instead of their SVD projections.
# p1..p5 are scipy sparse matrices, which np.hstack cannot combine with a DataFrame (it
# raised before anything was assigned). Stack with scipy.sparse.hstack instead.
# The results get their own names so the dense TRAIN/TEST built from the SVD features,
# which the np.save cell below persists, are left untouched -- the same net effect the
# failing original had.
sparse_dropped_columns = ['query', 'product_uid', 'product_title', 'product_description', 'brand', 'attrs']

TRAIN_SPARSE = sparse.hstack(
    [p1, p2, p3, p4, p5,
     sparse.csr_matrix(df_train.drop(sparse_dropped_columns, axis=1).values.astype(float))],
    format='csr')
TEST_SPARSE = sparse.hstack(
    [p1t, p2t, p3t, p4t, p5t,
     sparse.csr_matrix(df_test.drop(sparse_dropped_columns, axis=1).values.astype(float))],
    format='csr')

In [None]:
# Column-wise stack of the five training tf-idf matrices. scipy.sparse.hstack is needed
# here: np.hstack treats each sparse matrix as an opaque object and does not join them.
sparse.hstack([p1, p2, p3, p4, p5])

In [None]:
# Debug check: shapes of the query/title tf-idf matrices and the merged frames, plus p1's rank.
p1.shape, p2.shape, df_train.shape, p1t.shape, p2t.shape, df_test.shape, p1.ndim

In [None]:
# Shapes of the assembled train/test feature matrices.
TRAIN.shape, TEST.shape

In [None]:
# Persist the assembled matrices (np.save appends the '.npy' extension).
np.save('FEATURES_1f_TRAIN', TRAIN)
np.save('FEATURES_1f_TEST', TEST)

In [None]:
# Spot-check a few labels by position.
label_train.iloc[[0,3,4,9]]

In [None]:
from sklearn.cross_validation import StratifiedKFold

# 3-fold cross-validation of the random forest on the `a` feature matrix, using the
# legacy sklearn.cross_validation API (labels are passed to the splitter's constructor
# and the splitter itself is iterated).
# NOTE: this rebinds the globals X_train, X_test and model, and the same
# randomForestRegressor instance is refitted on every fold.
skf = StratifiedKFold(label_train, n_folds=3, shuffle=True, random_state=117)
for train_index, test_index in skf:
    X_train, X_test = a[train_index], a[test_index]
    Y_train, Y_test = label_train.iloc[train_index], label_train.iloc[test_index]
    model = randomForestRegressor.fit(X_train, Y_train)
    yhat = model.predict(X_test)
    # fmse is the plain MSE (no square root), unlike the RMSE reported elsewhere.
    err = fmse(yhat, Y_test)
    print(err)

In [None]:
# Shape of the training feature matrix.
a.shape

In [None]:
import xgboost as xgb

# Wrap the in-memory feature matrices as xgboost DMatrix objects and cache them on disk.
dtrain = xgb.DMatrix(a, label=label_train)
dtrain.save_binary("train_f1z.buffer")

# The test matrix carries no labels.
dtest = xgb.DMatrix(b)
dtest.save_binary("test_f1z.buffer")

In [None]:
# Same export as the previous cell, but starting from the saved .npy feature files
# instead of the in-memory pipeline output.
a = np.load('FEATURES_1z_TRAIN.npy')
b = np.load('FEATURES_1z_TEST.npy')

dtrain = xgb.DMatrix(a, label=label_train)
dtrain.save_binary("train_f1z.buffer")

dtest = xgb.DMatrix(b)
dtest.save_binary("test_f1z.buffer")

In [None]:
# NOTE(review): the buffers loaded here are the 'f1d' files, while the arrays below (and
# the buffers written by the cells above) are 'f1z'. Nothing visible in this notebook
# writes the f1d files -- confirm the two feature sets are meant to be mixed.
dtrain = xgb.DMatrix("train_f1d.buffer")
dtest = xgb.DMatrix("test_f1d.buffer")
a = np.load('FEATURES_1z_TRAIN.npy')
b = np.load('FEATURES_1z_TEST.npy')

# Hold out 15% of the training rows; the labels are read back from the loaded DMatrix.
X_train, X_test, y_train, y_test = train_test_split(a, dtrain.get_label(), test_size=0.15, random_state=1513)
gX_train = xgb.DMatrix(data=X_train, label=y_train)
gX_test = xgb.DMatrix(data=X_test, label=y_test)
# Not passed to xgb.train in the cells below, which spell the list out inline.
evallist  = [(gX_train,'train'),(gX_test,'test')]

In [None]:
#  9 -> 778 0.4673
# 10 0.8 0.9 -> 587 0.4653    620 (10->15%) 0.4684
# 11 -> 630 0.4655
# 12 
# 13 -> 552 0.4654
# 15 -> 500 0.4666
# Booster settings for the validation run on the train/hold-out split.
param = {'max_depth':7, 
         'eta':0.05, # 'objective':'reg:linear',
         'eval_metric':'rmse', #'maximize': False,
         'colsample_bytree':0.8, #7
         'subsample':0.9,  #8
         'min_child_weight': 4.0,
         'nthread':16,
         'silent': True
        }
# Upper bound on boosting rounds; training stops early once the evaluation metric has
# not improved for 30 rounds. Progress is printed every 10 rounds.
num_round = 10000
bst = xgb.train( param, gX_train, num_round, [(gX_train,'train'),(gX_test,'test')], early_stopping_rounds=30, verbose_eval=10)

In [None]:
# Same booster settings as the early-stopped run above (duplicated verbatim).
param = {'max_depth':7, 
         'eta':0.05, # 'objective':'reg:linear',
         'eval_metric':'rmse', #'maximize': False,
         'colsample_bytree':0.8, #7
         'subsample':0.9,  #8
         'min_child_weight': 4.0,
         'nthread':16,
         'silent': True
        }
# Fixed number of rounds, no early stopping; trains on the full `dtrain`.
# NOTE(review): gX_test is only monitored here, and its rows presumably also belong to
# `dtrain`, so the printed 'test' RMSE would be optimistic -- confirm.
num_round = 800
bst = xgb.train( param, dtrain, num_round, [(gX_test,'test')], verbose_eval=20)

In [None]:
# Ridge-regression baseline on the current train/hold-out split.
# Note that this rebinds `clf`, which earlier named the feature pipeline.
clf = linear_model.Ridge(alpha=.1)
clf.fit(X_train, y_train)
# Clamp the predictions to the [1.0, 3.0] range before scoring.
y_hat = np.clip(clf.predict(X_test), 1.0, 3.0)
fmean_squared_error(y_hat, y_test)

In [None]:
# Test-set predictions, clamped to [1.0, 3.0], written next to the test index.
y_pred = bst.predict(dtest)
y_pred_bounded = np.clip(y_pred, 1.0, 3.0)
idx_test['relevance'] = y_pred_bounded
idx_test.to_csv('submission_xgboost_spells_0332_0335.csv')

# Clamped predictions on the training matrix, saved under the training index.
y_pred_t = bst.predict(dtrain)
y_pred_t_bounded = np.clip(y_pred_t, 1.0, 3.0)
train_predictions = pd.DataFrame({"relevance": y_pred_t_bounded}, index=idx_train.index)
train_predictions.to_csv('submission_xgboost_spells_0332_0335__train.csv')

In [None]:
############## COPY


In [None]:
# Load a previously saved feature set from good_feats/ together with its DMatrix buffers.
dtrain = xgb.DMatrix("good_feats/train.buffer")
dtest = xgb.DMatrix("good_feats/test.buffer")
# Overwritten a few lines below, before it is used.
evallist  = [(dtrain,'train')]

a = np.load('good_feats/train_data.npy')
b = np.load('good_feats/test_data.npy')
a_brand = np.load('good_feats/features_brand_01_train.npy')
b_brand = np.load('good_feats/features_brand_01_test.npy')
# NOTE(review): the 'FEATURES_1d_*' files are not written anywhere in the cells visible
# here -- presumably produced by another notebook; confirm.
a_other = np.load('FEATURES_1d_TRAIN.npy')
b_other = np.load('FEATURES_1d_TEST.npy')
# Column-wise concatenation of the three feature groups.
a = np.hstack((a, a_brand, a_other))
b = np.hstack((b, b_brand, b_other))

# Hold out 10% of the training rows; the labels come from the loaded training DMatrix.
X_train, X_test, y_train, y_test = train_test_split(a, dtrain.get_label(), test_size=0.10, random_state=147)
gX_train = xgb.DMatrix(data=X_train, label=y_train)
gX_test = xgb.DMatrix(data=X_test, label=y_test)
evallist  = [(gX_train,'train'),(gX_test,'test')]

def fmean_squared_error(ground_truth, predictions):
    """Root mean squared error: square root of sklearn's ``mean_squared_error``.

    This repeats a definition that already appears earlier in the notebook.
    """
    return mean_squared_error(ground_truth, predictions) ** 0.5

# Booster settings for the combined feature set: deeper trees and a smaller learning
# rate than the runs above.
param = {'max_depth':12, 
         'eta':0.01, # 'objective':'reg:linear',
         'eval_metric':'rmse', #'maximize': False,
         'colsample_bytree':0.8, #7
         'subsample':0.9,  #8
         'min_child_weight': 3.0,
         'nthread':16,
         'silent': True
        }
# Up to 5000 rounds, stopping once the evaluation metric has not improved for 15 rounds.
num_round = 5000
bst = xgb.train( param, gX_train, num_round, [(gX_train,'train'),(gX_test,'test')], early_stopping_rounds=15, verbose_eval=10)

In [None]:
# DMatrix objects over the full combined train/test feature arrays.
ggXtrain = xgb.DMatrix(data=a, label=dtrain.get_label())
ggXtest = xgb.DMatrix(data=b, label=dtest.get_label())

# NOTE(review): `ggXtrain` is built but not used -- the booster is trained on the 90%
# split `gX_train` again. Confirm whether a fit on the full training set was intended.
num_round = 500
bst = xgb.train( param, gX_train, num_round, [(gX_test,'test')], early_stopping_rounds=15, verbose_eval=10)

In [None]:
# Rebuild the full test DMatrix (same statement as in the previous cell).
ggXtest = xgb.DMatrix(data=b, label=dtest.get_label())

In [None]:
# Predict on the full combined test features.
yhat = bst.predict(ggXtest)

In [None]:
# Clamp the predictions to [1.0, 3.0] and write them out with the test index.
y_pred_bounded = np.clip(yhat, 1.0, 3.0)
idx_test['relevance'] = y_pred_bounded
idx_test.to_csv('submission_xgboost_3.csv')