In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
#from sklearn import pipeline, model_selection
from sklearn import pipeline, grid_search
#from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer

import re
import random
random.seed(2016)

In [2]:
LOC = '/Users/rbekbolatov/data/kaggle/homedepot/'
df_train = pd.read_csv(LOC + 'train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(LOC + 'test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(LOC + 'product_descriptions.csv')
df_attr = pd.read_csv(LOC + 'attributes.csv')
df_matches = pd.read_csv(LOC + 'matched_strings_clean.csv').fillna("")

df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"}).fillna("")
num_train = df_train.shape[0]
# (74067, 5), (166693, 4) -> df_train.shape, df_test.shape

In [3]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) # (240760, 5)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')
df_all = pd.merge(df_all, df_matches, on='id')

In [101]:
df_attr_len = df_attr.groupby('product_uid', as_index=False)['name'].agg({'attr_count':(lambda x: len(list(x)))})

In [102]:
df_all = pd.merge(df_all, df_attr_len, how='left', on='product_uid')

In [104]:
df_all['attr_count'] = df_all['attr_count'].fillna(0)

In [103]:
#df_all.drop('list', axis=1, inplace=True)
df_all.columns

Index([                           u'id',                 u'product_title',
                         u'product_uid',                     u'relevance',
                         u'search_term',           u'product_description',
                               u'brand',                           u'tit',
                                u'tit2',                          u'desc',
                               u'desc2',                    u'attributes',
                            u'mfgbrand',                     u'mfgbrand2',
                        u'product_info',                          u'attr',
                        u'len_of_query',                  u'len_of_title',
                  u'len_of_description',                  u'len_of_brand',
                       u'letters_query',                 u'letters_title',
                        u'letters_desc',                 u'letters_brand',
                      u'query_in_title',          u'query_in_description',
                      u'q

In [5]:
pattern_camel = re.compile(r"([a-z]+)([0-9A]|([A-Z][^ ]+))")
pattern_lcase_number = re.compile(r"([a-z])([0-9])")
pattern_digit_lcase = re.compile(r"([0-9])([a-z])")
pattern_s = re.compile(r"([a-z])'s")
pattern_number_commas = re.compile(r"([0-9]),([0-9])")

    
# 4x2
XBY = "xby"
pattern_xby_d = re.compile(r"(x[0-9])")
pattern_d_xby = re.compile(r"([0-9])x")

# units
pattern_inch = re.compile(r"([0-9])( *)(inches|inch|in|')\.?")
pattern_foot = re.compile(r"([0-9])( *)(foot|feet|ft|''|\")\.?")
pattern_pound = re.compile(r"([0-9])( *)(pounds|pound|lbs|lb)\.?")
pattern_sqft = re.compile(r"([0-9])( *)(square|sq) ?\.?(feet|foot|ft)\.?")
pattern_gallons = re.compile(r"([0-9])( *)(gallons|gallon|gal)\.?")
pattern_oz = re.compile(r"([0-9])( *)(ounces|ounce|oz)\.?")
pattern_cm = re.compile(r"([0-9])( *)(centimeters|cm)\.?")
pattern_mm = re.compile(r"([0-9])( *)(milimeters|mm)\.?")
pattern_deg = re.compile(r"([0-9])( *)(degrees|degree)\.?")
pattern_volt = re.compile(r"([0-9])( *)(volts|volt)\.?")
pattern_watt = re.compile(r"([0-9])( *)(watts|watt)\.?")
pattern_amp = re.compile(r"([0-9])( *)(amperes|ampere|amps|amp)\.?")
pattern_kamp = re.compile(r"([0-9])( *)(kiloamperes|kiloampere|kamps|kamp|ka)\.?")

# split
pattern_split = re.compile('[^0-9a-z]')

known_words = set(["the", "a", "an",
    "this", "that", "which", "whose",
    "other", "and", "or",
    "be", "is", "are", "been",
    "have", "has", "had",
    "can", "could", "will", "would",
    "go", "gone", "see", "seen",
    "all", "some", "any", "most", "several", "no", "none", "nothing",
    "as", "of", "in", "on", "at", "over", "from", "to",
    "with", "through", "for", "when", "then",
    "new", "old",
    "you", "your", "yours", "me", "i", "my", "mine", "it", "its"])

def str_stem(s): 
    if isinstance(s, str) or isinstance(s, unicode):
        
        s = pattern_camel.sub(r"\1 \2", s)
        s = pattern_lcase_number.sub(r"\1 \2", s)
        s = pattern_digit_lcase.sub(r"\1 \2", s)
        s = pattern_number_commas.sub(r"\1\2", s)
        s = pattern_s.sub(r"\1", s)
        
        
        s = s.lower().strip()
        
        # 4ft x 2ft
        s = s.replace(" x "," " + XBY + " ")
        s = s.replace("*"," " + XBY + " ")        
        s = s.replace(" by "," " + XBY)
        s = pattern_xby_d.sub(" " + XBY + " \1", s)
        s = pattern_d_xby.sub("\1 " + XBY + " ", s)
        
        # units
        s = pattern_inch.sub(r"\1 inch ", s)
        s = pattern_foot.sub(r"\1 foot ", s)
        s = pattern_pound.sub(r"\1 pound ", s)
        s = pattern_sqft.sub(r"\1 sqft ", s)
        s = pattern_gallons.sub(r"\1 gal ", s)
        s = pattern_oz.sub(r"\1 oz ", s)
        s = pattern_cm.sub(r"\1 cm ", s)
        s = pattern_mm.sub(r"\1 mm ", s)
        s = pattern_deg.sub(r"\1 deg ", s)
        s = pattern_volt.sub(r"\1 volt ", s)
        s = pattern_watt.sub(r"\1 watt ", s)
        s = pattern_amp.sub(r"\1 amp ", s)
        s = pattern_kamp.sub(r"\1 kamp ", s)
        
        # some by hand
        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool")
        s = s.replace("whirlpoolstainless","whirlpool stainless")
        s = s.replace("pressure-treated","pressure-treated pt")
        
        s = ' '.join([x for x in pattern_split.split(s) if x and x not in known_words])
        return s
    else:
        #raise ValueError("Type of " + str(s) + " is " + str(type(s)))
        #print "HUY"
        return 'null'
    
df_all['search_term'] = df_all['search_term'].map(lambda x:str_stem(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stem(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stem(x))
df_all['brand'] = df_all['brand'].map(lambda x:str_stem(x))

In [6]:
def str_common_word(str1, str2):
    words, cnt = str1.split(), 0
    for word in words:
        if str2.find(word)>=0:
            cnt+=1
    return cnt

def str_whole_word(str1, str2, i_):
    cnt = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return cnt
        else:
            cnt += 1
            i_ += len(str1)
    return cnt

In [25]:
# id	product_title	product_uid	relevance	search_term	product_description	brand
# id, relevance, search_term, product_title, product_description, (product_uid,) brand  [product_info, attr]
class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, hd_searches):
        d_col_drops=['id','relevance','search_term','product_title','product_description','product_info','attr','brand'] + \
        ['tit', 'tit2', 'desc', 'desc2', 'attributes', 'mfgbrand', 'mfgbrand2'] + \
        ['brand_feature'] #['ratio_brand']
        #[] #['ratio_title', 'ratio_description', 'ratio_brand']
        hd_searches = hd_searches.drop(d_col_drops,axis=1).values
        return hd_searches

class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key].apply(str)

def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

def fmse(ground_truth, predictions):
    return mean_squared_error(ground_truth, predictions)

#RMSE  = make_scorer(fmse, greater_is_better=False)
RMSE  = make_scorer(fmean_squared_error, greater_is_better=False)

In [None]:
len([x for x in "a,s4".split(",") if x and not re.findall(r'[0-9]', x)])
# df_all['query_in_title'] = df_all['tit'].map(lambda x: len([x for x in "".split(",") if x]))
# df_all['query_in_description'] = df_all['desc'].map(lambda x: len([x for x in "".split(",") if x]))
# df_all['query_in_attrs'] = df_all['attributes'].map(lambda x: len([x for x in "".split(",") if x]))
# df_all['query_in_brand'] = df_all['mfgbrand'].map(lambda x: len([x for x in "".split(",") if x]))
if re.match(r'^[0-9]+$', "srs94343"):
    print 1

In [63]:
pd.set_option("display.max_colwidth", 200)

In [83]:
df_attr.groupby('product_uid', as_index=False)['name'].agg({'list':(lambda x: len(list(x)))})

Unnamed: 0,product_uid,list
0,100001,15
1,100002,35
2,100003,32
3,100004,25
4,100005,25
5,100006,48
6,100007,30
7,100008,35
8,100009,27
9,100010,17


In [71]:
def last_word(r):
    title = r['product_title'].split()
    ms = r['tit'].split(",")
    ic = -1
    for m in ms:
        for i, t in enumerate(title):
            if m == t and i > ic:
                ic = i
    if ic < 0:
        ic = 10
    else:
        ic = len(title) - ic - 1
    return ic #, ms, title

In [None]:
def last_word_match(r):
    title = r['product_title'].split()
    ms = r['tit'].split(",")
    if len(title) > 0 and len(ms) > 0 and title[-1] == ms[-1]:
        return 1.0
    else:
        return 0.0

In [119]:
import re, collections

def words(text): return re.findall('[a-z]+', text.lower()) 

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
   inserts    = [a + c + b     for a, b in splits for c in alphabet]
   return set(deletes + transposes + replaces + inserts)

def known_edits2(word, NWORDS):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words, NWORDS): return set(w for w in words if w in NWORDS)

def correct(word, NWORDS):
    candidates = known([word], NWORDS) or known(edits1(word), NWORDS) or known_edits2(word, NWORDS) or [word]
    return max(candidates, key=NWORDS.get)

In [122]:
correct("sdaf", {"sdf":1})

'sdf'

In [134]:
a = set("wer") #(set("dfr"))
a.difference(set("dfr"))

{'e', 'w'}

In [135]:
already_matched = {}
def similar_words(w1, w2):
    if (w1, w2) in already_matched:
        return already_matched[(w1, w2)]
    if abs(len(w1) - len(w2)) > 2:
        res = False
    elif len(set(w1).difference(set(w2))) > 2 or len(set(w2).difference(set(w1))) > 2 :
        res = False
    else:
        res = w2 == correct(w1, {w2:1})
    already_matched[(w1, w2)] = res
    return res
    
def last_word_match_query_last(r):
    title = r['product_title'].split(' ')
    ms = r['search_term'].split(' ')
    if len(title) > 0 and len(ms) > 0 and similar_words(title[-1], ms[-1]):
        return 1.0
    else:
        return 0.0

In [137]:
already_matched

{(u'espresso', u'stand'): False,
 (u'qualified', u'light'): False,
 (u'only', u'plastics'): False,
 (u'brown', u'divider'): False,
 (u'silver', u'chairs'): False,
 (u'kids', u'podge'): False,
 (u'engine', u'snowblower'): False,
 (u'bearing', u'door'): False,
 (u'connector', u'vent'): False,
 (u'carport', u'canopy'): False,
 (u'black', u'gas'): False,
 (u'silver', u'medicine'): False,
 (u'pallet', u'35'): False,
 (u'mkz', u'installation'): False,
 (u'shelter', u'shelterlogic'): False,
 (u'enclosure', u'lattice'): False,
 (u'opener', u'h'): False,
 (u'black', u'table'): False,
 (u'blush', u'blush'): True,
 (u'sprayer', u'roundup'): False,
 (u'bar', u'sinl'): False,
 (u'tvs', u'shelv'): False,
 (u'system', u'electic'): False,
 (u'black', u'pan'): False,
 (u'washer', u'megashot'): False,
 (u'pack', u'nuts'): False,
 (u'box', u'potting'): False,
 (u'stud', u'siding'): False,
 (u'white', u'kholer'): False,
 (u'swivel', u'swivel'): True,
 (u'almond', u'pole'): False,
 (u'tablets', u'bait'): F

In [None]:
df_all['desc'].map(lambda y: len([x for x in y.split(",") if x and not re.findall(r'[0-9]', x)]))

In [136]:
#df_all['title_match_last_title_word'] = df_all.apply(last_word, axis=1)
#df_all['title_match_last_word'] = df_all.apply(last_word_match, axis=1)
df_all['title_match_last_word_query'] = df_all.apply(last_word_match_query_last, axis=1)

In [33]:
start_time = time.time()

#comment out the lines below use df_all.csv for further grid search testing
#if adding features consider any drops on the 'cust_regression_vals' class

df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title'] +"\t"+df_all['product_description']
df_all['attr'] = df_all['search_term']+"\t"+df_all['brand']

df_all['len_of_query'] = df_all['search_term'].map(lambda x: max(1, len(x.split()))).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x:len(x.split())).astype(np.int64)

df_all['letters_query'] = df_all['search_term'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_title'] = df_all['product_title'].map(lambda x:len(x)).astype(np.int64)
df_all['letters_desc'] = df_all['product_description'].map(lambda x:len(x)).astype(np.int64)
df_all['letters_brand'] = df_all['brand'].map(lambda x:len(x)).astype(np.int64)

###############################
# df_all['query_in_title'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
# df_all['query_in_description'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[2],0))

# df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
# df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
# df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))

df_all['query_in_title'] = df_all['tit'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_description'] = df_all['desc'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_attrs'] = df_all['attributes'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_brand'] = df_all['mfgbrand'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))

df_all['letters_query_in_title'] = df_all['tit'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_description'] = df_all['desc'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_attrs'] = df_all['attributes'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_brand'] = df_all['mfgbrand'].map(lambda x: len(x)).astype(np.int64)


df_all['query_in_title2'] = df_all['tit2'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_description2'] = df_all['desc2'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_brand2'] = df_all['mfgbrand2'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))


df_all['letters_query_in_title2'] = df_all['tit2'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_description2'] = df_all['desc2'].map(lambda x: len(x)).astype(np.int64)

df_all['ratio_letters_query_in_title'] = df_all['letters_query_in_title2']/(df_all['letters_query_in_title'] + 1)
df_all['ratio_letters_query_in_descr'] = df_all['letters_query_in_description2']/(df_all['letters_query_in_description'] + 1)


df_all['query_in_title_num'] = df_all['tit'].map(lambda y: len([x for x in y.split(",") if x and re.match(r'^[0-9]+$', x)]))
df_all['query_in_description_num'] = df_all['desc'].map(lambda y: len([x for x in y.split(",") if x and re.match(r'^[0-9]+$', x)]))
df_all['query_in_attrs_num'] = df_all['attributes'].map(lambda y: len([x for x in y.split(",") if x and re.match(r'^[0-9]+$', x)]))



#df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
#df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
#df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
###############################


df_all['ratio_title'] = df_all['query_in_title']/df_all['len_of_query']
df_all['ratio_description'] = df_all['query_in_description']/df_all['len_of_query']

# df_all['ratio_title'] = df_all['word_in_title']/df_all['len_of_query']
# df_all['ratio_description'] = df_all['word_in_description']/df_all['len_of_query']
# df_all['ratio_brand'] = df_all['word_in_brand']/df_all['len_of_brand']



df_all['title_match_last_title_word'] = df_all.apply(last_word, axis=1)
df_all['title_match_last_word'] = df_all.apply(last_word_match, axis=1)
df_all['title_match_last_word_query'] = df_all.apply(last_word_match_query_last, axis=1)

df_brand = pd.unique(df_all.brand.ravel())
d={}
i = 1
for s in df_brand:
    d[s]=i
    i+=1
df_all['brand_feature'] = df_all['brand'].map(lambda x: d[x])
#df_all['search_term_feature'] = df_all['search_term'].map(lambda x:len(x))

#df_all.to_csv('df_all_322_1.csv')
#df_all = pd.read_csv('df_all.csv', encoding="ISO-8859-1", index_col=0)

df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train[:]
X_test = df_test[:]
print("--- Features Set: %s minutes ---" % round(((time.time() - start_time)/60), 2))

--- Features Set: 0.15 minutes ---


In [None]:
df_all[1000:1030]
#df_all['query_in_title'] + 1
#df_all['query_in_title2']/(df_all['query_in_title'] + 1)

In [138]:
# LOAD FROM SAVED
#df_all = pd.read_csv('df_all.csv', encoding="ISO-8859-1", index_col=0)
#df_all = pd.read_csv('df_all_322_1.csv', encoding="ISO-8859-1", index_col=0)

df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train[:]
X_test = df_test[:]

In [None]:
#df_all[300:320][['relevance', 'product_title', 'search_term', 'product_description']]
#X_train.columns

In [142]:
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
#tsvd = TruncatedSVD(n_components=10, random_state = 2016)
# from sklearn.feature_extraction import DictVectorizer
# dictvect = DictVectorizer()
from sklearn.preprocessing import OneHotEncoder
ohenc = OneHotEncoder()
randomForestRegressor = RandomForestRegressor(n_estimators = 500, min_samples_leaf=3, n_jobs = -1, random_state = 5017, verbose = 1)

clf = pipeline.Pipeline([
        ('union', FeatureUnion(
                    transformer_list = [
                        ('cst',  cust_regression_vals()),  
                    
#                         ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
#                         ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
#                         ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
#                         ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
                    
#                         ('txt1', pipeline.Pipeline([ ('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf)  ])),
#                         ('txt2', pipeline.Pipeline([ ('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf)  ])),
#                         ('txt3', pipeline.Pipeline([ ('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf) ])),
#                         ('txt4', pipeline.Pipeline([ ('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf) ]))
                    
#                         ('brandf', pipeline.Pipeline([ ('s5', cust_txt_col(key='brand_feature')), ('ohenc', ohenc)  ])),
                        ],
                    transformer_weights = {
                        'cst': 1.0,
#                         'txt1': 0.5,
#                         'txt2': 0.25,
#                         'txt3': 0.5,
#                         'txt4': 0.5
#                         'brandf': 1.0
                        },
                n_jobs = -1
                )), 
        ('rfr', randomForestRegressor)])

#clf.set_params(rfr__max_features=10, rfr__max_depth=20)
#clf.fit(X_train, y_train)
# X_train

In [143]:
start_time = time.time()

param_grid = {'rfr__max_features': [2], 'rfr__max_depth': [30]}
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 5, verbose = 20, scoring=RMSE)
model.fit(X_train, y_train)

print("--- Training: %s minutes ---" % round(((time.time() - start_time)/60),2))

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   23.3s remaining:   35.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   29.0s remaining:   19.3s
[Parallel(n_jobs=-1)]: Done   4 out of   5 | elapsed:   33.9s remaining:    8.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   36.3s finished
  for name, trans in self.transformer_list)
  for name, trans in self.transformer_list)
  for name, trans in self.transformer_list)
  for name, trans in self.transformer_list)
  for name, trans in self.transformer_list)
[Parallel(n_jobs=-1)]: Done   1 out of 399 | elapsed:    0.2s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   1 out of  23 | elapsed:    0.1s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   1 out of 500 | elapsed:    0.2s remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   1 out of  61 | elapsed:    0.2s remaining:   10.0s
[Parallel(n_jobs=-1)]: Done   1 out of  17 | elapsed:    0.1s remaining:    1.8s

[CV] rfr__max_features=2, rfr__max_depth=30 ..........................
[CV] rfr__max_features=2, rfr__max_depth=30 ..........................
[CV] rfr__max_features=2, rfr__max_depth=30 ..........................
[CV] rfr__max_features=2, rfr__max_depth=30 ..........................
[CV] rfr__max_features=2, rfr__max_depth=30 ..........................
[CV]  rfr__max_features=2, rfr__max_depth=30, score=-0.476042 -  14.4s[CV]  rfr__max_features=2, rfr__max_depth=30, score=-0.463203 -  19.0s[CV]  rfr__max_features=2, rfr__max_depth=30, score=-0.460106 -  20.8s[CV]  rfr__max_features=2, rfr__max_depth=30, score=-0.469318 -  18.1s[CV]  rfr__max_features=2, rfr__max_depth=30, score=-0.476565 -  12.8s






[Parallel(n_jobs=-1)]: Done   1 out of 500 | elapsed:    0.1s remaining:   52.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    7.3s finished


--- Training: 0.83 minutes ---
Best parameters found by grid search:
{'rfr__max_features': 2, 'rfr__max_depth': 30}
Best CV score:
-0.469046783771


In [None]:
#X_train.ix[3782]
inds = pd.isnull(X_train).any(1).nonzero()[0]
inds

In [None]:
X_train[0:3]

In [None]:
np.isfinite(X_train.sum())

In [None]:
np.isfinite(X_train).all()

In [106]:
print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

Best parameters found by grid search:
{'rfr__max_features': 2, 'rfr__max_depth': 30}
Best CV score:
-0.477891098235


In [144]:
print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

y_pred = model.predict(X_test)
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission2.csv',index=False)

Best parameters found by grid search:
{'rfr__max_features': 2, 'rfr__max_depth': 30}
Best CV score:
-0.469046783771


[Parallel(n_jobs=8)]: Done   1 out of 159 | elapsed:    0.1s remaining:    8.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.0s finished


In [None]:
df_all.dtypes