In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
#from sklearn import pipeline, model_selection
from sklearn import pipeline, grid_search
#from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer

import re
import random
random.seed(2016)

In [2]:
LOC = '/Users/rbekbolatov/data/kaggle/homedepot/'
df_train = pd.read_csv(LOC + 'train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(LOC + 'test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(LOC + 'product_descriptions.csv')
df_attr = pd.read_csv(LOC + 'attributes.csv')
df_matches = pd.read_csv(LOC + 'matched_strings_clean.csv').fillna("")

df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"}).fillna("")
num_train = df_train.shape[0]
# (74067, 5), (166693, 4) -> df_train.shape, df_test.shape

In [3]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) # (240760, 5)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')
df_all = pd.merge(df_all, df_matches, on='id')

In [4]:
df_all[0:2]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand,tit,tit2,desc,desc2,attributes,mfgbrand,mfgbrand2
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie,angle,,"angled, angles",,angled,,
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie,,,,,,,


In [5]:
pattern_camel = re.compile(r"([a-z]+)([0-9A]|([A-Z][^ ]+))")
pattern_lcase_number = re.compile(r"([a-z])([0-9])")
pattern_digit_lcase = re.compile(r"([0-9])([a-z])")
pattern_s = re.compile(r"([a-z])'s")
pattern_number_commas = re.compile(r"([0-9]),([0-9])")

    
# 4x2
XBY = "xby"
pattern_xby_d = re.compile(r"(x[0-9])")
pattern_d_xby = re.compile(r"([0-9])x")

# units
pattern_inch = re.compile(r"([0-9])( *)(inches|inch|in|')\.?")
pattern_foot = re.compile(r"([0-9])( *)(foot|feet|ft|''|\")\.?")
pattern_pound = re.compile(r"([0-9])( *)(pounds|pound|lbs|lb)\.?")
pattern_sqft = re.compile(r"([0-9])( *)(square|sq) ?\.?(feet|foot|ft)\.?")
pattern_gallons = re.compile(r"([0-9])( *)(gallons|gallon|gal)\.?")
pattern_oz = re.compile(r"([0-9])( *)(ounces|ounce|oz)\.?")
pattern_cm = re.compile(r"([0-9])( *)(centimeters|cm)\.?")
pattern_mm = re.compile(r"([0-9])( *)(milimeters|mm)\.?")
pattern_deg = re.compile(r"([0-9])( *)(degrees|degree)\.?")
pattern_volt = re.compile(r"([0-9])( *)(volts|volt)\.?")
pattern_watt = re.compile(r"([0-9])( *)(watts|watt)\.?")
pattern_amp = re.compile(r"([0-9])( *)(amperes|ampere|amps|amp)\.?")
pattern_kamp = re.compile(r"([0-9])( *)(kiloamperes|kiloampere|kamps|kamp|ka)\.?")

# split
pattern_split = re.compile('[^0-9a-z]')

known_words = set(["the", "a", "an",
    "this", "that", "which", "whose",
    "other", "and", "or",
    "be", "is", "are", "been",
    "have", "has", "had",
    "can", "could", "will", "would",
    "go", "gone", "see", "seen",
    "all", "some", "any", "most", "several", "no", "none", "nothing",
    "as", "of", "in", "on", "at", "over", "from", "to",
    "with", "through", "for", "when", "then",
    "new", "old",
    "you", "your", "yours", "me", "i", "my", "mine", "it", "its"])

def str_stem(s): 
    if isinstance(s, str) or isinstance(s, unicode):
        
        s = pattern_camel.sub(r"\1 \2", s)
        s = pattern_lcase_number.sub(r"\1 \2", s)
        s = pattern_digit_lcase.sub(r"\1 \2", s)
        s = pattern_number_commas.sub(r"\1\2", s)
        s = pattern_s.sub(r"\1", s)
        
        
        s = s.lower().strip()
        
        # 4ft x 2ft
        s = s.replace(" x "," " + XBY + " ")
        s = s.replace("*"," " + XBY + " ")        
        s = s.replace(" by "," " + XBY)
        s = pattern_xby_d.sub(" " + XBY + " \1", s)
        s = pattern_d_xby.sub("\1 " + XBY + " ", s)
        
        # units
        s = pattern_inch.sub(r"\1 inch ", s)
        s = pattern_foot.sub(r"\1 foot ", s)
        s = pattern_pound.sub(r"\1 pound ", s)
        s = pattern_sqft.sub(r"\1 sqft ", s)
        s = pattern_gallons.sub(r"\1 gal ", s)
        s = pattern_oz.sub(r"\1 oz ", s)
        s = pattern_cm.sub(r"\1 cm ", s)
        s = pattern_mm.sub(r"\1 mm ", s)
        s = pattern_deg.sub(r"\1 deg ", s)
        s = pattern_volt.sub(r"\1 volt ", s)
        s = pattern_watt.sub(r"\1 watt ", s)
        s = pattern_amp.sub(r"\1 amp ", s)
        s = pattern_kamp.sub(r"\1 kamp ", s)
        
        # some by hand
        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool")
        s = s.replace("whirlpoolstainless","whirlpool stainless")
        s = s.replace("pressure-treated","pressure-treated pt")
        
        s = ' '.join([x for x in pattern_split.split(s) if x and x not in known_words])
        return s
    else:
        #raise ValueError("Type of " + str(s) + " is " + str(type(s)))
        #print "HUY"
        return 'null'
    
df_all['search_term'] = df_all['search_term'].map(lambda x:str_stem(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stem(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stem(x))
df_all['brand'] = df_all['brand'].map(lambda x:str_stem(x))

In [6]:
def str_common_word(str1, str2):
    words, cnt = str1.split(), 0
    for word in words:
        if str2.find(word)>=0:
            cnt+=1
    return cnt

def str_whole_word(str1, str2, i_):
    cnt = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return cnt
        else:
            cnt += 1
            i_ += len(str1)
    return cnt

In [25]:
# id	product_title	product_uid	relevance	search_term	product_description	brand
# id, relevance, search_term, product_title, product_description, (product_uid,) brand  [product_info, attr]
class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, hd_searches):
        d_col_drops=['id','relevance','search_term','product_title','product_description','product_info','attr','brand'] + \
        ['tit', 'tit2', 'desc', 'desc2', 'attributes', 'mfgbrand', 'mfgbrand2'] + \
        ['brand_feature'] #['ratio_brand']
        #[] #['ratio_title', 'ratio_description', 'ratio_brand']
        hd_searches = hd_searches.drop(d_col_drops,axis=1).values
        return hd_searches

class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key].apply(str)

def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

def fmse(ground_truth, predictions):
    return mean_squared_error(ground_truth, predictions)

#RMSE  = make_scorer(fmse, greater_is_better=False)
RMSE  = make_scorer(fmean_squared_error, greater_is_better=False)

In [None]:
len([x for x in "a,s4".split(",") if x and not re.findall(r'[0-9]', x)])
# df_all['query_in_title'] = df_all['tit'].map(lambda x: len([x for x in "".split(",") if x]))
# df_all['query_in_description'] = df_all['desc'].map(lambda x: len([x for x in "".split(",") if x]))
# df_all['query_in_attrs'] = df_all['attributes'].map(lambda x: len([x for x in "".split(",") if x]))
# df_all['query_in_brand'] = df_all['mfgbrand'].map(lambda x: len([x for x in "".split(",") if x]))
if re.match(r'^[0-9]+$', "srs94343"):
    print 1

In [None]:
df_all[0:2]

In [None]:
df_all['desc'].map(lambda y: len([x for x in y.split(",") if x and not re.findall(r'[0-9]', x)]))

In [33]:
start_time = time.time()

#comment out the lines below use df_all.csv for further grid search testing
#if adding features consider any drops on the 'cust_regression_vals' class

df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title'] +"\t"+df_all['product_description']
df_all['attr'] = df_all['search_term']+"\t"+df_all['brand']

df_all['len_of_query'] = df_all['search_term'].map(lambda x: max(1, len(x.split()))).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x:len(x.split())).astype(np.int64)

df_all['letters_query'] = df_all['search_term'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_title'] = df_all['product_title'].map(lambda x:len(x)).astype(np.int64)
df_all['letters_desc'] = df_all['product_description'].map(lambda x:len(x)).astype(np.int64)
df_all['letters_brand'] = df_all['brand'].map(lambda x:len(x)).astype(np.int64)

###############################
# df_all['query_in_title'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
# df_all['query_in_description'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[2],0))

# df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
# df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
# df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))

df_all['query_in_title'] = df_all['tit'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_description'] = df_all['desc'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_attrs'] = df_all['attributes'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_brand'] = df_all['mfgbrand'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))

df_all['letters_query_in_title'] = df_all['tit'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_description'] = df_all['desc'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_attrs'] = df_all['attributes'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_brand'] = df_all['mfgbrand'].map(lambda x: len(x)).astype(np.int64)


df_all['query_in_title2'] = df_all['tit2'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_description2'] = df_all['desc2'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))
df_all['query_in_brand2'] = df_all['mfgbrand2'].map(lambda y: len([x for x in y.split(",") if x and not re.match(r'^[0-9]+$', x)]))


df_all['letters_query_in_title2'] = df_all['tit2'].map(lambda x: len(x)).astype(np.int64)
df_all['letters_query_in_description2'] = df_all['desc2'].map(lambda x: len(x)).astype(np.int64)

df_all['ratio_letters_query_in_title'] = df_all['letters_query_in_title2']/(df_all['letters_query_in_title'] + 1)
df_all['ratio_letters_query_in_descr'] = df_all['letters_query_in_description2']/(df_all['letters_query_in_description'] + 1)


df_all['query_in_title_num'] = df_all['tit'].map(lambda y: len([x for x in y.split(",") if x and re.match(r'^[0-9]+$', x)]))
df_all['query_in_description_num'] = df_all['desc'].map(lambda y: len([x for x in y.split(",") if x and re.match(r'^[0-9]+$', x)]))
df_all['query_in_attrs_num'] = df_all['attributes'].map(lambda y: len([x for x in y.split(",") if x and re.match(r'^[0-9]+$', x)]))



#df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
#df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
#df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
###############################


df_all['ratio_title'] = df_all['query_in_title']/df_all['len_of_query']
df_all['ratio_description'] = df_all['query_in_description']/df_all['len_of_query']

# df_all['ratio_title'] = df_all['word_in_title']/df_all['len_of_query']
# df_all['ratio_description'] = df_all['word_in_description']/df_all['len_of_query']
# df_all['ratio_brand'] = df_all['word_in_brand']/df_all['len_of_brand']


df_brand = pd.unique(df_all.brand.ravel())
d={}
i = 1
for s in df_brand:
    d[s]=i
    i+=1
df_all['brand_feature'] = df_all['brand'].map(lambda x: d[x])
#df_all['search_term_feature'] = df_all['search_term'].map(lambda x:len(x))

#df_all.to_csv('df_all_322_1.csv')
#df_all = pd.read_csv('df_all.csv', encoding="ISO-8859-1", index_col=0)

df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train[:]
X_test = df_test[:]
print("--- Features Set: %s minutes ---" % round(((time.time() - start_time)/60), 2))

--- Features Set: 0.15 minutes ---


In [30]:
df_all[0:2]
#df_all['query_in_title'] + 1
#df_all['query_in_title2']/(df_all['query_in_title'] + 1)

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand,tit,tit2,desc,...,letters_query_in_title2,letters_query_in_description2,query_in_title_num,query_in_description_num,query_in_attrs_num,ratio_title,ratio_description,brand_feature,ratio_letters_query_in_title,ratio_letters_query_in_descr
0,2,simpson strong tie 12 gauge angle,100001,3.0,angle bracket,not only do angles make joints stronger they a...,simpson strong tie,angle,,"angled, angles",...,0,0,0,0,0,0.5,1,{u'brand': 1},0,0
1,3,simpson strong tie 12 gauge angle,100001,2.5,l bracket,not only do angles make joints stronger they a...,simpson strong tie,,,,...,0,0,0,0,0,0.0,0,{u'brand': 1},0,0


In [34]:
# LOAD FROM SAVED
#df_all = pd.read_csv('df_all.csv', encoding="ISO-8859-1", index_col=0)
#df_all = pd.read_csv('df_all_322_1.csv', encoding="ISO-8859-1", index_col=0)

df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train[:]
X_test = df_test[:]

In [None]:
#df_all[300:320][['relevance', 'product_title', 'search_term', 'product_description']]
#X_train.columns

In [35]:
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
#tsvd = TruncatedSVD(n_components=10, random_state = 2016)
# from sklearn.feature_extraction import DictVectorizer
# dictvect = DictVectorizer()
from sklearn.preprocessing import OneHotEncoder
ohenc = OneHotEncoder()
randomForestRegressor = RandomForestRegressor(n_estimators = 100, min_samples_leaf=3, n_jobs = -1, random_state = 3017, verbose = 1)

clf = pipeline.Pipeline([
        ('union', FeatureUnion(
                    transformer_list = [
                        ('cst',  cust_regression_vals()),  
                    
#                         ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
#                         ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
#                         ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
#                         ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
                    
#                         ('txt1', pipeline.Pipeline([ ('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf)  ])),
#                         ('txt2', pipeline.Pipeline([ ('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf)  ])),
#                         ('txt3', pipeline.Pipeline([ ('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf) ])),
#                         ('txt4', pipeline.Pipeline([ ('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf) ]))
                    
#                         ('brandf', pipeline.Pipeline([ ('s5', cust_txt_col(key='brand_feature')), ('ohenc', ohenc)  ])),
                        ],
                    transformer_weights = {
                        'cst': 1.0,
#                         'txt1': 0.5,
#                         'txt2': 0.25,
#                         'txt3': 0.5,
#                         'txt4': 0.5
#                         'brandf': 1.0
                        },
                n_jobs = -1
                )), 
        ('rfr', randomForestRegressor)])

#clf.set_params(rfr__max_features=10, rfr__max_depth=20)
#clf.fit(X_train, y_train)
# X_train

In [36]:
start_time = time.time()

param_grid = {'rfr__max_features': [2], 'rfr__max_depth': [30]}
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 3, verbose = 20, scoring=RMSE)
model.fit(X_train, y_train)

print("--- Training: %s minutes ---" % round(((time.time() - start_time)/60),2))

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] rfr__max_features=2, rfr__max_depth=30 ..........................
[CV] rfr__max_features=2, rfr__max_depth=30 ..........................


  for name, trans in self.transformer_list)
  for name, trans in self.transformer_list)


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
    ...........................................................................
/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/Library/Python/2.7/site-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x10abee6b0, file "/Lib...2.7/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Library/Python/2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Library/Python/2.7/site-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/Library/Python/2.7/site-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x10abee6b0, file "/Lib...2.7/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Library/Python/2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Library/Python/2.7/site-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/Library/Python/2.7/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/Library/Python/2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    587         
    588         If a global instance already exists, this reinitializes and starts it
    589         """
    590         app = cls.instance(**kwargs)
    591         app.initialize(argv)
--> 592         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    593 
    594 #-----------------------------------------------------------------------------
    595 # utility functions, for convenience
    596 #-----------------------------------------------------------------------------

...........................................................................
/Library/Python/2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    384     def start(self):
    385         if self.poller is not None:
    386             self.poller.start()
    387         self.kernel.start()
    388         try:
--> 389             ioloop.IOLoop.instance().start()
    390         except KeyboardInterrupt:
    391             pass
    392 
    393 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Library/Python/2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    146             PollIOLoop.configure(ZMQIOLoop)
    147         return PollIOLoop.instance()
    148     
    149     def start(self):
    150         try:
--> 151             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    152         except ZMQError as e:
    153             if e.errno == ETERM:
    154                 # quietly return on ETERM
    155                 pass

...........................................................................
/Library/Python/2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    835                 self._events.update(event_pairs)
    836                 while self._events:
    837                     fd, events = self._events.popitem()
    838                     try:
    839                         fd_obj, handler_func = self._handlers[fd]
--> 840                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    841                     except (OSError, IOError) as e:
    842                         if errno_from_exception(e) == errno.EPIPE:
    843                             # Happens when the client closes the connection
    844                             pass

...........................................................................
/Library/Python/2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    428             # dispatch events:
    429             if events & IOLoop.ERROR:
    430                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    431                 return
    432             if events & IOLoop.READ:
--> 433                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    434                 if not self.socket:
    435                     return
    436             if events & IOLoop.WRITE:
    437                 self._handle_send()

...........................................................................
/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    460                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    461         else:
    462             if self._recv_callback:
    463                 callback = self._recv_callback
    464                 # self._recv_callback = None
--> 465                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    466                 
    467         # self.update_state()
    468         
    469 

...........................................................................
/Library/Python/2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    402         close our socket."""
    403         try:
    404             # Use a NullContext to ensure that all StackContexts are run
    405             # inside our blanket exception handler rather than outside.
    406             with stack_context.NullContext():
--> 407                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    408         except:
    409             gen_log.error("Uncaught exception, closing connection.",
    410                           exc_info=True)
    411             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Library/Python/2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Library/Python/2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    247         if self.control_stream:
    248             self.control_stream.on_recv(self.dispatch_control, copy=False)
    249 
    250         def make_dispatcher(stream):
    251             def dispatcher(msg):
--> 252                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    253             return dispatcher
    254 
    255         for s in self.shell_streams:
    256             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Library/Python/2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'msg_id': u'0042B04B01144372BBC93CC7A136A800', u'msg_type': u'execute_request', u'session': u'6B6372B8C091458DA5AF65371F589570', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'0042B04B01144372BBC93CC7A136A800', 'msg_type': u'execute_request', 'parent_header': {}})
    208         else:
    209             # ensure default_int_handler during handler call
    210             sig = signal(SIGINT, default_int_handler)
    211             self.log.debug("%s: %s", msg_type, msg)
    212             try:
--> 213                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['6B6372B8C091458DA5AF65371F589570']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'msg_id': u'0042B04B01144372BBC93CC7A136A800', u'msg_type': u'execute_request', u'session': u'6B6372B8C091458DA5AF65371F589570', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'0042B04B01144372BBC93CC7A136A800', 'msg_type': u'execute_request', 'parent_header': {}}
    214             except Exception:
    215                 self.log.error("Exception in message handler:", exc_info=True)
    216             finally:
    217                 signal(SIGINT, sig)

...........................................................................
/Library/Python/2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['6B6372B8C091458DA5AF65371F589570'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'msg_id': u'0042B04B01144372BBC93CC7A136A800', u'msg_type': u'execute_request', u'session': u'6B6372B8C091458DA5AF65371F589570', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'0042B04B01144372BBC93CC7A136A800', 'msg_type': u'execute_request', 'parent_header': {}})
    357         if not silent:
    358             self.execution_count += 1
    359             self._publish_execute_input(code, parent, self.execution_count)
    360 
    361         reply_content = self.do_execute(code, silent, store_history,
--> 362                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    363 
    364         # Flush output before sending the reply.
    365         sys.stdout.flush()
    366         sys.stderr.flush()

...........................................................................
/Library/Python/2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    170 
    171         reply_content = {}
    172         # FIXME: the shell calls the exception handler itself.
    173         shell._reply_content = None
    174         try:
--> 175             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)'
        store_history = True
        silent = False
    176         except:
    177             status = u'error'
    178             # FIXME: this code right now isn't being used yet by default,
    179             # because the run_cell() call above directly fires off exception

...........................................................................
/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', store_history=True, silent=False, shell_futures=True)
   2897                 self.displayhook.exec_result = result
   2898 
   2899                 # Execute the user code
   2900                 interactivity = "none" if silent else self.ast_node_interactivity
   2901                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2902                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2903 
   2904                 # Reset this so later displayed values do not modify the
   2905                 # ExecutionResult
   2906                 self.displayhook.exec_result = None

...........................................................................
/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.Print object>, <_ast.Print object>, <_ast.Print object>, <_ast.Print object>, <_ast.Print object>], cell_name='<ipython-input-36-b98650b4b979>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3001 
   3002         try:
   3003             for i, node in enumerate(to_run_exec):
   3004                 mod = ast.Module([node])
   3005                 code = compiler(mod, cell_name, "exec")
-> 3006                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x12dec26b0, file "<ipython-input-36-b98650b4b979>", line 5>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   3007                     return True
   3008 
   3009             for i, node in enumerate(to_run_interactive):
   3010                 mod = ast.Interactive([node])

...........................................................................
/Library/Python/2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x12dec26b0, file "<ipython-input-36-b98650b4b979>", line 5>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3061         outflag = 1  # happens in more places, so it's easier as default
   3062         try:
   3063             try:
   3064                 self.hooks.pre_run_code_hook()
   3065                 #rprint('Running code', repr(code_obj)) # dbg
-> 3066                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x12dec26b0, file "<ipython-input-36-b98650b4b979>", line 5>
        self.user_global_ns = {'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'DictVectorizer': <class 'sklearn.feature_extraction.dict_vectorizer.DictVectorizer'>, 'FeatureUnion': <class 'sklearn.pipeline.FeatureUnion'>, 'In': ['', u'import time\nimport numpy as np\nimport pandas...r\n\nimport re\nimport random\nrandom.seed(2016)', u'LOC = \'/Users/rbekbolatov/data/kaggle/homedep...5), (166693, 4) -> df_train.shape, df_test.shape', u"df_all = pd.concat((df_train, df_test), axis=0...\ndf_all = pd.merge(df_all, df_matches, on='id')", u'df_all[0:2]', u'pattern_camel = re.compile(r"([a-z]+)([0-9A]|(...'] = df_all[\'brand\'].map(lambda x:str_stem(x))', u'def str_common_word(str1, str2):\n    words, c...= 1\n            i_ += len(str1)\n    return cnt', u"# id\tproduct_title\tproduct_uid\trelevance\ts...er(fmean_squared_error, greater_is_better=False)", u'start_time = time.time()\n\n#comment out the l...--" % round(((time.time() - start_time)/60), 2))', u'# LOAD FROM SAVED\n#df_all = pd.read_csv(\'df_...lues\nX_train = df_train[:]\nX_test = df_test[:]', u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', u'start_time = time.time()\n\n#comment out the l...--" % round(((time.time() - start_time)/60), 2))', u'# LOAD FROM SAVED\n#df_all = pd.read_csv(\'df_...lues\nX_train = df_train[:]\nX_test = df_test[:]', u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', u'start_time = time.time()\n\n#comment out the l...--" % round(((time.time() - start_time)/60), 2))', u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", ...], 'LOC': '/Users/rbekbolatov/data/kaggle/homedepot/', 'OneHotEncoder': <class 'sklearn.preprocessing.data.OneHotEncoder'>, 'Out': {4:    id                      product_title  produc...      

  mfgbrand2  
0            
1            , 30:    id                      product_title  produc...                       0  

[2 rows x 45 columns]}, 'RMSE': make_scorer(fmean_squared_error, greater_is_better=False), 'RandomForestRegressor': <class 'sklearn.ensemble.forest.RandomForestRegressor'>, 'TfidfVectorizer': <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, ...}
        self.user_ns = {'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'DictVectorizer': <class 'sklearn.feature_extraction.dict_vectorizer.DictVectorizer'>, 'FeatureUnion': <class 'sklearn.pipeline.FeatureUnion'>, 'In': ['', u'import time\nimport numpy as np\nimport pandas...r\n\nimport re\nimport random\nrandom.seed(2016)', u'LOC = \'/Users/rbekbolatov/data/kaggle/homedep...5), (166693, 4) -> df_train.shape, df_test.shape', u"df_all = pd.concat((df_train, df_test), axis=0...\ndf_all = pd.merge(df_all, df_matches, on='id')", u'df_all[0:2]', u'pattern_camel = re.compile(r"([a-z]+)([0-9A]|(...'] = df_all[\'brand\'].map(lambda x:str_stem(x))', u'def str_common_word(str1, str2):\n    words, c...= 1\n            i_ += len(str1)\n    return cnt', u"# id\tproduct_title\tproduct_uid\trelevance\ts...er(fmean_squared_error, greater_is_better=False)", u'start_time = time.time()\n\n#comment out the l...--" % round(((time.time() - start_time)/60), 2))', u'# LOAD FROM SAVED\n#df_all = pd.read_csv(\'df_...lues\nX_train = df_train[:]\nX_test = df_test[:]', u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', u'start_time = time.time()\n\n#comment out the l...--" % round(((time.time() - start_time)/60), 2))', u'# LOAD FROM SAVED\n#df_all = pd.read_csv(\'df_...lues\nX_train = df_train[:]\nX_test = df_test[:]', u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u'start_time = time.time()\n\nparam_grid = {\'rf...rint("Best CV score:")\nprint(model.best_score_)', u'start_time = time.time()\n\n#comment out the l...--" % round(((time.time() - start_time)/60), 2))', u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", u"tfidf = TfidfVectorizer(ngram_range=(1, 1), st...depth=20)\n#clf.fit(X_train, y_train)\n# X_train", ...], 'LOC': '/Users/rbekbolatov/data/kaggle/homedepot/', 'OneHotEncoder': <class 'sklearn.preprocessing.data.OneHotEncoder'>, 'Out': {4:    id                      product_title  produc...      

  mfgbrand2  
0            
1            , 30:    id                      product_title  produc...                       0  

[2 rows x 45 columns]}, 'RMSE': make_scorer(fmean_squared_error, greater_is_better=False), 'RandomForestRegressor': <class 'sklearn.ensemble.forest.RandomForestRegressor'>, 'TfidfVectorizer': <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, ...}
   3067             finally:
   3068                 # Reset our crash handler in place
   3069                 sys.excepthook = old_excepthook
   3070         except SystemExit as e:

...........................................................................
/Users/rbekbolatov/repos/gh/bekbolatov/kaggle/events/hd/notebooks/<ipython-input-36-b98650b4b979> in <module>()
      1 
      2 start_time = time.time()
      3 
      4 param_grid = {'rfr__max_features': [2], 'rfr__max_depth': [30]}
----> 5 model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 3, verbose = 20, scoring=RMSE)
      6 model.fit(X_train, y_train)
      7 
      8 print("--- Training: %s minutes ---" % round(((time.time() - start_time)/60),2))
      9 
     10 print("Best parameters found by grid search:")
     11 print(model.best_params_)

...........................................................................
/Library/Python/2.7/site-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=3, error_score='raise',
       e...ror, greater_is_better=False),
       verbose=20), X=           id                                   ...            0.866667  

[74067 rows x 45 columns], y=array([ 3.  ,  2.5 ,  3.  , ...,  2.33,  3.  ,  2.33]))
    727         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    728             Target relative to X for classification or regression;
    729             None for unsupervised learning.
    730 
    731         """
--> 732         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...or, greater_is_better=False),
       verbose=20)>
        X =            id                                   ...            0.866667  

[74067 rows x 45 columns]
        y = array([ 3.  ,  2.5 ,  3.  , ...,  2.33,  3.  ,  2.33])
        self.param_grid = {'rfr__max_depth': [30], 'rfr__max_features': [2]}
    733 
    734 
    735 class RandomizedSearchCV(BaseSearchCV):
    736     """Randomized search on hyper parameters.

...........................................................................
/Library/Python/2.7/site-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=3, error_score='raise',
       e...ror, greater_is_better=False),
       verbose=20), X=           id                                   ...            0.866667  

[74067 rows x 45 columns], y=array([ 3.  ,  2.5 ,  3.  , ...,  2.33,  3.  ,  2.33]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    500         )(
    501             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    502                                     train, test, self.verbose, parameters,
    503                                     self.fit_params, return_parameters=True,
    504                                     error_score=self.error_score)
--> 505                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    506                 for train, test in cv)
    507 
    508         # Out is a list of triplet: score, estimator, n_test_samples
    509         n_fits = len(out)

...........................................................................
/Library/Python/2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<itertools.islice object>)
    661             if pre_dispatch == "all" or n_jobs == 1:
    662                 # The iterable was consumed all at once by the above for loop.
    663                 # No need to wait for async callbacks to trigger to
    664                 # consumption.
    665                 self._iterating = False
--> 666             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    667             # Make sure that we get a last message telling us we are done
    668             elapsed_time = time.time() - self._start_time
    669             self._print('Done %3i out of %3i | elapsed: %s finished',
    670                         (len(self._output),

    ---------------------------------------------------------------------------
    Sub-process traceback:
    ---------------------------------------------------------------------------
    ValueError                                         Wed Mar 23 03:45:42 2016
PID: 41621                                   Python 2.7.10: /usr/bin/python
...........................................................................
/Library/Python/2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=Pipeline(steps=[('union', FeatureUnion(n_jobs=-1...=3017,
           verbose=1, warm_start=False))]), X=           id                                   ...            0.866667  

[74067 rows x 45 columns], y=array([ 3.  ,  2.5 ,  3.  , ...,  2.33,  3.  ,  2.33]), scorer=make_scorer(fmean_squared_error, greater_is_better=False), train=array([24689, 24690, 24691, ..., 74064, 74065, 74066]), test=array([    0,     1,     2, ..., 24686, 24687, 24688]), verbose=20, parameters={'rfr__max_depth': 30, 'rfr__max_features': 2}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
   1454 
   1455     try:
   1456         if y_train is None:
   1457             estimator.fit(X_train, **fit_params)
   1458         else:
-> 1459             estimator.fit(X_train, y_train, **fit_params)
   1460 
   1461     except Exception as e:
   1462         if error_score == 'raise':
   1463             raise

...........................................................................
/Library/Python/2.7/site-packages/sklearn/pipeline.pyc in fit(self=Pipeline(steps=[('union', FeatureUnion(n_jobs=-1...=3017,
           verbose=1, warm_start=False))]), X=           id                                   ...            0.866667  

[49378 rows x 45 columns], y=array([ 2.67,  2.  ,  2.33, ...,  2.33,  3.  ,  2.33]), **fit_params={})
    135             pipeline.
    136         y : iterable, default=None
    137             Training targets. Must fulfill label requirements for all steps of
    138             the pipeline.
    139         """
--> 140         Xt, fit_params = self._pre_transform(X, y, **fit_params)
    141         self.steps[-1][-1].fit(Xt, y, **fit_params)
    142         return self
    143 
    144     def fit_transform(self, X, y=None, **fit_params):

...........................................................................
/Library/Python/2.7/site-packages/sklearn/pipeline.pyc in _pre_transform(self=Pipeline(steps=[('union', FeatureUnion(n_jobs=-1...=3017,
           verbose=1, warm_start=False))]), X=           id                                   ...            0.866667  

[49378 rows x 45 columns], y=array([ 2.67,  2.  ,  2.33, ...,  2.33,  3.  ,  2.33]), **fit_params={})
    116             step, param = pname.split('__', 1)
    117             fit_params_steps[step][param] = pval
    118         Xt = X
    119         for name, transform in self.steps[:-1]:
    120             if hasattr(transform, "fit_transform"):
--> 121                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    122             else:
    123                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
    124                               .transform(Xt)
    125         return Xt, fit_params_steps[self.steps[-1][0]]

...........................................................................
/Library/Python/2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self=FeatureUnion(n_jobs=-1,
       transformer_list=... transformer_weights={'cst': 1.0, 'brandf': 1.0}), X=           id                                   ...            0.866667  

[49378 rows x 45 columns], y=array([ 2.67,  2.  ,  2.33, ...,  2.33,  3.  ,  2.33]), **fit_params={})
    449             for name, trans in self.transformer_list)
    450 
    451         Xs, transformers = zip(*result)
    452         self._update_transformer_list(transformers)
    453         if any(sparse.issparse(f) for f in Xs):
--> 454             Xs = sparse.hstack(Xs).tocsr()
    455         else:
    456             Xs = np.hstack(Xs)
    457         return Xs
    458 

...........................................................................
/Library/Python/2.7/site-packages/scipy/sparse/construct.pyc in hstack(blocks=(array([[  1.21724000e+05,   3.00000000e+00,   1....000000e-01,   8.80000000e-01,   8.66666667e-01]]), <1x49378 sparse matrix of type '<type 'numpy.flo... stored elements in Compressed Sparse Row format>), format=None, dtype=None)
    451     >>> hstack([A,B]).toarray()
    452     array([[1, 2, 5],
    453            [3, 4, 6]])
    454 
    455     """
--> 456     return bmat([blocks], format=format, dtype=dtype)
    457 
    458 
    459 def vstack(blocks, format=None, dtype=None):
    460     """

...........................................................................
/Library/Python/2.7/site-packages/scipy/sparse/construct.pyc in bmat(blocks=array([[ <49378x29 sparse matrix of type '<type ...d elements in COOrdinate format>]], dtype=object), format=None, dtype=None)
    568 
    569                 if brow_lengths[i] == 0:
    570                     brow_lengths[i] = A.shape[0]
    571                 else:
    572                     if brow_lengths[i] != A.shape[0]:
--> 573                         raise ValueError('blocks[%d,:] has incompatible row dimensions' % i)
    574 
    575                 if bcol_lengths[j] == 0:
    576                     bcol_lengths[j] = A.shape[1]
    577                 else:

ValueError: blocks[0,:] has incompatible row dimensions
___________________________________________________________________________

In [None]:
#X_train.ix[3782]
inds = pd.isnull(X_train).any(1).nonzero()[0]
inds

In [None]:
X_train[0:3]

In [None]:
np.isfinite(X_train.sum())

In [None]:
np.isfinite(X_train).all()

In [None]:
print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

In [None]:
print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

y_pred = model.predict(X_test)
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)

In [None]:
df_all.dtypes