In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer



In [30]:
# English Snowball stemmer; str_stemmer() reads this module-level object.
stemmer = SnowballStemmer('english')

# Train and test are decoded as ISO-8859-1; the product descriptions are read
# with the pandas default encoding.
df_train, df_test = (
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in ('input/train.csv', 'input/test.csv')
)
# df_attr = pd.read_csv('../input/attributes.csv')
df_pro_desc = pd.read_csv('input/product_descriptions.csv')

# Number of training rows, captured now so the combined train+test frame can
# later be cut back into its two parts by position.
num_train = len(df_train)

def str_stemmer(s):
    """Return ``s`` lower-cased with every whitespace-separated token stemmed.

    The tokens are stemmed with the module-level ``stemmer`` and re-joined
    with single spaces, so any run of whitespace in ``s`` collapses to one
    space in the result.
    """
    tokens = s.lower().split()
    return " ".join(stemmer.stem(token) for token in tokens)

def str_common_word(str1, str2):
    """Count the whitespace-separated tokens of ``str1`` that occur in ``str2``.

    Matching is a plain substring test against ``str2`` (not a whole-word
    match), and a token repeated in ``str1`` is counted once per occurrence.
    """
    return sum(1 for token in str1.split() if token in str2)


In [31]:

# Attach the product description to every train and test row via product_uid.
# A left join keeps each query row even when no description matches.
df_train = df_train.merge(df_pro_desc, on='product_uid', how='left')
df_test = df_test.merge(df_pro_desc, on='product_uid', how='left')

# Stack train on top of test under a fresh 0..n-1 index so that features are
# computed once for both parts.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)

df_all

Unnamed: 0,id,product_description,product_title,product_uid,relevance,search_term
0,2,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie 12-Gauge Angle,100001,3.00,angle bracket
1,3,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie 12-Gauge Angle,100001,2.50,l bracket
2,9,BEHR Premium Textured DECKOVER is an innovativ...,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.00,deck over
3,16,Update your bathroom with the Delta Vero Singl...,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head
4,17,Update your bathroom with the Delta Vero Singl...,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet
5,18,Achieving delicious results is almost effortle...,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,convection otr
6,20,Achieving delicious results is almost effortle...,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,2.67,microwave over stove
7,21,Achieving delicious results is almost effortle...,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.00,microwaves
8,23,The Quantum Adjustable 2-Light LED Black Emerg...,Lithonia Lighting Quantum 2-Light Black LED Em...,100007,2.67,emergency light
9,27,Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,100009,3.00,mdf 3/4


In [32]:


# Normalise the three text columns with the stemming helper.
df_all['search_term'] = df_all['search_term'].map(str_stemmer)
df_all['product_title'] = df_all['product_title'].map(str_stemmer)
df_all['product_description'] = df_all['product_description'].map(str_stemmer)

# Number of whitespace-separated tokens in the stemmed query.
df_all['len_of_query'] = df_all['search_term'].map(lambda query: len(query.split())).astype(np.int64)

# How many query tokens appear (as substrings) in the title and in the
# description. The columns are paired row by row; str_stemmer rebuilds each
# string from whitespace-split tokens joined by single spaces, so this should
# give the same counts as joining the columns on "\t" and splitting again.
df_all['word_in_title'] = [
    str_common_word(query, title)
    for query, title in zip(df_all['search_term'], df_all['product_title'])
]
df_all['word_in_description'] = [
    str_common_word(query, description)
    for query, description in zip(df_all['search_term'], df_all['product_description'])
]

# Only the numeric columns are kept for modelling.
df_all = df_all.drop(['search_term', 'product_title', 'product_description'], axis=1)

# Filling nan: every remaining missing value becomes 0 (presumably this
# includes `relevance` on rows that came from the test file -- verify).
df_all = df_all.fillna(0)

# Undo the earlier concatenation by position: the first num_train rows are
# the training set, the remainder is the test set.
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']

# Target vector and feature matrices; `id` and `relevance` are not features.
non_feature_cols = ['id', 'relevance']
y_train = df_train['relevance'].values
X_train = df_train.drop(non_feature_cols, axis=1).values
X_test = df_test.drop(non_feature_cols, axis=1).values



In [33]:
# Base learner: a random forest of 15 trees, each limited to depth 6.
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)

# Bagging ensemble of 45 such forests with max_samples=0.1; fit() returns the
# estimator itself, so construction and training are chained.
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25).fit(X_train, y_train)

y_pred = clf.predict(X_test)



In [34]:
def _round(num):
    if num < 1:
        return 1
    elif num > 3:
        return 3
    else:
        return num
    
# Clamp every prediction to [1, 3]. The result is materialised as a list
# because on Python 3 a bare map() is a lazy, single-use iterator with no
# length: it could not be inspected, sized, or consumed more than once.
y_pred2 = list(map(_round, y_pred))

In [35]:
# Write one row per test id with its clamped prediction; the DataFrame index
# is left out of the CSV.
pd.DataFrame(
    {"id": id_test, "relevance": y_pred2}
).to_csv('submission.csv', index=False)