In [75]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer

# Create the dataframes

In [76]:
# train.csv and test.csv are decoded as ISO-8859-1; the other two files use
# the pandas default encoding.
df_train = pd.read_csv(
    'home_depot_data/train.csv',
    encoding='ISO-8859-1',
)
df_test = pd.read_csv(
    'home_depot_data/test.csv',
    encoding='ISO-8859-1',
)

# NOTE(review): df_test, df_attr and df_pro_desc are not referenced by any of
# the cells visible in this notebook -- confirm they are still needed.
df_attr = pd.read_csv('home_depot_data/attributes.csv')
df_pro_desc = pd.read_csv('home_depot_data/product_descriptions.csv')

# Add stemmer

In [77]:
# One shared English Snowball stemmer, reused by every call to str_stemmer.
stemmer = SnowballStemmer('english')


def str_stemmer(s):
    """Lower-case ``s``, split it on whitespace and stem every token.

    Returns the stemmed tokens joined by single spaces, so any run of
    whitespace in the input (tabs and newlines included) collapses to one
    space in the result.
    """
    tokens = s.lower().split()
    return " ".join(map(stemmer.stem, tokens))

def str_common_word(str1, str2):
    """Count the whitespace-separated tokens of ``str1`` that occur in ``str2``.

    Matching is by substring, not by whole word, and a token that appears
    several times in ``str1`` is counted once per appearance.
    """
    hits = 0
    for token in str1.split():
        if token in str2:
            hits += 1
    return hits

In [78]:
# Show the raw training frame as this cell's output (the recorded display
# below is truncated with "..." rows by pandas).
df_train

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67
...,...,...,...,...,...
74062,221457,206638,Atlantic Windowpane 576 CD or 192 DVD Blu-Ray ...,tv riser glass,1.00
74063,221458,206639,Philips 40-Watt Halogen R20 Flood Light Bulb (...,r20 halogen light,3.00
74064,221463,206641,Schlage Camelot In-Active Aged Bronze Handlese...,schlage lock siena half dummy knob with,2.33
74065,221471,206648,Plastec 11 in. x 24 in. Rose Garden Wall Decor...,zen garden decor,3.00


# Dataframe processing

In [79]:
# Baseline frame: the raw columns minus the two free-text fields.
df_train_base = df_train.drop(columns=['search_term', 'product_title'])

# Stem both text fields once; str_stemmer also normalises whitespace.
query_stems = df_train['search_term'].map(str_stemmer)
title_stems = df_train['product_title'].map(str_stemmer)

# Engineered frame: the baseline columns plus two numeric features.
#   len_of_query  - number of tokens in the stemmed search term
#   word_in_title - number of query tokens found (as substrings) in the
#                   stemmed product title
df_train = df_train_base.assign(
    len_of_query=query_stems.map(str.split).map(len).astype(np.int64),
    word_in_title=[
        str_common_word(query, title)
        for query, title in zip(query_stems, title_stems)
    ],
)

In [80]:
# Apply the same 80/20 split settings (including the seed) to the engineered
# frame and to the baseline frame.
(train, test), (base_train, base_test) = (
    train_test_split(frame, test_size=0.2, random_state=23)
    for frame in (df_train, df_train_base)
)

# Define test and training sets

In [81]:
# Row ids of the held-out split.
id_test = test['id']

# Feature matrices: every column except the row id and the target.
X_train = train.drop(columns=['id', 'relevance']).values
X_test = test.drop(columns=['id', 'relevance']).values

# Targets.
y_train = train['relevance'].values
y_test = test['relevance'].values

In [82]:
# Bagging ensemble of 45 shallow random forests (15 trees, depth <= 6), with
# max_samples=0.1 for each bagged estimator.
rf = RandomForestRegressor(
    n_estimators=15,
    max_depth=6,
    random_state=0,
)
clf = BaggingRegressor(
    rf,
    n_estimators=45,
    max_samples=0.1,
    random_state=25,
)

# fit() returns the fitted estimator, so prediction can be chained onto it.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [83]:
# Row ids of the baseline held-out split.
id_base_test = base_test['id']

# Baseline feature matrices: every column except the row id and the target.
X_base_train = base_train.drop(columns=['id', 'relevance']).values
X_base_test = base_test.drop(columns=['id', 'relevance']).values

# Baseline targets.
y_base_train = base_train['relevance'].values
y_base_test = base_test['relevance'].values

In [None]:
# Baseline model: same hyper-parameters as the engineered-feature model so
# that only the input features differ.
rf_base = RandomForestRegressor(
    n_estimators=15,
    max_depth=6,
    random_state=0,
)
clf_base = BaggingRegressor(
    rf_base,
    n_estimators=45,
    max_samples=0.1,
    random_state=25,
)

# Fit on the baseline training split, then predict the baseline test split.
y_base_pred = clf_base.fit(X_base_train, y_base_train).predict(X_base_test)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Root mean squared error of each model on its own held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_base = np.sqrt(mean_squared_error(y_base_test, y_base_pred))

# Each line carries its own label; previously both were printed with the
# same text, so the two scores could not be told apart in the output.
print(f"Root Mean Squared Error (RMSE), engineered features: {rmse:.4f}")
print(f"Root Mean Squared Error (RMSE), baseline features: {rmse_base:.4f}")


Root Mean Squared Error (RMSE): 0.4895
