In [1]:
# # Scrapbook for Home-depot Question

# Imports

## Data Wrangling
import numpy as np
import pandas as pd

## Misc
import os
import re
from pprint import pprint as pp
import zipfile

## Machine Learning
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Config

data_path = "./data/"
stemmer = SnowballStemmer('english')

In [3]:
# Functions

def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())

def process_data(df):
    df['product_title'] = df['product_title'].apply(lambda x: str_stemmer(x))
    df['search_term'] = df['search_term'].apply(lambda x: str_stemmer(x))
    df['product_description'] = df['product_description'].apply(lambda x: str_stemmer(x))
    df['query_len'] = df['search_term'].apply(lambda x: len(x.split()))
    df['product_info'] = df['search_term']+"\t"+df['product_title']+"\t"+df['product_description']
    df['word_in_title'] = df['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
    df['word_in_description'] = df['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
    return df

In [4]:
# Import data

zips = [f for f in os.listdir(data_path) if re.search(".zip$", f)]
data_dict = {}
for zipped in zips:
    data_dict[zipped.split('.')[0]] = pd.read_csv(data_path+zipped, compression='zip', encoding='latin1')
test_df = data_dict['test']
train_df = data_dict['train']
prod_desc = data_dict['product_descriptions']
attributes = data_dict['attributes']
sample_sub = data_dict['sample_submission']

In [5]:
# Process data

df_train = pd.merge(train_df, prod_desc, how='left', on='product_uid').drop('id', axis=1)
df_test = pd.merge(test_df, prod_desc, how='left', on='product_uid').drop('id', axis=1)
df_train = process_data(df_train)
df_test = process_data(df_test)

In [6]:
df_train.head()

Unnamed: 0,product_uid,product_title,search_term,relevance,product_description,query_len,product_info,word_in_title,word_in_description
0,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,"not onli do angl make joint stronger, they als...",2,angl bracket\tsimpson strong-ti 12-gaug angl\t...,1,1
1,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,"not onli do angl make joint stronger, they als...",2,l bracket\tsimpson strong-ti 12-gaug angl\tnot...,1,1
2,100002,behr premium textur deckov 1-gal. #sc-141 tugb...,deck over,3.0,behr premium textur deckov is an innov solid c...,2,deck over\tbehr premium textur deckov 1-gal. #...,1,1
3,100005,delta vero 1-handl shower onli faucet trim kit...,rain shower head,2.33,updat your bathroom with the delta vero single...,3,rain shower head\tdelta vero 1-handl shower on...,1,1
4,100005,delta vero 1-handl shower onli faucet trim kit...,shower onli faucet,2.67,updat your bathroom with the delta vero single...,3,shower onli faucet\tdelta vero 1-handl shower ...,3,2


In [7]:
df_train['product_description'][0]

'not onli do angl make joint stronger, they also provid more consistent, straight corners. simpson strong-ti offer a wide varieti of angl in various size and thick to handl light-duti job or project where a structur connect is needed. some can be bent (skewed) to match the project. for outdoor project or those where moistur is present, use our zmax zinc-coat connectors, which provid extra resist against corros (look for a "z" at the end of the model number).versatil connector for various 90 connect and home repair projectsstrong than angl nail or screw fasten alonehelp ensur joint are consist straight and strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad from 12-gaug steelgalvan for extra corros resistanceinstal with 10d common nail or #9 x 1-1/2 in. strong-driv sd screw'

In [8]:
str_common_word('angl', df_train['product_description'][0])

1

In [9]:
# ML Algo

X_train, X_test, y_train, y_test = train_test_split(df_train[['query_len','word_in_title','word_in_description']], df_train['relevance'], test_size=0.33, random_state=42)
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [10]:
# Print metrics

print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', np.sqrt(metrics.r2_score(y_test, y_pred)))

MAE: 0.39326997308706296
MSE: 0.23660484818601166
RMSE: 0.4864204438405233
R2: 0.41119412324880045


In [17]:
df_train.reset_index()[['index','relevance','query_len','word_in_title','word_in_description']].to_csv(data_path+'moomin.csv')

In [21]:
str_common_word('hello friend', 'lo f')

0

In [34]:
i=1
w1 = df_train['product_info'][i].split('\t')[0]
w2 = df_train['product_info'][i].split('\t')[1]
w3 = df_train['product_info'][i].split('\t')[2]

In [35]:
print(w1)
print(w2)
print(w3)

l bracket
simpson strong-ti 12-gaug angl
not onli do angl make joint stronger, they also provid more consistent, straight corners. simpson strong-ti offer a wide varieti of angl in various size and thick to handl light-duti job or project where a structur connect is needed. some can be bent (skewed) to match the project. for outdoor project or those where moistur is present, use our zmax zinc-coat connectors, which provid extra resist against corros (look for a "z" at the end of the model number).versatil connector for various 90 connect and home repair projectsstrong than angl nail or screw fasten alonehelp ensur joint are consist straight and strongdimensions: 3 in. x 3 in. x 1-1/2 in.mad from 12-gaug steelgalvan for extra corros resistanceinstal with 10d common nail or #9 x 1-1/2 in. strong-driv sd screw


In [30]:
str_common_word(w1,w2)

1

In [32]:
[w2.find(w) for w in w1.split()]

[29, -1]

In [36]:
[w3.find(w) for w in w1.split()]

[6, -1]

In [37]:
w1.split()

['l', 'bracket']