## Method with Attribute

In [75]:
import os
from math import sqrt
import numpy as np
import pandas as pd

# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

# gensim
from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from gensim.models.word2vec import Word2Vec

# other
import Levenshtein
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from scipy import spatial
from xgboost import XGBClassifier

In [2]:
df_train = pd.read_csv('stemmed_data/train.csv', encoding = "ISO-8859-1")
df_test = pd.read_csv('stemmed_data/test.csv', encoding = "ISO-8859-1")
df_desc = pd.read_csv('stemmed_data/product_descriptions.csv', encoding = "ISO-8859-1")
df_desc = df_desc[['product_uid', 'product_description']]

df_attr_material = pd.read_csv('stemmed_data/attr_material.csv', encoding = "ISO-8859-1").dropna(how='any')
df_attr_brand = pd.read_csv('stemmed_data/attr_brand.csv', encoding = "ISO-8859-1").dropna(how='any')
df_attr_bullets = pd.read_csv('stemmed_data/attr_bullets.csv', encoding = "ISO-8859-1").dropna(how='any')

In [3]:
# add bullets to description
df_desc = pd.merge(df_desc, df_attr_bullets, how='left', on='product_uid')
df_desc['product_description'] = df_desc['product_description'].map(lambda x: x + ' ') + df_desc['bullets']
df_desc = df_desc.drop(['bullets'], axis=1)

In [4]:
# add brand and material
df_desc = pd.merge(df_desc, df_attr_brand, how='left', on='product_uid')
df_desc = pd.merge(df_desc, df_attr_material, how='left', on='product_uid')
print(df_desc.head(1))

   product_uid                                product_description  \
0       100001  not onli do angl make joint stronger, they als...   

               brand      material  
0  simpson strong-ti  galvan steel  


In [6]:
df_train = pd.merge(df_train, df_desc, how='left', on='product_uid')
df_train['material'] = df_train['material'].fillna(' ')
print(df_train.head(0))
df_test = pd.merge(df_test, df_desc, how='left', on='product_uid')
df_test['material'] = df_test['material'].fillna(' ')
print(df_test.head(0))

Empty DataFrame
Columns: [id, product_uid, product_title, search_term, product_description, brand, material]
Index: []


In [19]:
df_train.to_csv('midterm_data/df_train.csv', index=False)
df_test.to_csv('midterm_data/df_test.csv', index=False)

Pre-processing ends here.

Pre-processing only needs to be done once. After that, just read data from pre-processed csv and use it.

In [6]:
df_train = pd.read_csv('midterm_data/df_train.csv')
df_train['product_description'] = df_train.apply(lambda x: str(x['product_description']), axis=1)
df_train['brand'] = df_train.apply(lambda x: str(x['brand']), axis=1)
df_train['material'] = df_train.apply(lambda x: str(x['material']), axis=1)
df_test = pd.read_csv('midterm_data/df_test.csv')
df_test['product_description'] = df_test.apply(lambda x: str(x['product_description']), axis=1)
df_test['brand'] = df_test.apply(lambda x: str(x['brand']), axis=1)
df_test['material'] = df_test.apply(lambda x: str(x['material']), axis=1)

In [28]:
print(df_train.head(1))
print(df_test.head(1))

   id  product_uid                   product_title   search_term  relevance  \
0   2       100001  simpson strong-ti 12-gaug angl  angl bracket        3.0   

                                 product_description              brand  \
0  not onli do angl make joint stronger, they als...  simpson strong-ti   

       material  dist_in_title  dist_in_desc           ...             \
0  galvan steel       0.190476      0.021602           ...              

   common_brand  common_material  tfidf_cos_sim_in_title  \
0             0                0                0.274629   

   tfidf_cos_sim_in_desc  tfidf_cos_sim_in_brand  tfidf_cos_sim_in_material  \
0               0.175537                     0.0                        0.0   

   w2v_cos_sim_in_title  w2v_cos_sim_in_desc  w2v_cos_sim_in_brand  \
0              0.483795             0.430817              0.150791   

   w2v_cos_sim_in_material  
0                 0.215702  

[1 rows x 24 columns]
   id  product_uid                   prod

In [11]:
def str_common_word(s1, s2):
    return sum([int(s2.find(word) >= 0) for word in s1.split()])

In [12]:
# use common words number in search term
# train
df_train['common_title'] = \
    df_train.apply(lambda x: str_common_word(x['product_title'], x['search_term']), axis=1)
df_train['common_desc'] = \
    df_train.apply(lambda x: str_common_word(x['product_description'], x['search_term']), axis=1)
df_train['common_brand'] = \
    df_train.apply(lambda x: str_common_word(x['brand'], x['search_term']), axis=1)
df_train['common_material'] = \
    df_train.apply(lambda x: str_common_word(x['material'], x['search_term']), axis=1)

# test
df_test['common_title'] = \
    df_test.apply(lambda x: str_common_word(x['product_title'], x['search_term']), axis=1)
df_test['common_desc'] = \
    df_test.apply(lambda x: str_common_word(x['product_description'], x['search_term']), axis=1)
df_test['common_brand'] = \
    df_test.apply(lambda x: str_common_word(x['brand'], x['search_term']), axis=1)
df_test['common_material'] = \
    df_test.apply(lambda x: str_common_word(x['material'], x['search_term']), axis=1)

In [9]:
# use levenshtein distance
# train
df_train['dist_in_title'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_title']), axis=1)
df_train['dist_in_desc'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_description']), axis=1)
df_train['dist_in_brand'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['brand']), axis=1)
df_train['dist_in_material'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['material']), axis=1)

# test
df_test['dist_in_title'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_title']), axis=1)
df_test['dist_in_desc'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_description']), axis=1)
df_test['dist_in_brand'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['brand']), axis=1)
df_test['dist_in_material'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['material']), axis=1)

In [13]:
print(df_train.head(1))
print(df_test.head(1))

   id  product_uid                   product_title   search_term  relevance  \
0   2       100001  simpson strong-ti 12-gaug angl  angl bracket        3.0   

                                 product_description              brand  \
0  not onli do angl make joint stronger, they als...  simpson strong-ti   

       material  dist_in_title  dist_in_desc  dist_in_brand  dist_in_material  \
0  galvan steel       0.190476      0.021602       0.275862          0.333333   

   common_title  common_desc  common_brand  common_material  
0             1            7             0                0  
   id  product_uid                   product_title       search_term  \
0   1       100001  simpson strong-ti 12-gaug angl  90 degre bracket   

                                 product_description              brand  \
0  not onli do angl make joint stronger, they als...  simpson strong-ti   

       material  dist_in_title  dist_in_desc  dist_in_brand  dist_in_material  \
0  galvan steel       0.17

## Add similarity

In [14]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True, sort=False)
df_all['all_texts'] = df_all['product_title'] + ' . ' + df_all['product_description'] + ' . '
all_text = df_all['all_texts'].values
dictionary = Dictionary(list(tokenize(x, errors='ignore')) for x in all_text)
print('generated dictionary')

generated dictionary


In [15]:
def to_tfidf(text):
    res = tfidf[dictionary.doc2bow(list(tokenize(text, errors='ignore')))]
    return res


def cos_sim(s1, s2):
    t1 = to_tfidf(s1)
    t2 = to_tfidf(s2)
    index = MatrixSimilarity([t1],num_features=len(dictionary))
    sim = index[t2]
    return float(sim[0])

In [16]:
class MyCorpus(object):
    def __iter__(self):
        for x in all_text:
            yield dictionary.doc2bow(list(tokenize(x, errors='ignore')))

corpus = MyCorpus()
tfidf = TfidfModel(corpus)

In [17]:
print('train:')
print('calculating similarity between search terms and product title')
df_train['tfidf_cos_sim_in_title'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculating similarity between search terms and product description')
df_train['tfidf_cos_sim_in_desc'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculating similarity between search terms and brand')
df_train['tfidf_cos_sim_in_brand'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['brand']), axis=1)
print('calculating similarity between search terms and material')
df_train['tfidf_cos_sim_in_material'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['material']), axis=1)
df_train.to_csv('midterm_data/df_train_lev_tfidf_sim.csv', index=False)

print('test:')
print('calculating similarity between search terms and product title')
df_test['tfidf_cos_sim_in_title'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculating similarity between search terms and product description')
df_test['tfidf_cos_sim_in_desc'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculating similarity between search terms and brand')
df_test['tfidf_cos_sim_in_brand'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['brand']), axis=1)
print('calculating similarity between search terms and material')
df_test['tfidf_cos_sim_in_material'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['material']), axis=1)
df_test.to_csv('midterm_data/df_test_lev_tfidf_sim.csv', index=False)

train:
calculating similarity between search terms and product title
calculating similarity between search terms and product description
calculating similarity between search terms and brand
calculating similarity between search terms and material
test:
calculating similarity between search terms and product title
calculating similarity between search terms and product description
calculating similarity between search terms and brand
calculating similarity between search terms and material


In [24]:
def get_vector(text):
    res = np.zeros([128])
    count = 0
    for word in word_tokenize(text):
        res += model[word]
        count+=1
    return res/count


def w2v_cos_sim(t1, t2):
    try:
        w2v1 = get_vector(t1)
        w2v2 = get_vector(t2)
        sim = 1 - spatial.distance.cosine(w2v1, w2v2)
        return float(sim)
    except:
        return float(0)

In [26]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print('tokenizing')
sentences = [tokenizer.tokenize(x) for x in all_text]
sentences = [y for x in sentences for y in x]
w2v_corpus = [word_tokenize(x) for x in sentences]
print('word 2 vec')
model = Word2Vec(w2v_corpus, size=128, window=5, min_count=5, workers=4)

tokenizing
word 2 vec


In [27]:
print('train:')
print('calculate cosine similarity between word vector and title')
df_train['w2v_cos_sim_in_title'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculate cosine similarity between word vector and description')
df_train['w2v_cos_sim_in_desc'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculate cosine similarity between word vector and brand')
df_train['w2v_cos_sim_in_brand'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['brand']), axis=1)
print('calculate cosine similarity between word vector and material')
df_train['w2v_cos_sim_in_material'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['material']), axis=1)
df_train.to_csv('midterm_data/df_train_lev_tfidf_sim_w2v_sim.csv', index=False)

print('test:')
print('calculate cosine similarity between word vector and title')
df_test['w2v_cos_sim_in_title'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculate cosine similarity between word vector and description')
df_test['w2v_cos_sim_in_desc'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculate cosine similarity between word vector and brand')
df_test['w2v_cos_sim_in_brand'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['brand']), axis=1)
print('calculate cosine similarity between word vector and material')
df_test['w2v_cos_sim_in_material'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['material']), axis=1)
df_test.to_csv('midterm_data/df_test_lev_tfidf_sim_w2v_sim.csv', index=False)

train:
calculate cosine similarity between word vector and title


  # Remove the CWD from sys.path while we load stuff.


calculate cosine similarity between word vector and description
calculate cosine similarity between word vector and brand
calculate cosine similarity between word vector and material


  if sys.path[0] == '':


test:
calculate cosine similarity between word vector and title
calculate cosine similarity between word vector and description
calculate cosine similarity between word vector and brand
calculate cosine similarity between word vector and material


In [72]:
# drop unneccessary or unused attributes
X_train = df_train.drop(['id', 'product_uid', 'product_title', 'search_term', 'relevance', 'product_description'\
                      , 'brand', 'material'\
#                       , 'common_title', 'common_desc', 'common_brand', 'common_material'\
                         # keeping these attributes
#                       , 'dist_in_title', 'dist_in_desc', 'dist_in_brand', 'dist_in_material'\
#                       , 'tfidf_cos_sim_in_title' , 'tfidf_cos_sim_in_desc', 'tfidf_cos_sim_in_brand' , 'tfidf_cos_sim_in_material' \
#                         , 'w2v_cos_sim_in_title'\
#                          , 'w2v_cos_sim_in_desc'\
#                          , 'w2v_cos_sim_in_brand'\
                         , 'w2v_cos_sim_in_material'\
                        ], axis=1).values
y_train = df_train['relevance'].values
X_test = df_test.drop(['id', 'product_uid', 'product_title', 'search_term', 'product_description'\
                      , 'brand', 'material'\
#                       , 'common_title', 'common_desc', 'common_brand', 'common_material'\
                       # keeping these attributes
#                       , 'dist_in_title', 'dist_in_desc', 'dist_in_brand', 'dist_in_material'\
#                       , 'tfidf_cos_sim_in_title' , 'tfidf_cos_sim_in_desc', 'tfidf_cos_sim_in_brand' , 'tfidf_cos_sim_in_material' \
#                       , 'w2v_cos_sim_in_title'\
#                        , 'w2v_cos_sim_in_desc'\
#                        , 'w2v_cos_sim_in_brand'\
                       , 'w2v_cos_sim_in_material'\
                      ], axis=1).values
test_ids = df_test['id']

### RandomForest model

In [80]:
# param_grid = {'n_estimators': [30, 50, 80, 100, 120, 140, 160], 'max_depth': [8, 10, 12, 16, 18, 20]}
# grid_search = GridSearchCV(RandomForestRegressor(n_estimators=120, max_depth=10), param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)
rf = RandomForestRegressor(n_estimators=120, max_depth=12)
rf.fit(X_train, y_train)
y_predicted = rf.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = rf.predict(X_test)
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/RF_outputs.csv',index=False)

0.350353570413594


### LinearRegression model

In [55]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_predicted = lr.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = lr.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/LR_outputs.csv',index=False)

0.4985282255998273


### XGBoost

In [56]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_predicted = xgb.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = xgb.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/XGB_outputs.csv',index=False)

  if diff:


0.5810266001031343


  if diff:


### SVR

In [76]:
svr = SVR(C=1.0, epsilon=0.2)
svr.fit(X_train, y_train)
y_predicted = svr.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = lr.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/SVR_outputs.csv',index=False)

0.48724452390953804


ValueError: shapes (166693,15) and (11,) not aligned: 15 (dim 1) != 11 (dim 0)

## Method without Attribute

In [1]:
# import necessary library
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk.tokenize import word_tokenize
import Levenshtein

# gensim
from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from gensim.models.word2vec import Word2Vec

# sklearn
from scipy import spatial
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import RidgeCV
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

# XGB Boost
from xgboost import XGBRegressor
import xgboost as xgb

In [7]:
# Step 1: read data

# Step 1.1: read the data
df_train = pd.read_csv('../data/train.csv', encoding = "ISO-8859-1")
df_test = pd.read_csv('../data/test.csv', encoding = "ISO-8859-1")
df_desc = pd.read_csv('../data/product_descriptions.csv', encoding = "ISO-8859-1")

# Step 1.2: Add product description
df_train = pd.merge(df_train, df_desc, how='left', on='product_uid')
df_test = pd.merge(df_test, df_desc, how='left', on='product_uid')

In [9]:
df_train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...


In [10]:
df_test.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket,"Not only do angles make joints stronger, they ..."
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets,"Not only do angles make joints stronger, they ..."
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able,"Not only do angles make joints stronger, they ..."
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties,"Not only do angles make joints stronger, they ..."
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668,"Not only do angles make joints stronger, they ..."


In [11]:
# Step 2: Pre-processing
# Step 2.1: import snowball stemmer
stemmer = SnowballStemmer('english')

In [12]:
def snowball_stemmer(str):
    '''
    Stem all description with the following step: lower() -> split() -> stemmer.stem -> join
    :param s: input string
    :return: stemmed string
    '''
    return " ".join([stemmer.stem(word) for word in str.lower().split()])

In [13]:
def common_word(str1, str2):
    '''
    Count common words in two strings
    :param str1: string
    :param str2: string
    :return: number of common words
    '''
    return sum(int(str2.find(word) >= 0) for word in str1.split())

In [14]:
# Step 2.2: Stem all data
df_train['search_term'] = df_train['search_term'].map(lambda x: snowball_stemmer(x))
df_train['product_title'] = df_train['product_title'].map(lambda x: snowball_stemmer(x))
df_train['product_description'] = df_train['product_description'].map(lambda x: snowball_stemmer(x))
df_test['search_term'] = df_test['search_term'].map(lambda x: snowball_stemmer(x))
df_test['product_title'] = df_test['product_title'].map(lambda x: snowball_stemmer(x))
df_test['product_description'] = df_test['product_description'].map(lambda x: snowball_stemmer(x))

In [15]:
# Step 3: Calculate text feature
# Step 3.1: Calculate Levenshtein distance
# Levenshtein distance between search terms and product title
df_train['dist_in_title'] = df_train.apply(lambda x:Levenshtein.ratio(x['search_term'],x['product_title']), axis=1)
# Levenshtein distance between search term and product description
df_train['dist_in_desc'] = df_train.apply(lambda x:Levenshtein.ratio(x['search_term'],x['product_description']), axis=1)
# Combine product title and product description as all texts
df_train['all_texts'] = df_train['product_title'] + ' . ' + df_train['product_description'] + ' . '

df_test['dist_in_title'] = df_test.apply(lambda x:Levenshtein.ratio(x['search_term'],x['product_title']), axis=1)
# Levenshtein distance between search term and product description
df_test['dist_in_desc'] = df_test.apply(lambda x:Levenshtein.ratio(x['search_term'],x['product_description']), axis=1)
# Combine product title and product description as all texts
df_test['all_texts'] = df_test['product_title'] + ' . ' + df_test['product_description'] + ' . '

In [16]:
df_train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,dist_in_title,dist_in_desc,all_texts
0,2,100001,simpson strong-ti 12-gaug angl,angl bracket,3.0,"not onli do angl make joint stronger, they als...",0.190476,0.030418,simpson strong-ti 12-gaug angl . not onli do a...
1,3,100001,simpson strong-ti 12-gaug angl,l bracket,2.5,"not onli do angl make joint stronger, they als...",0.153846,0.022901,simpson strong-ti 12-gaug angl . not onli do a...
2,9,100002,behr premium textur deckov 1-gal. #sc-141 tugb...,deck over,3.0,behr premium textur deckov is an innov solid c...,0.175,0.017875,behr premium textur deckov 1-gal. #sc-141 tugb...
3,16,100005,delta vero 1-handl shower onli faucet trim kit...,rain shower head,2.33,updat your bathroom with the delta vero single...,0.326087,0.048632,delta vero 1-handl shower onli faucet trim kit...
4,17,100005,delta vero 1-handl shower onli faucet trim kit...,shower onli faucet,2.67,updat your bathroom with the delta vero single...,0.382979,0.054545,delta vero 1-handl shower onli faucet trim kit...


In [17]:
df_test.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description,dist_in_title,dist_in_desc,all_texts
0,1,100001,simpson strong-ti 12-gaug angl,90 degre bracket,"not onli do angl make joint stronger, they als...",0.173913,0.035309,simpson strong-ti 12-gaug angl . not onli do a...
1,4,100001,simpson strong-ti 12-gaug angl,metal l bracket,"not onli do angl make joint stronger, they als...",0.222222,0.037879,simpson strong-ti 12-gaug angl . not onli do a...
2,5,100001,simpson strong-ti 12-gaug angl,simpson sku abl,"not onli do angl make joint stronger, they als...",0.577778,0.037879,simpson strong-ti 12-gaug angl . not onli do a...
3,6,100001,simpson strong-ti 12-gaug angl,simpson strong tie,"not onli do angl make joint stronger, they als...",0.666667,0.045283,simpson strong-ti 12-gaug angl . not onli do a...
4,7,100001,simpson strong-ti 12-gaug angl,simpson strong tie hcc668,"not onli do angl make joint stronger, they als...",0.618182,0.054863,simpson strong-ti 12-gaug angl . not onli do a...


In [18]:
# Generate a dictionary of all text words
# dictionary = Dictionary(list(tokenize(x, errors='ignore')) for x in df_all['all_texts'].values)
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True, sort=False)
dictionary = Dictionary(list(tokenize(x, errors='ignore')) for x in df_all['all_texts'].values)

In [19]:
class ProductCorpus(object):
    '''
    Convert dictionary to be bag of words representation.
    '''
    def __iter__(self):
        for x in df_all['all_texts'].values:
            yield dictionary.doc2bow(list(tokenize(x, errors='ignore')))

In [20]:
# new an Corpus instance
corpus = ProductCorpus()
# Calculate TF-IDF on the bag of words vectors
tfidf = TfidfModel(corpus)

In [21]:
def to_tfidf(text):
    '''
    calculate TF-IDF on bag of words vector
    :param text: input text
    :return:
    '''
    res = tfidf[dictionary.doc2bow(list(tokenize(text, errors='ignore')))]
    return res

In [22]:
def cos_sim(text1, text2):
    '''
    Calculate cosine similarity between two texts
    :param text1: input string
    :param text2: input string
    :return: cosine similarity
    '''
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1],num_features=len(dictionary))
    sim = index[tfidf2]
    return float(sim[0])

In [23]:
# Calculate similarity between search terms and product title
df_train['tfidf_cos_sim_in_title'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
df_test['tfidf_cos_sim_in_title'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
# Calculate similarity between search terms and product description
df_train['tfidf_cos_sim_in_desc'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)
df_test['tfidf_cos_sim_in_desc'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)

In [24]:
# Load nltk tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Convert all texts into a list of sentences, and then convert to be a list of words
sentences = []

In [25]:
for x in df_train['all_texts'].values:
    sentences.append(tokenizer.tokenize(x))

for x in df_test['all_texts'].values:
    sentences.append(tokenizer.tokenize(x))

In [26]:
sentences = [y for x in sentences for y in x]
# Apply tokenizer
w2v_corpus = [word_tokenize(x) for x in sentences]
# Train model
model = Word2Vec(w2v_corpus, size=128, window=5, min_count=5, workers=4)

In [27]:
def get_vector(text):
    '''
    Get the vector representation of input text.
    :param text: input string
    :return:
    '''
    res = np.zeros([128])
    count = 0
    for word in word_tokenize(text):
        res += model[word]
        count+=1
    return res/count

In [28]:
def w2v_cos_sim(text1, text2):
    '''
    Calculate cosine similarity between two word vectors.
    :param text1: input string
    :param text2: input string
    :return: cosine similarity
    '''
    try:
        w2v1 = get_vector(text1)
        w2v2 = get_vector(text2)
        sim = 1 - spatial.distance.cosine(w2v1, w2v2)
        return float(sim)
    except:
        return float(0)

In [29]:
# calculate cosine similarity between word vector and title
df_train['w2v_cos_sim_in_title'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
df_test['w2v_cos_sim_in_title'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
# # calculate cosine similarity between word vector and description
df_train['w2v_cos_sim_in_desc'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)
df_test['w2v_cos_sim_in_desc'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)
# # drop unnecessary columns
df_train = df_train.drop(['search_term','product_title','product_description','all_texts'],axis=1)
df_test = df_test.drop(['search_term','product_title','product_description','all_texts'],axis=1)

  # Remove the CWD from sys.path while we load stuff.


In [30]:
df_train.head()

Unnamed: 0,id,product_uid,relevance,dist_in_title,dist_in_desc,tfidf_cos_sim_in_title,tfidf_cos_sim_in_desc,w2v_cos_sim_in_title,w2v_cos_sim_in_desc
0,2,100001,3.0,0.190476,0.030418,0.274539,0.182836,0.478532,0.406032
1,3,100001,2.5,0.153846,0.022901,0.0,0.0,0.329999,0.101513
2,9,100002,3.0,0.175,0.017875,0.0,0.053455,0.0,0.43853
3,16,100005,2.33,0.326087,0.048632,0.133577,0.043712,0.545233,0.0
4,17,100005,2.67,0.382979,0.054545,0.39732,0.098485,0.729712,0.0


In [31]:
df_test.head()

Unnamed: 0,id,product_uid,dist_in_title,dist_in_desc,tfidf_cos_sim_in_title,tfidf_cos_sim_in_desc,w2v_cos_sim_in_title,w2v_cos_sim_in_desc
0,1,100001,0.173913,0.035309,0.0,0.0,0.332701,0.196472
1,4,100001,0.222222,0.037879,0.0,0.0,0.312972,0.341399
2,5,100001,0.577778,0.037879,0.318767,0.070763,0.570918,0.096475
3,6,100001,0.666667,0.045283,0.520207,0.145561,0.637237,0.358681
4,7,100001,0.618182,0.054863,0.520207,0.145561,0.0,0.0


In [33]:
# ======== pre-processing done ============

In [32]:
# Save the current data and load the training dataset and testing dataset
df_train.to_pickle('df_train.pkl')
df_test.to_pickle('df_test.pkl')
# df_train = pd.read_pickle('df_train.pkl')
# df_test = pd.read_pickle('df_test.pkl')

In [34]:
# Split another training dataset and drop unnecessary columns
y_train = df_train['relevance'].values
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id'], axis = 1).values

In [35]:
# Record testing id
test_ids = df_test['id']

### Random Forest

In [None]:
# use models from sklearn: RandomForest
params = [1,3,5,6,7,8,9,10]
rf_scores = []
for param in params:
    clf = RandomForestRegressor(n_estimators=30, max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    rf_scores.append(np.mean(test_score))

In [None]:
rf_scores

In [38]:
rf = RandomForestRegressor(n_estimators=500, max_depth=5, min_samples_leaf=6, max_features=0.9, min_samples_split=1.0, n_jobs=-1, random_state=2018)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/RF_outputs.csv',index=False)

### Ada Boost

In [39]:
ada_scores = []
clf = AdaBoostRegressor(base_estimator=None, n_estimators=300, learning_rate=0.03, loss='linear', random_state=20180525)
ada_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
ada_scores.append(np.mean(ada_score))

In [40]:
ada_scores

[0.49709580093490413]

In [41]:
ab = AdaBoostRegressor(base_estimator=None, n_estimators=300, learning_rate=0.03, loss='linear', random_state=20180525)
ab.fit(X_train, y_train)
y_pred = ab.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/AB_outputs.csv',index=False)

In [42]:
xgb_params0={'colsample_bytree': 1, 'silent': 1, 'nthread': 8, 'min_child_weight': 10,
    'n_estimators': 300, 'subsample': 1, 'learning_rate': 0.09, 'objective': 'reg:linear',
    'seed': 10, 'max_depth': 7, 'gamma': 0.}
xgb_params1={'colsample_bytree': 0.77, 'silent': 1, 'nthread': 8, 'min_child_weight': 15,
    'n_estimators': 500, 'subsample': 0.77, 'learning_rate': 0.035, 'objective': 'reg:linear',
    'seed': 11, 'max_depth': 6, 'gamma': 0.2}

### Bagging Regression

In [43]:
# Bagging Regression

bagging_scores = []
clf = BaggingRegressor(base_estimator=xgb.XGBRegressor(**xgb_params1), n_estimators=10, random_state=np.random.RandomState(2018))
bagging_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
bagging_scores.append(np.mean(bagging_score))

KeyboardInterrupt: 

In [None]:
bagging_scores

In [None]:
# bag = BaggingRegressor(n_estimators=300, max_samples=1.0, max_features=1.0)
bag = BaggingRegressor(base_estimator=xgb.XGBRegressor(**xgb_params1), n_estimators=10, random_state=np.random.RandomState(2018))
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/Bag_outputs.csv',index=False)

### Extra Trees Regression

In [None]:
# Extra Trees Regression
extra_scores = []
for param in params:
    clf = ExtraTreesRegressor(n_estimators=300, max_depth=param)
    extra_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    extra_scores.append(np.mean(extra_score))

In [None]:
extra_scores

In [None]:
et = ExtraTreesRegressor(n_estimators=300, max_depth=16)
et.fit(X_train, y_train)
y_pred = et.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/ET_outputs.csv',index=False)

### Gradient Boosting Regression

In [None]:
# Gradient Boosting Regression
gradient_scores = []
for param in params:
    clf = GradientBoostingRegressor(n_estimators=500, max_depth=param, min_samples_split=2, min_samples_leaf=15, learning_rate=0.035, loss='ls',random_state=10)
    gradient_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    gradient_scores.append(np.mean(gradient_score))

In [None]:
gradient_scores

In [None]:
# gb = GradientBoostingRegressor(n_estimators=128, max_depth=16)
gb = GradientBoostingRegressor(n_estimators=500, max_depth=6, min_samples_split=2, min_samples_leaf=15, learning_rate=0.035, loss='ls',random_state=10)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/GB_outputs.csv',index=False)

### Linear Regression

In [None]:
# Linear regression
linear_scores = []
clf = LinearRegression()
linear_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
linear_scores.append(np.mean(linear_score))

In [None]:
linear_scores

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/LR_outputs.csv',index=False)

### Support Vector Machine

In [None]:
# SVM
svm_scores = []
clf = svm.SVR(kernel='linear')
svm_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
svm_scores.append(np.mean(svm_score))

In [None]:
svm_scores

In [None]:
svm = svm.SVR(kernel='linear')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/SVM_outputs.csv',index=False)

### Ridge Regression

In [None]:
# Ridge
ridge = RidgeCV(cv=10)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/Ridge_outputs.csv',index=False)

### Neural Network

In [None]:
# MLP
# mlp = MLPRegressor(solver='lbfgs', alpha=1e-5)
mlp = MLPRegressor()
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/MLP_outputs.csv',index=False)

### XGB Boost

In [None]:
# XGB Boost
xgb = XGBRegressor(**xgb_params1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/XGB_outputs.csv',index=False)

### KNN

In [None]:
# KNN
knn = KNeighborsRegressor(128,  weights="uniform", leaf_size=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/KNN_outputs.csv',index=False)

### Decision Tree

In [None]:
# Decision Tree
dec = DecisionTreeRegressor(criterion='mse', splitter='random', max_depth=4, min_samples_split=7, min_samples_leaf=30, min_weight_fraction_leaf=0.0, max_features='sqrt', random_state=None, max_leaf_nodes=None, presort=False)
dec.fit(X_train, y_train)
y_pred = dec.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
    if y_pred[i] < 1:
        y_pred[i] = 1
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('outputs/DecisionTree_outputs.csv',index=False)