In [75]:
# import necessary library
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
import os
import Levenshtein
from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.word2vec import Word2Vec
from scipy import spatial
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.svm import SVR

In [2]:
# Step 1: read data

# Step 1.1: read the stemmed data
df_train = pd.read_csv('stemmed_data/train.csv', encoding = "ISO-8859-1")
df_test = pd.read_csv('stemmed_data/test.csv', encoding = "ISO-8859-1")
df_desc = pd.read_csv('stemmed_data/product_descriptions.csv', encoding = "ISO-8859-1")
df_desc = df_desc[['product_uid', 'product_description']]

df_attr_material = pd.read_csv('stemmed_data/attr_material.csv', encoding = "ISO-8859-1").dropna(how='any')
df_attr_brand = pd.read_csv('stemmed_data/attr_brand.csv', encoding = "ISO-8859-1").dropna(how='any')
df_attr_bullets = pd.read_csv('stemmed_data/attr_bullets.csv', encoding = "ISO-8859-1").dropna(how='any')

In [3]:
# add bullets to description
df_desc = pd.merge(df_desc, df_attr_bullets, how='left', on='product_uid')
df_desc['product_description'] = df_desc['product_description'].map(lambda x: x + ' ') + df_desc['bullets']
df_desc = df_desc.drop(['bullets'], axis=1)

In [4]:
# # add brand and material
df_desc = pd.merge(df_desc, df_attr_brand, how='left', on='product_uid')
df_desc = pd.merge(df_desc, df_attr_material, how='left', on='product_uid')
print(df_desc.head(1))

   product_uid                                product_description  \
0       100001  not onli do angl make joint stronger, they als...   

               brand      material  
0  simpson strong-ti  galvan steel  


In [6]:
df_train = pd.merge(df_train, df_desc, how='left', on='product_uid')
df_train['material'] = df_train['material'].fillna(' ')
print(df_train.head(0))
df_test = pd.merge(df_test, df_desc, how='left', on='product_uid')
df_test['material'] = df_test['material'].fillna(' ')
print(df_test.head(0))

Empty DataFrame
Columns: [id, product_uid, product_title, search_term, product_description, brand, material]
Index: []


In [19]:
df_train.to_csv('midterm_data/df_train.csv', index=False)
df_test.to_csv('midterm_data/df_test.csv', index=False)

In [6]:
df_train = pd.read_csv('midterm_data/df_train.csv')
df_train['product_description'] = df_train.apply(lambda x: x['product_description'], axis=1)
df_train['brand'] = df_train.apply(lambda x: x['brand'], axis=1)
df_train['material'] = df_train.apply(lambda x: x['material'], axis=1)
df_test = pd.read_csv('midterm_data/df_test.csv')
df_test['product_description'] = df_test.apply(lambda x: x['product_description'], axis=1)
df_test['brand'] = df_test.apply(lambda x: x['brand'], axis=1)
df_test['material'] = df_test.apply(lambda x: x['material'], axis=1)

In [28]:
print(df_train.head(1))
print(df_test.head(1))

   id  product_uid                   product_title   search_term  relevance  \
0   2       100001  simpson strong-ti 12-gaug angl  angl bracket        3.0   

                                 product_description              brand  \
0  not onli do angl make joint stronger, they als...  simpson strong-ti   

       material  dist_in_title  dist_in_desc           ...             \
0  galvan steel       0.190476      0.021602           ...              

   common_brand  common_material  tfidf_cos_sim_in_title  \
0             0                0                0.274629   

   tfidf_cos_sim_in_desc  tfidf_cos_sim_in_brand  tfidf_cos_sim_in_material  \
0               0.175537                     0.0                        0.0   

   w2v_cos_sim_in_title  w2v_cos_sim_in_desc  w2v_cos_sim_in_brand  \
0              0.483795             0.430817              0.150791   

   w2v_cos_sim_in_material  
0                 0.215702  

[1 rows x 24 columns]
   id  product_uid                   prod

In [11]:
def str_common_word(str1, str2):
    '''
    Count common words in two strings
    :param str1: string
    :param str2: string
    :return: number of common words
    '''
    return sum([int(str2.find(word) >= 0) for word in str1.split()])

In [12]:
# use common words number in search term
# train
df_train['common_title'] = \
    df_train.apply(lambda x: str_common_word(x['product_title'], x['search_term']), axis=1)
df_train['common_desc'] = \
    df_train.apply(lambda x: str_common_word(x['product_description'], x['search_term']), axis=1)
df_train['common_brand'] = \
    df_train.apply(lambda x: str_common_word(x['brand'], x['search_term']), axis=1)
df_train['common_material'] = \
    df_train.apply(lambda x: str_common_word(x['material'], x['search_term']), axis=1)

# test
df_test['common_title'] = \
    df_test.apply(lambda x: str_common_word(x['product_title'], x['search_term']), axis=1)
df_test['common_desc'] = \
    df_test.apply(lambda x: str_common_word(x['product_description'], x['search_term']), axis=1)
df_test['common_brand'] = \
    df_test.apply(lambda x: str_common_word(x['brand'], x['search_term']), axis=1)
df_test['common_material'] = \
    df_test.apply(lambda x: str_common_word(x['material'], x['search_term']), axis=1)

In [9]:
# use levenshtein distance
# train
df_train['dist_in_title'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_title']), axis=1)
df_train['dist_in_desc'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_description']), axis=1)
df_train['dist_in_brand'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['brand']), axis=1)
df_train['dist_in_material'] = df_train.apply(lambda x: Levenshtein.ratio(x['search_term'],x['material']), axis=1)

# test
df_test['dist_in_title'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_title']), axis=1)
df_test['dist_in_desc'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['product_description']), axis=1)
df_test['dist_in_brand'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['brand']), axis=1)
df_test['dist_in_material'] = df_test.apply(lambda x: Levenshtein.ratio(x['search_term'],x['material']), axis=1)

In [13]:
print(df_train.head(1))
print(df_test.head(1))

   id  product_uid                   product_title   search_term  relevance  \
0   2       100001  simpson strong-ti 12-gaug angl  angl bracket        3.0   

                                 product_description              brand  \
0  not onli do angl make joint stronger, they als...  simpson strong-ti   

       material  dist_in_title  dist_in_desc  dist_in_brand  dist_in_material  \
0  galvan steel       0.190476      0.021602       0.275862          0.333333   

   common_title  common_desc  common_brand  common_material  
0             1            7             0                0  
   id  product_uid                   product_title       search_term  \
0   1       100001  simpson strong-ti 12-gaug angl  90 degre bracket   

                                 product_description              brand  \
0  not onli do angl make joint stronger, they als...  simpson strong-ti   

       material  dist_in_title  dist_in_desc  dist_in_brand  dist_in_material  \
0  galvan steel       0.17

In [72]:
X_train = df_train.drop(['id', 'product_uid', 'product_title', 'search_term', 'relevance', 'product_description'\
                      , 'brand', 'material'\
#                       , 'common_title', 'common_desc', 'common_brand', 'common_material'\
                         # keeping these attributes
#                       , 'dist_in_title', 'dist_in_desc', 'dist_in_brand', 'dist_in_material'\
#                       , 'tfidf_cos_sim_in_title' , 'tfidf_cos_sim_in_desc', 'tfidf_cos_sim_in_brand' , 'tfidf_cos_sim_in_material' \
#                         , 'w2v_cos_sim_in_title'\
#                          , 'w2v_cos_sim_in_desc'\
#                          , 'w2v_cos_sim_in_brand'\
                         , 'w2v_cos_sim_in_material'\
                        ], axis=1).values
y_train = df_train['relevance'].values
X_test = df_test.drop(['id', 'product_uid', 'product_title', 'search_term', 'product_description'\
                      , 'brand', 'material'\
#                       , 'common_title', 'common_desc', 'common_brand', 'common_material'\
                       # keeping these attributes
#                       , 'dist_in_title', 'dist_in_desc', 'dist_in_brand', 'dist_in_material'\
#                       , 'tfidf_cos_sim_in_title' , 'tfidf_cos_sim_in_desc', 'tfidf_cos_sim_in_brand' , 'tfidf_cos_sim_in_material' \
#                       , 'w2v_cos_sim_in_title'\
#                        , 'w2v_cos_sim_in_desc'\
#                        , 'w2v_cos_sim_in_brand'\
                       , 'w2v_cos_sim_in_material'\
                      ], axis=1).values
test_ids = df_test['id']

### RandomForest model

In [60]:
# use models from sklearn: RandomForest <-- change to other model from here
params = [1,3,5,6,7,8,9,10,11,12,13,15]
test_scores = []
for param in params:
    print('error for param ' + str(param) + ' is: ', end='')
    clf = RandomForestRegressor(n_estimators=30, max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
    print(test_score)
    test_scores.append(np.mean(test_score))

error for param 1 is: [0.51482473 0.50731861 0.5033553  0.50902013 0.53395302]
error for param 3 is: [0.5051083  0.49403116 0.49000244 0.49816786 0.5259801 ]
error for param 5 is: [0.49889898 0.4876352  0.48354359 0.49171781 0.51801123]
error for param 6 is: [0.49728693 0.48575806 0.48091492 0.48886799 0.51457923]
error for param 7 is: [0.49474197 0.48428994 0.47928767 0.48676178 0.51205788]
error for param 8 is: [0.49315497 0.48305003 0.47755446 0.48494203 0.50990736]
error for param 9 is: [0.49214502 0.48204016 0.47721106 0.48424345 0.50909867]
error for param 10 is: [0.49087897 0.48115483 0.47596032 0.4833043  0.5074252 ]
error for param 11 is: [0.49038031 0.48041003 0.4747616  0.48147945 0.50665713]
error for param 12 is: [0.49004719 0.48046445 0.47483247 0.48164277 0.5056621 ]
error for param 13 is: [0.48967092 0.48020432 0.47497386 0.48058064 0.50482576]
error for param 15 is: [0.48978513 0.47995163 0.4753768  0.48098616 0.50445624]


In [80]:
# param_grid = {'n_estimators': [140, 160], 'max_depth': [16, 18, 20]}
# grid_search = GridSearchCV(RandomForestRegressor(n_estimators=120, max_depth=10), param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)
rf = RandomForestRegressor(n_estimators=140, max_depth=16)
rf.fit(X_train, y_train)
y_predicted = rf.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = rf.predict(X_test)
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/RF_outputs.csv',index=False)

0.350353570413594


### LinearRegression model

In [55]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_predicted = lr.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = lr.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/LR_outputs.csv',index=False)

0.4985282255998273


### SVR

In [76]:
svr = SVR(C=1.0, epsilon=0.2)
svr.fit(X_train, y_train)
y_predicted = svr.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = lr.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/SVR_outputs.csv',index=False)

0.48724452390953804


ValueError: shapes (166693,15) and (11,) not aligned: 15 (dim 1) != 11 (dim 0)

### XGBoost

In [56]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_predicted = xgb.predict(X_train)
rms = sqrt(mean_squared_error(y_train, y_predicted))
print(rms)
y_pred = xgb.predict(X_test)
for i in range(len(y_pred)):
    if y_pred[i] > 3:
        y_pred[i] = 3
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('midterm_data/XGB_outputs.csv',index=False)

  if diff:


0.5810266001031343


  if diff:


## Add similarity

In [14]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True, sort=False)
df_all['all_texts'] = df_all['product_title'] + ' . ' + df_all['product_description'] + ' . '
all_text = df_all['all_texts'].values
dictionary = Dictionary(list(tokenize(x, errors='ignore')) for x in all_text)
print('generated dictionary')

generated dictionary


In [15]:
def to_tfidf(text):
    '''
    calculate TF-IDF on bag of words vector
    :param text: input text
    :return:
    '''
    res = tfidf[dictionary.doc2bow(list(tokenize(text, errors='ignore')))]
    return res


def cos_sim(text1, text2):
    '''
    Calculate cosine similarity between two texts
    :param text1: input string
    :param text2: input string
    :return: cosine similarity
    '''
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1],num_features=len(dictionary))
    sim = index[tfidf2]
    return float(sim[0])

In [16]:
class MyCorpus(object):
    '''
    Convert dictionary to be bag of words representation.
    '''
    def __iter__(self):
        for x in all_text:
            yield dictionary.doc2bow(list(tokenize(x, errors='ignore')))

# new an Corpus instance
corpus = MyCorpus()
# Calculate TF-IDF on the bag of words vectors
tfidf = TfidfModel(corpus)

In [17]:
print('train:')
print('calculating similarity between search terms and product title')
df_train['tfidf_cos_sim_in_title'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculating similarity between search terms and product description')
df_train['tfidf_cos_sim_in_desc'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculating similarity between search terms and brand')
df_train['tfidf_cos_sim_in_brand'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['brand']), axis=1)
print('calculating similarity between search terms and material')
df_train['tfidf_cos_sim_in_material'] = df_train.apply(lambda x: cos_sim(x['search_term'], x['material']), axis=1)
df_train.to_csv('midterm_data/df_train_lev_tfidf_sim.csv', index=False)

print('test:')
print('calculating similarity between search terms and product title')
df_test['tfidf_cos_sim_in_title'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculating similarity between search terms and product description')
df_test['tfidf_cos_sim_in_desc'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculating similarity between search terms and brand')
df_test['tfidf_cos_sim_in_brand'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['brand']), axis=1)
print('calculating similarity between search terms and material')
df_test['tfidf_cos_sim_in_material'] = df_test.apply(lambda x: cos_sim(x['search_term'], x['material']), axis=1)
df_test.to_csv('midterm_data/df_test_lev_tfidf_sim.csv', index=False)

train:
calculating similarity between search terms and product title
calculating similarity between search terms and product description
calculating similarity between search terms and brand
calculating similarity between search terms and material
test:
calculating similarity between search terms and product title
calculating similarity between search terms and product description
calculating similarity between search terms and brand
calculating similarity between search terms and material


In [24]:
def get_vector(text):
    '''
    Get the vector representation of input text.
    :param text: input string
    :return:
    '''
    res = np.zeros([128])
    count = 0
    for word in word_tokenize(text):
        res += model[word]
        count+=1
    return res/count


def w2v_cos_sim(text1, text2):
    '''
    Calculate cosine similarity between two word vectors.
    :param text1: input string
    :param text2: input string
    :return: cosine similarity
    '''
    try:
        w2v1 = get_vector(text1)
        w2v2 = get_vector(text2)
        sim = 1 - spatial.distance.cosine(w2v1, w2v2)
        return float(sim)
    except:
        return float(0)

In [26]:
# Load nltk tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Convert all texts into a list of sentences, and then convert to be a list of words
print('tokenizing')
sentences = [tokenizer.tokenize(x) for x in all_text]
sentences = [y for x in sentences for y in x]
# Apply tokenizer
w2v_corpus = [word_tokenize(x) for x in sentences]
# Train model
print('word 2 vec')
model = Word2Vec(w2v_corpus, size=128, window=5, min_count=5, workers=4)

tokenizing
word 2 vec


In [27]:
print('train:')
print('calculate cosine similarity between word vector and title')
df_train['w2v_cos_sim_in_title'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculate cosine similarity between word vector and description')
df_train['w2v_cos_sim_in_desc'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculate cosine similarity between word vector and brand')
df_train['w2v_cos_sim_in_brand'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['brand']), axis=1)
print('calculate cosine similarity between word vector and material')
df_train['w2v_cos_sim_in_material'] = df_train.apply(lambda x: w2v_cos_sim(x['search_term'], x['material']), axis=1)
df_train.to_csv('midterm_data/df_train_lev_tfidf_sim_w2v_sim.csv', index=False)

print('test:')
print('calculate cosine similarity between word vector and title')
df_test['w2v_cos_sim_in_title'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
print('calculate cosine similarity between word vector and description')
df_test['w2v_cos_sim_in_desc'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)
print('calculate cosine similarity between word vector and brand')
df_test['w2v_cos_sim_in_brand'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['brand']), axis=1)
print('calculate cosine similarity between word vector and material')
df_test['w2v_cos_sim_in_material'] = df_test.apply(lambda x: w2v_cos_sim(x['search_term'], x['material']), axis=1)
df_test.to_csv('midterm_data/df_test_lev_tfidf_sim_w2v_sim.csv', index=False)

train:
calculate cosine similarity between word vector and title


  # Remove the CWD from sys.path while we load stuff.


calculate cosine similarity between word vector and description
calculate cosine similarity between word vector and brand
calculate cosine similarity between word vector and material


  if sys.path[0] == '':


test:
calculate cosine similarity between word vector and title
calculate cosine similarity between word vector and description
calculate cosine similarity between word vector and brand
calculate cosine similarity between word vector and material
