In [1]:
import pandas as pd

# Load the tab-separated training file, using the train_id column as the row index.
TRAIN_PATH = 'train.tsv'
X_full = pd.read_csv(TRAIN_PATH, delimiter='\t', index_col='train_id')
X_full.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [2]:
# Discard rows that have no price, then separate the target column from the features.
X_full = X_full.dropna(subset=['price'])
y = X_full['price']
X_full = X_full.drop(columns=['price'])

In [3]:
# Split each "a/b/c" category path on at most two slashes, giving up to three
# columns; anything after the second slash stays together in the last column.
category_paths = X_full['category_name']
categories = category_paths.str.split(pat='/', n=2, expand=True)

In [4]:
# Label the three split levels cat1, cat2, cat3 (from the left of the path to the right).
category_level_names = [f'cat{level}' for level in (1, 2, 3)]
categories.columns = category_level_names
categories.head()

Unnamed: 0_level_0,cat1,cat2,cat3
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Men,Tops,T-shirts
1,Electronics,Computers & Tablets,Components & Parts
2,Women,Tops & Blouses,Blouse
3,Home,Home Décor,Home Décor Accents
4,Women,Jewelry,Necklaces


In [5]:
# Swap the raw category path for its three split levels, appended as new columns.
features_without_path = X_full.drop(columns=['category_name'])
X_full = pd.concat([features_without_path, categories], axis=1)
X_full.head()

Unnamed: 0_level_0,name,item_condition_id,brand_name,shipping,item_description,cat1,cat2,cat3
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,,1,No description yet,Men,Tops,T-shirts
1,Razer BlackWidow Chroma Keyboard,3,Razer,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,AVA-VIV Blouse,1,Target,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,Leather Horse Statues,1,,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,24K GOLD plated rose,1,,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


In [6]:
# Number of missing values in each column.
X_full.isna().sum()

name                      0
item_condition_id         0
brand_name           632682
shipping                  0
item_description          4
cat1                   6327
cat2                   6327
cat3                   6327
dtype: int64

In [7]:
# Integer-coded columns; these are later cast to int and appended to the feature matrix as-is.
numerical_columns = ['shipping', 'item_condition_id']

# Text-valued columns: the free-text fields followed by the three category levels.
categorical_columns = ['name', 'brand_name', 'item_description'] + [
    f'cat{level}' for level in (1, 2, 3)
]

In [8]:
from sklearn.impute import SimpleImputer

# Replace every missing cell with the literal string 'Missing'.
imputer = SimpleImputer(strategy='constant', fill_value='Missing')

# The transformed values are wrapped back into a DataFrame that carries BOTH the
# original column labels and the original train_id index. Without the index the
# frame would get a fresh 0..n-1 RangeIndex and stop being label-aligned with
# `y` as soon as any row had been dropped upstream.
imputed_X_full = pd.DataFrame(
    imputer.fit_transform(X_full),
    columns=X_full.columns,
    index=X_full.index,
)

# Sanity check: no column should report missing values after imputation.
imputed_X_full.isnull().sum()

name                 0
item_condition_id    0
brand_name           0
shipping             0
item_description     0
cat1                 0
cat2                 0
cat3                 0
dtype: int64

In [9]:
# Distinct non-null values per column of the un-imputed frame.
X_full.nunique(axis=0, dropna=True)

name                 1225273
item_condition_id          5
brand_name              4809
shipping                   2
item_description     1281426
cat1                      10
cat2                     113
cat3                     871
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# 90/10 hold-out split; the fixed random_state keeps the partition repeatable.
split_parts = train_test_split(
    imputed_X_full,
    y,
    train_size=0.9,
    test_size=0.1,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = split_parts

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

def _fit_binary_bow(train_values, valid_values):
    """Fit a case-sensitive, binary CountVectorizer on the training values.

    Returns a tuple ``(vectorizer, train_matrix, valid_matrix)`` where the
    validation matrix is produced with the vocabulary learned from training.
    """
    encoder = CountVectorizer(lowercase=False, binary=True)
    train_matrix = encoder.fit_transform(train_values)
    valid_matrix = encoder.transform(valid_values)
    return encoder, train_matrix, valid_matrix


# NOTE(review): CountVectorizer tokenizes each value, so a multi-word entry can
# switch on several columns; despite the "_OH" suffix these are binary
# bag-of-words encodings rather than strict one-hot vectors.
vectorizer1, X_train_brand_OH, X_valid_brand_OH = _fit_binary_bow(
    X_train['brand_name'].values, X_valid['brand_name'].values
)

vectorizer2, X_train_cat1_OH, X_valid_cat1_OH = _fit_binary_bow(
    X_train['cat1'].values, X_valid['cat1'].values
)

vectorizer3, X_train_cat2_OH, X_valid_cat2_OH = _fit_binary_bow(
    X_train['cat2'].values, X_valid['cat2'].values
)

vectorizer4, X_train_cat3_OH, X_valid_cat3_OH = _fit_binary_bow(
    X_train['cat3'].values, X_valid['cat3'].values
)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

def _tfidf_train_valid(vectorizer, column):
    """Fit `vectorizer` on the training rows of `column`.

    Returns ``(train_matrix, valid_matrix)``; the validation rows are only
    transformed, never used for fitting.
    """
    train_matrix = vectorizer.fit_transform(X_train[column].values)
    valid_matrix = vectorizer.transform(X_valid[column].values)
    return train_matrix, valid_matrix


# Listing title: 1- to 3-grams seen in at least 3 documents, capped at 250k features.
t_vectorizer1 = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=3,
    max_features=250000,
)
X_train_name_tfidf, X_valid_name_tfidf = _tfidf_train_valid(t_vectorizer1, 'name')

# Description: 1- to 3-grams seen in at least 5 documents, capped at 500k features.
t_vectorizer2 = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=5,
    max_features=500000,
)
X_train_description_tfidf, X_valid_description_tfidf = _tfidf_train_valid(
    t_vectorizer2, 'item_description'
)

In [25]:
from scipy.sparse import csr_matrix

# Cast the numeric columns to int and wrap them as sparse matrices so they can
# be stacked next to the other sparse feature blocks.
X_train_cat, X_valid_cat = (
    csr_matrix(split_frame[numerical_columns].astype(int))
    for split_frame in (X_train, X_valid)
)

In [26]:
from scipy.sparse import hstack

# Both splits must list their feature blocks in the same order so that the
# column positions of the stacked matrices line up.
train_blocks = (
    X_train_brand_OH,
    X_train_cat1_OH,
    X_train_cat2_OH,
    X_train_cat3_OH,
    X_train_name_tfidf,
    X_train_description_tfidf,
    X_train_cat,
)
valid_blocks = (
    X_valid_brand_OH,
    X_valid_cat1_OH,
    X_valid_cat2_OH,
    X_valid_cat3_OH,
    X_valid_name_tfidf,
    X_valid_description_tfidf,
    X_valid_cat,
)

# Stack side by side and convert to CSR.
X_train_final = hstack(train_blocks).tocsr()
X_valid_final = hstack(valid_blocks).tocsr()

print(X_valid_final.shape)

(148254, 756218)


In [27]:
import numpy as np
from sklearn.metrics import mean_squared_error
import xgboost as xgb
# XGBoost baseline: 100 depth-3 trees at learning rate 0.1.
xgb_model = xgb.XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=100)
xgb_model.fit(X_train_final, y_train)
xgb_pred = xgb_model.predict(X_valid_final)

# Root-mean-squared error on the validation split.
xgb_mse = mean_squared_error(y_valid, xgb_pred)
print(np.sqrt(xgb_mse))

30.440960614828562


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# scikit-learn gradient boosting with the same 100 / depth-3 / 0.1 settings.
gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, n_estimators=100)
gb.fit(X_train_final, y_train)
gb_pred = gb.predict(X_valid_final)

# Root-mean-squared error on the validation split.
gb_mse = mean_squared_error(y_valid, gb_pred)
print(np.sqrt(gb_mse))

In [None]:
from catboost import CatBoostRegressor
# CatBoost with the same 100 / depth-3 / 0.1 settings.
cat_model = CatBoostRegressor(learning_rate=0.1, max_depth=3, n_estimators=100)
cat_model.fit(X_train_final, y_train)
cat_pred = cat_model.predict(X_valid_final)

# Root-mean-squared error on the validation split.
cat_mse = mean_squared_error(y_valid, cat_pred)
print(np.sqrt(cat_mse))

In [None]:
import lightgbm as lgb
# LightGBM with the same 100 / depth-3 / 0.1 settings.
lgb_model = lgb.LGBMRegressor(learning_rate=0.1, max_depth=3, n_estimators=100)
lgb_model.fit(X_train_final, y_train)
lgb_pred = lgb_model.predict(X_valid_final)

# Root-mean-squared error on the validation split.
lgb_mse = mean_squared_error(y_valid, lgb_pred)
print(np.sqrt(lgb_mse))

In [28]:
import pickle

# Bundle the fitted encoders with the model, in the order they were created,
# so the whole tuple can be restored together from a single file.
model_artifacts = (
    vectorizer1,
    vectorizer2,
    vectorizer3,
    vectorizer4,
    t_vectorizer1,
    t_vectorizer2,
    xgb_model,
)

with open('../models/xgb_model.pkl', 'wb') as fout:
    pickle.dump(model_artifacts, fout)

In [15]:
# Shape of every validation feature block, in stacking order.
# NOTE(review): the saved output of this cell lists 7 columns for X_valid_cat,
# while the stacked validation matrix printed earlier has 756218 columns, which
# only adds up with 2 - the saved output looks stale; re-run to refresh it.
valid_feature_blocks = (
    X_valid_brand_OH,
    X_valid_cat1_OH,
    X_valid_cat2_OH,
    X_valid_cat3_OH,
    X_valid_name_tfidf,
    X_valid_description_tfidf,
    X_valid_cat,
)
print(*(block.shape for block in valid_feature_blocks))

(148254, 5083) (148254, 13) (148254, 143) (148254, 977) (148254, 250000) (148254, 500000) (148254, 7)
