## Load necessary libraries

In [1]:
import os
import gc
import re
import multiprocessing as mp
from time import time

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
from scipy.sparse import csr_matrix

from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

## Define configuration (input paths)

In [2]:
# Directory that holds the train.tsv and test.tsv input files
input_dir = './input'

In [3]:
# Load the train and test splits (tab-separated files) with identical
# parser settings; the two low-cardinality columns are read as categoricals.
train, test = (
    pd.read_table(
        os.path.join(input_dir, file_name),
        engine='c',
        dtype={'item_condition_id': 'category', 'shipping': 'category'},
    )
    for file_name in ('train.tsv', 'test.tsv')
)

In [5]:
# Summary statistics for every column, numeric and non-numeric alike
train.describe(include='all')

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
count,1482535.0,1482535,1482535.0,1476208,849853,1482535.0,1482535.0,1482531
unique,,1225273,5.0,1287,4809,,2.0,1281426
top,,Bundle,1.0,"Women/Athletic Apparel/Pants, Tights, Leggings",PINK,,0.0,No description yet
freq,,2232,640549.0,60177,54088,,819435.0,82489
mean,741267.0,,,,,26.73752,,
std,427971.1,,,,,38.58607,,
min,0.0,,,,,0.0,,
25%,370633.5,,,,,10.0,,
50%,741267.0,,,,,17.0,,
75%,1111900.0,,,,,29.0,,


## Preprocessing data

In [13]:
def split_category(test):
    """Split a '/'-separated category path into its first three levels.

    Parameters
    ----------
    test : str or missing value
        Category path such as 'men/tops/t-shirts'. The parameter name is
        kept for backward compatibility; it holds the category text.

    Returns
    -------
    tuple of str
        ``(level1, level2, level3, 'level1/level2')``. Levels beyond the
        third are ignored. If the input is not a string (e.g. NaN or None)
        or has fewer than three levels, the placeholder
        ``('other', 'other', 'other', 'other/other')`` is returned.
    """
    fallback = ('other', 'other', 'other', 'other/other')
    # Explicit checks replace the former bare `except:`, which also caught
    # KeyboardInterrupt/SystemExit and could hide unrelated errors.
    if not isinstance(test, str):
        return fallback
    cats = test.split('/')
    if len(cats) < 3:
        return fallback
    return cats[0], cats[1], cats[2], cats[0] + '/' + cats[1]

In [10]:
def preprocess_data(train : pd.DataFrame, test : pd.DataFrame) -> pd.DataFrame:
    """Filter the train rows, stack train and test, and clean ``category_name``.

    Parameters
    ----------
    train : pd.DataFrame
        Training frame; must contain ``price`` and ``category_name`` columns.
    test : pd.DataFrame
        Test frame; must contain a ``category_name`` column.

    Returns
    -------
    pd.DataFrame
        Train rows with ``price > 0`` followed by all test rows, with:
        * ``has_category`` - categorical flag, True where ``category_name``
          was originally present;
        * ``category_name`` - lower-cased string, missing values replaced
          by ``'other/other/other'``.

    Notes
    -----
    The input frames are not modified. A stray ``merge['']`` lookup that
    raised ``KeyError`` on every call has been removed, together with two
    locals (the train row count and the log1p price target) that were
    computed but never used or returned.
    """
    # Drop rows whose price is not strictly positive and renumber the index.
    train = train[train.price > 0].reset_index(drop=True)

    # Stack train on top of test so both get the same feature cleaning.
    merge = pd.concat([train, test])

    # Record whether a category was given before the gaps are filled in.
    merge['has_category'] = merge['category_name'].notnull().astype('category')
    merge['category_name'] = (
        merge['category_name']
        .fillna('other/other/other')
        .str.lower()
        .astype(str)
    )
    return merge

In [11]:
# Clean both frames and stack them into a single DataFrame
merge = preprocess_data(train, test)

In [12]:
# Preview the first rows of the combined frame
merge.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,test_id,has_category
0,0.0,MLB Cincinnati Reds T Shirt Size XL,3,men/tops/t-shirts,,10.0,1,No description yet,,True
1,1.0,Razer BlackWidow Chroma Keyboard,3,electronics/computers & tablets/components & p...,Razer,52.0,0,This keyboard is in great condition and works ...,,True
2,2.0,AVA-VIV Blouse,1,women/tops & blouses/blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,,True
3,3.0,Leather Horse Statues,1,home/home décor/home décor accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,,True
4,4.0,24K GOLD plated rose,1,women/jewelry/necklaces,,44.0,0,Complete with certificate of authenticity,,True
