In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
from category_encoders import TargetEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import Word2Vec
import re

In [None]:
# Load the data: only the first 200,000 rows of the training file, and the whole test file.
df_train = pd.read_csv("./train.csv", sep=",", nrows=200_000)
df_test = pd.read_csv("./test.csv", sep=",")

In [None]:
def split_cat(text):
    """Split a "/"-delimited category path into exactly three levels.

    Parameters
    ----------
    text : str or any
        Category path such as "Men/Tops/T-shirts". Any non-string value
        (e.g. NaN or None for a missing category) is treated as missing.

    Returns
    -------
    tuple of str
        (main_category, subcategory_1, subcategory_2). Levels beyond the
        third are dropped; absent levels are filled with "No Category", so
        the result always unpacks cleanly into three columns.
    """
    missing = "No Category"
    # Explicit type check instead of a bare except, so unrelated errors are not hidden.
    if not isinstance(text, str):
        return (missing, missing, missing)
    levels = text.split("/")[:3]
    # Pad short paths so every row yields the same number of levels.
    levels.extend([missing] * (3 - len(levels)))
    return tuple(levels)

In [None]:
# Break `category_name` into its three levels for both splits (train first, then test).
for frame in (df_train, df_test):
    frame['main_category'], frame['subcategory_1'], frame['subcategory_2'] = zip(
        *frame['category_name'].apply(split_cat)
    )

## Part 1: Feature Construction

### 1.1 Category Feature

In [None]:
## updated_brand_name
# Target-encode the brand column against price; the encoder is fitted on the training frame only.
te = TargetEncoder(cols=["updated_brand_name"])
te.fit(df_train, df_train["price"])
df_train["brand_name_te"] = te.transform(df_train)["updated_brand_name"]
df_test["brand_name_te"] = te.transform(df_test)["updated_brand_name"]

In [None]:
## shipping
# Target-encode the shipping flag against price; the encoder is fitted on the training frame only.
te = TargetEncoder(cols=["shipping"])
te.fit(df_train, df_train["price"])
df_train["shipping_te"] = te.transform(df_train)["shipping"]
df_test["shipping_te"] = te.transform(df_test)["shipping"]

In [None]:
## item_condition_id
# Target-encode the item condition against price; the encoder is fitted on the training frame only.
te = TargetEncoder(cols=["item_condition_id"])
te.fit(df_train, df_train["price"])
df_train["item_condition_id_te"] = te.transform(df_train)["item_condition_id"]
df_test["item_condition_id_te"] = te.transform(df_test)["item_condition_id"]

In [None]:
## update 3-level categories
# Target-encode each category level against price, one level at a time.
# For every level: fit on train, encode train, then encode test, writing to "<level>_te".
for level in ("main_category", "subcategory_1", "subcategory_2"):
    te = TargetEncoder(cols=[level])
    te.fit(df_train, df_train["price"])
    df_train[level + "_te"] = te.transform(df_train)[level]
    df_test[level + "_te"] = te.transform(df_test)[level]

In [None]:
# Columns exported for the categorical feature set: the six target-encoded
# features followed by the price column.
category_features_list = [
    'brand_name_te',
    'shipping_te',
    'item_condition_id_te',
    'main_category_te',
    'subcategory_1_te',
    'subcategory_2_te',
    'price',
]

In [None]:
# Keep only the columns named in `category_features_list` for each split.
df_train_category = df_train.loc[:, category_features_list]
df_test_category = df_test.loc[:, category_features_list]

In [None]:
# Write the categorical feature frames to disk (the index is written as the first column).
for frame, out_path in (
    (df_train_category, './df_train_category.csv'),
    (df_test_category, './df_test_category.csv'),
):
    frame.to_csv(out_path)

In [None]:
# Display the encoded training features (notebook preview only; nothing is modified).
df_train_category

Unnamed: 0,brand_name_te,shipping_te,item_condition_id_te,main_category_te,subcategory_1_te,subcategory_2_te,price
0,23.219157,30.275798,26.744256,19.702845,18.632705,18.039436,24.0
1,18.967176,22.541553,26.390406,35.731533,30.307661,13.593852,11.0
2,16.101190,30.275798,26.744256,29.054339,18.081113,15.475879,9.0
3,37.570218,30.275798,26.390406,24.486381,28.327181,29.075931,31.0
4,26.807762,30.275798,27.768790,19.702845,20.709276,27.989540,18.0
...,...,...,...,...,...,...,...
199995,20.844595,22.541553,27.768790,29.054339,33.909643,35.294710,15.0
199996,18.967176,30.275798,26.390406,25.077433,29.838863,21.262522,29.0
199997,18.909014,30.275798,26.744256,29.054339,28.625748,34.641769,16.0
199998,18.967176,22.541553,27.768790,35.731533,30.307661,13.593852,25.0


### 1.2 Text Feature

#### 1.2.1 TF-IDF

In [None]:
# Baseline TF-IDF over the item descriptions with default vectorizer settings.
# Every description is coerced to text with np.str_ before vectorizing.
# The vocabulary is learned from the training descriptions and reused for test.
vector = TfidfVectorizer()
train_tfidf = vector.fit_transform(df_train['item_description'].apply(np.str_))
test_tfidf = vector.transform(df_test['item_description'].apply(np.str_))

In [None]:
# Preview the raw description text that is fed to the vectorizer.
df_train['item_description']

0         These are new however the tip is cut off the l...
1         Black Waterproof Waist Punch Cover For Apple i...
2         American eagle black mesh and lace blouse. Has...
3                                        No description yet
4         Used only a few times. Great for creating beac...
                                ...                        
199995    New black rain coat by Columbia. Doesn't have ...
199996    Brand new condition BUT no ice packs or meal c...
199997    Size: XL Black women's UA Storm pants NWOT con...
199998    Light up case, iphone 7. Great for pictures or...
199999                                   No description yet
Name: item_description, Length: 200000, dtype: object

In [None]:
# Check the dimensions of the transformed test matrix.
test_tfidf.shape

(148254, 60523)

In [None]:
# Remove stop words and cap the vocabulary at 400 features (max_features).
stop_words = set(stopwords.words('english'))
# Pass the stop words as a list: recent scikit-learn releases validate this
# parameter and reject a set, while a list works on older releases too.
# Sorting keeps the list order stable between runs.
tfidf = TfidfVectorizer(stop_words=sorted(stop_words), max_features=400)
# The vocabulary is learned from the training descriptions and reused for test.
train_tfidf = tfidf.fit_transform(df_train['item_description'].apply(np.str_))
test_tfidf = tfidf.transform(df_test['item_description'].apply(np.str_))

In [None]:
# Number of terms in the fitted vectorizer's vocabulary.
len(tfidf.vocabulary_)

400

In [None]:
# Check the dimensions of the training TF-IDF matrix.
train_tfidf.shape

(200000, 400)

In [None]:
# Disabled experiment (not executed): univariate feature selection on the TF-IDF matrix.
# NOTE(review): k=1000 is larger than the 400 TF-IDF features built above — adjust before re-enabling.
# from sklearn.feature_selection import SelectKBest,f_regression
# select_model = SelectKBest(f_regression, k=1000)
# train_tfidf_1000 = select_model.fit_transform(train_tfidf, df_train['price'])
# test_tfidf_1000 = select_model.transform(test_tfidf)

In [None]:
# Positional column labels for the TF-IDF features: tfidf_0 ... tfidf_399.
column_name = ["tfidf_" + str(idx) for idx in range(400)]

In [None]:
# Densify the sparse TF-IDF matrices and wrap them in labelled DataFrames.
df_tfidf_train = pd.DataFrame(data=train_tfidf.toarray(), columns=column_name)
df_tfidf_test = pd.DataFrame(data=test_tfidf.toarray(), columns=column_name)

In [None]:
# Write the TF-IDF feature frames to disk (the index is written as the first column).
df_tfidf_train.to_csv(path_or_buf='./df_train_tfidf.csv')
df_tfidf_test.to_csv(path_or_buf='./df_test_tfidf.csv')

In [None]:
# Display the test TF-IDF features (notebook preview only; nothing is modified).
df_tfidf_test

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_390,tfidf_391,tfidf_392,tfidf_393,tfidf_394,tfidf_395,tfidf_396,tfidf_397,tfidf_398,tfidf_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.713724,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
199996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
199997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.385913,0.0,0.0,0.000000,0.0,0.0
199998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0


#### 1.2.2 Word2Vec

In [None]:
# Build the Word2Vec training corpus: one list of tokens per "."-delimited
# sentence of every training description.
# (`stopwords` is already imported in the notebook's import cell.)
stopwords_set = set(stopwords.words('english'))

sentences = []
for single_des in df_train["item_description"].values:
    # str() guards against non-string entries such as missing values.
    for s in str(single_des).split("."):
        if len(s) > 2:
            sentence = []
            for word in s.split(" "):
                # Normalise first, so the length and stop-word filters see the
                # same token that is stored (e.g. "The" is filtered like "the").
                token = word.strip().lower()
                if len(token) > 1 and token not in stopwords_set:
                    sentence.append(token)
            # Skip sentences that ended up with no usable tokens.
            if sentence:
                sentences.append(sentence)


In [None]:
# Embedding width; also used below when averaging word vectors per description.
VECTOR_SIZE = 150
# Train Word2Vec on the tokenised sentences built above.
model = Word2Vec(
    sentences=sentences,
    vector_size=VECTOR_SIZE,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
# Sanity check on the embeddings: list the 10 entries most similar to "computer".
model.wv.most_similar('computer', topn=10)

[('wifi', 0.8667345643043518),
 ('phone,', 0.8560323119163513),
 ('device,', 0.8514837622642517),
 ('connect', 0.8485516905784607),
 ('camera,', 0.8468049764633179),
 ('wi-fi', 0.8423641324043274),
 ('connects', 0.8376858234405518),
 ('port', 0.8368887901306152),
 ('webcam', 0.8363166451454163),
 ('battery,', 0.8337953090667725)]

In [None]:
# Represent each training description as the mean of its in-vocabulary word vectors.
words_vob = list(model.wv.index_to_key)
# Set membership is constant-time; testing against the list scanned the whole
# vocabulary for every token.
vocab_lookup = set(words_vob)
w2v_vector_train = np.zeros((df_train.shape[0], VECTOR_SIZE))

# Iterate positionally so row i of the matrix always matches row i of the frame,
# whatever the frame's index labels are.
for i, description in enumerate(df_train['item_description'].values):
    single_vector = np.zeros(VECTOR_SIZE)
    cnt = 0
    for word in str(description).split(" "):
        word = word.strip().lower()
        if word in vocab_lookup:
            single_vector += model.wv[word]
            cnt += 1
    # Descriptions with no known word keep the all-zero row.
    if cnt > 0:
        w2v_vector_train[i] = single_vector / cnt

In [None]:
# Represent each test description as the mean of its in-vocabulary word vectors.
w2v_vector_test = np.zeros((df_test.shape[0], VECTOR_SIZE))
# Set membership is constant-time; testing against the list scanned the whole
# vocabulary for every token.
vocab_lookup = set(words_vob)

# Read the descriptions from df_test: the earlier version indexed df_train here,
# so the "test" features were built from training rows.
for i, description in enumerate(df_test['item_description'].values):
    single_vector = np.zeros(VECTOR_SIZE)
    cnt = 0
    for word in str(description).split(" "):
        word = word.strip().lower()
        if word in vocab_lookup:
            single_vector += model.wv[word]
            cnt += 1
    # Descriptions with no known word keep the all-zero row.
    if cnt > 0:
        w2v_vector_test[i] = single_vector / cnt

In [None]:
# Positional column labels for the Word2Vec features: w2v_0 ... w2v_<VECTOR_SIZE - 1>.
column_name = ["w2v_" + str(idx) for idx in range(VECTOR_SIZE)]

In [None]:
# Wrap the averaged Word2Vec matrices in labelled DataFrames.
df_w2v_train = pd.DataFrame(data=w2v_vector_train, columns=column_name)
df_w2v_test = pd.DataFrame(data=w2v_vector_test, columns=column_name)

In [None]:
# Write the Word2Vec feature frames to disk (the index is written as the first column).
df_w2v_train.to_csv(path_or_buf='./df_train_w2v.csv')
df_w2v_test.to_csv(path_or_buf='./df_test_w2v.csv')

#### 1.2.3 BERT