In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from get_sample import get_sample
from get_tfidf_df import apply_normalize
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
from sklearn.ensemble import VotingRegressor

In [48]:
# Load the train/test feature frames and targets from the project helper.
# NOTE(review): cutoff=20000 presumably caps the number of sampled rows and
# test_size=0.33 is presumably the held-out fraction (the progress bars further
# down report 13400 and 6600 rows) — confirm against get_sample.
# NOTE(review): no random seed is visible here; check whether get_sample fixes
# one so the split is reproducible.
X_train, X_test, y_train, y_test = get_sample(cutoff=20000, test_size=0.33)

In [49]:
# One-hot encode the categorical columns so the regressor gets numeric features.
# Categories that were not seen during fit are encoded as all-zero rows
# (handle_unknown='ignore') instead of raising at predict time.
category_cols = [
    'item_condition_id',
    'category_name',
    'brand_name',
]

# Only the columns listed above are passed through; ColumnTransformer drops
# every other column by default.
category_transformer = ColumnTransformer(
    transformers=[
        ('preprocessing', OneHotEncoder(handle_unknown='ignore'), category_cols),
    ],
)

In [50]:
# KNN regressor over the one-hot encoded categorical features.
category_model = Pipeline(
    steps=[
        ('preprocessing', category_transformer),
        ('model', KNeighborsRegressor(n_neighbors=10)),
    ],
)

In [51]:
# https://stackoverflow.com/a/65298286/3675086
# Word-level TF-IDF with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

# The column is given as a scalar string (not a list) so the vectorizer
# receives a 1-D sequence of documents rather than a 2-D frame.
# sparse_threshold=0 forces the transformer to always return a dense array.
# NOTE(review): a dense TF-IDF matrix can be very large; confirm the dense
# output is actually required downstream.
tfidf_transformer = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf_vectorizer, 'combined_desc'),
    ],
    sparse_threshold=0,
)

In [52]:
# Text branch: apply the project's apply_normalize step, vectorise the
# 'combined_desc' column with TF-IDF, then regress with KNN.
tfidf_model = Pipeline(
    steps=[
        ('normalize', FunctionTransformer(func=apply_normalize)),
        ('tfidf', tfidf_transformer),
        ('model', KNeighborsRegressor(n_neighbors=10)),
    ],
)

In [53]:
# Ensemble that averages the predictions of the categorical and text pipelines.
# VotingRegressor fits clones of the listed estimators, so category_model and
# tfidf_model themselves stay unfitted after combined_model.fit(...).
combined_model = VotingRegressor(
    estimators=[
        ('category_model', category_model),
        ('tfidf_model', tfidf_model),
    ],
)

In [54]:
def add_combined_desc(frame):
    """Return a copy of ``frame`` with a ``combined_desc`` text column.

    ``combined_desc`` is the item ``name`` and ``item_description`` joined by a
    single space; it is the column the TF-IDF transformer reads.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain string columns ``name`` and ``item_description``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Vectorised concatenation instead of a row-wise Python join. na_rep=''
    # treats a missing name/description as empty text, whereas ' '.join raised
    # TypeError on NaN.
    combined = frame['name'].str.cat(frame['item_description'], sep=' ', na_rep='')
    # assign() builds a new frame, which avoids SettingWithCopyWarning when the
    # split frames are slices of a larger frame, and keeps this cell idempotent.
    return frame.assign(combined_desc=combined)


# Same feature construction for both splits so fit and predict see one schema.
X_train = add_combined_desc(X_train)
X_test = add_combined_desc(X_test)

In [55]:
# Fit the voting ensemble on the training split. X_train must already contain
# the 'combined_desc' column, because the TF-IDF transformer selects it by name.
# NOTE(review): the progress bar in the output presumably comes from
# apply_normalize — confirm.
combined_model.fit(X_train, y_train)

100%|██████████| 13400/13400 [02:43<00:00, 81.77it/s]


VotingRegressor(estimators=[('category_model',
                             Pipeline(steps=[('preprocessing',
                                              ColumnTransformer(transformers=[('preprocessing',
                                                                               OneHotEncoder(handle_unknown='ignore'),
                                                                               ['item_condition_id',
                                                                                'category_name',
                                                                                'brand_name'])])),
                                             ('model',
                                              KNeighborsRegressor(n_neighbors=10))])),
                            ('tfidf_model',
                             Pipeline(steps=[('normalize',
                                              FunctionTransformer(func=<function apply_normalize at 0x000001C63865BA30>)),
          

In [56]:
# Predict on the held-out split; X_test needs the same 'combined_desc' column
# that was added to the training frame.
predictions = combined_model.predict(X_test)

100%|██████████| 6600/6600 [01:32<00:00, 71.68it/s]


In [57]:
# Side-by-side preview of the last 50 test rows: item text, actual price, and
# the ensemble's predicted price.
pd.DataFrame(
    {
        "item name": X_test['name'],
        'desc': X_test['item_description'],
        "actual price": y_test,
        "pred price": predictions,
    }
).tail(50)

Unnamed: 0,item name,desc,actual price,pred price
5096,Minnetonka Boots,"very good condition, super cute",16.0,34.9
12918,Rise of the Tomb Raider Xbox One,This Rise of the Tomb Raider on Xbox One. Case...,20.0,27.6
19577,Lularoe tc leggings,Heathered lilac,17.0,36.5
1528,iPod 6th Generation,Fully functional without scratches or anything...,38.0,52.7
16756,BKE Daytrip Tank Top,No signs of wear. Interested in all offers!,14.0,11.45
6946,Ultra plus micro uhs-1 16 gb,Ultra plus micro uhs-1 16 gb Brand new still s...,10.0,61.9
965,Stila Liquid Lipstick,Stila stay all day Liquid Lipstick in the colo...,8.0,13.4
9450,Sparkly Boots,Arizona boots size 7 used some seams missing,12.0,28.45
1662,Pure Romance Island Breeze getaway,"After a long hard day, slip away to the Island...",21.0,14.7
455,Magic throwback Nick Anderson,"Champion Orlando Magic Nick Anderson Jersey, o...",25.0,18.3
