In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from get_sample import get_sample
from get_tfidf_df import apply_normalize
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
from sklearn.ensemble import VotingRegressor

[nltk_data] Downloading package punkt to /Users/Tyler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Tyler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load a train/test split from the project-local `get_sample` helper.
# NOTE(review): `cutoff=100` presumably caps the sample at 100 rows and
# `test_size=0.33` holds out about a third of them (the progress bars further
# down show 67 items at fit time and 33 at predict time) -- confirm against
# get_sample.py.
# TODO(review): no random seed is visible here; if get_sample shuffles, the
# split (and every number below) may change on each run.
X_train, X_test, y_train, y_test = get_sample(cutoff=100, test_size=0.33)

In [3]:
# One-hot encode the categorical listing attributes so the regressor receives
# numeric features. Columns not listed here are dropped (ColumnTransformer's
# default `remainder`). `handle_unknown='ignore'` encodes categories that were
# not seen during fit as all zeros at predict time instead of raising.
category_cols = [
    'item_condition_id',
    'category_name',
    'brand_name',
]

category_transformer = ColumnTransformer(
    transformers=[
        ('preprocessing', OneHotEncoder(handle_unknown='ignore'), category_cols),
    ],
)

In [4]:
# Categorical branch: one-hot encoded attributes fed into a
# 10-nearest-neighbour regressor.
category_model = Pipeline(
    steps=[
        ('preprocessing', category_transformer),
        ('model', KNeighborsRegressor(n_neighbors=10)),
    ],
)

In [5]:
# https://stackoverflow.com/a/65298286/3675086
# Word-level TF-IDF over the free-text description, with English stop words
# removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')

# The column is named with a bare string (not a list) so the vectorizer
# receives a 1-D sequence of documents rather than a 2-D frame.
# NOTE(review): sparse_threshold=0 forces the TF-IDF matrix to be densified.
# That is fine for this small sample but will not scale to the full dataset.
tfidf_transformer = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf_vectorizer, 'item_description'),
    ],
    sparse_threshold=0,
)

In [6]:
# Text branch: run the project-local `apply_normalize` over the input, turn
# the description into TF-IDF features, then regress with 10 nearest
# neighbours.
# NOTE(review): `apply_normalize` comes from get_tfidf_df and is opaque here;
# presumably it cleans/tokenises the description text -- confirm.
tfidf_model = Pipeline(
    steps=[
        ('normalize', FunctionTransformer(func=apply_normalize)),
        ('tfidf', tfidf_transformer),
        ('model', KNeighborsRegressor(n_neighbors=10)),
    ],
)

In [7]:
# Fit the categorical branch on its own.
# NOTE(review): scikit-learn's VotingRegressor fits clones of its estimators,
# so this standalone fit is not reused when `combined_model` is fitted later.
# It is only useful for inspecting this branch in isolation.
category_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('preprocessing',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['item_condition_id',
                                                   'category_name',
                                                   'brand_name'])])),
                ('model', KNeighborsRegressor(n_neighbors=10))])

In [8]:
# Fit the text branch on its own, passing only the description column.
# NOTE(review): the combined fit further down sends the full X_train through
# this same pipeline instead. The TF-IDF ColumnTransformer selects
# 'item_description' either way, but `apply_normalize` sees a different frame
# in each case -- confirm it does not depend on the other columns.
# As with the categorical branch, VotingRegressor refits a clone, so this fit
# is not reused by `combined_model`.
tfidf_model.fit(X_train[['item_description']], y_train)

100%|██████████| 67/67 [00:01<00:00, 41.44it/s]


Pipeline(steps=[('normalize',
                 FunctionTransformer(func=<function apply_normalize at 0x7fa106461310>)),
                ('tfidf',
                 ColumnTransformer(sparse_threshold=0,
                                   transformers=[('tfidf',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'item_description')])),
                ('model', KNeighborsRegressor(n_neighbors=10))])

In [9]:
# Ensemble of the two branches. No `weights` are given, so VotingRegressor
# averages the categorical and text predictions with equal weight.
combined_model = VotingRegressor(
    estimators=[
        ('category_model', category_model),
        ('tfidf_model', tfidf_model),
    ],
)

In [10]:
# Fit the ensemble on the full training frame. VotingRegressor clones and fits
# each branch pipeline itself; each pipeline's ColumnTransformer picks out the
# columns it needs from X_train.
combined_model.fit(X_train, y_train)

100%|██████████| 67/67 [00:00<00:00, 75.61it/s]


VotingRegressor(estimators=[('category_model',
                             Pipeline(steps=[('preprocessing',
                                              ColumnTransformer(transformers=[('preprocessing',
                                                                               OneHotEncoder(handle_unknown='ignore'),
                                                                               ['item_condition_id',
                                                                                'category_name',
                                                                                'brand_name'])])),
                                             ('model',
                                              KNeighborsRegressor(n_neighbors=10))])),
                            ('tfidf_model',
                             Pipeline(steps=[('normalize',
                                              FunctionTransformer(func=<function apply_normalize at 0x7fa106461310>)),
              

In [11]:
# Predict on the held-out rows: one value per row of X_test, averaged across
# the two branches by the VotingRegressor.
predictions = combined_model.predict(X_test)

100%|██████████| 33/33 [00:00<00:00, 62.92it/s]


In [12]:
# Side-by-side view of actual vs. predicted price for the held-out listings,
# with the item name and description for context. Shows the last 50 rows at
# most.
pd.DataFrame(
    {
        'item name': X_test['name'],
        'desc': X_test['item_description'],
        'actual price': y_test,
        'pred price': predictions,
    }
).tail(50)

Unnamed: 0,item name,desc,actual price,pred price
83,Eyebrows Essential Kit MEDIUM; Brown,Eyebrows Essential Kit Everything you need t...,6.0,21.6
53,PINK by Victoria's Secret lace bandeau,Victoria's Secret PINK white/cream colored lac...,7.0,23.75
70,Adidas Ultraboost Shoes,Overall good condition. A few signs of wear,61.0,43.35
45,Woman's north face puffer vest,"Black outside medium gray inside. Authentic, s...",51.0,52.5
44,Glass Christmas Bowl✨,Brand new! Never used smoking bowl. Just bough...,12.0,22.65
39,Victoria secret 34 c corest top,Victoria secret 34 c corest top Will bundle to...,10.0,47.0
22,Galaxy S7 Edge (Unlocked) 32GB,"Reasonable offers welcomed. But if you ask ""lo...",386.0,57.0
80,Maternity top bundle,Sheer black flowy top with cute flower design....,16.0,46.75
10,Smashbox primer,0.25 oz Full size is 1oz for [rm] in Sephora,8.0,21.3
0,MLB Cincinnati Reds T Shirt Size XL,No description yet,10.0,45.3
