In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from get_sample import get_sample
from get_tfidf_df import apply_normalize
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
from sklearn.ensemble import VotingRegressor

In [None]:
# Build the train/test partitions with the project helper `get_sample`.
# NOTE(review): `cutoff` and `test_size` are forwarded as-is; their exact
# semantics live in get_sample.py — confirm there before tuning.
X_train, X_test, y_train, y_test = get_sample(
    cutoff=100,
    test_size=0.33,
)

In [None]:
# One-hot encode the categorical columns so the regressor gets numeric input.
category_cols = [
    'item_condition_id',
    'category_name',
    'brand_name',
]

# handle_unknown='ignore' keeps transform from raising on categories that
# were not present when the encoder was fitted.
category_transformer = ColumnTransformer(
    transformers=[
        ('preprocessing', OneHotEncoder(handle_unknown='ignore'), category_cols),
    ],
)

In [None]:
# Category-only model: one-hot encoded columns fed to a k-nearest-neighbours
# regressor with k = 10.
category_model = Pipeline(
    steps=[
        ('preprocessing', category_transformer),
        ('model', KNeighborsRegressor(n_neighbors=10)),
    ],
)

In [None]:
# Word-level TF-IDF over the free-text description, with English stop words
# removed. The column is named with a bare string (not a list) so the
# vectorizer is handed 1-D text; see https://stackoverflow.com/a/65298286/3675086
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
)

# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# NOTE(review): dense TF-IDF output can be memory-hungry on larger samples —
# confirm this is intentional.
tfidf_transformer = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf_vectorizer, 'item_description'),
    ],
    sparse_threshold=0,
)

In [None]:
# Text-only model: run the project's `apply_normalize` step, vectorize the
# description with TF-IDF, then regress with 10 nearest neighbours.
tfidf_model = Pipeline(
    steps=[
        ('normalize', FunctionTransformer(func=apply_normalize)),
        ('tfidf', tfidf_transformer),
        ('model', KNeighborsRegressor(n_neighbors=10)),
    ],
)

In [None]:
# Ensemble of the two pipelines: the voting regressor averages the prediction
# of the category-based model and the description-based model.
combined_model = VotingRegressor(
    estimators=[
        ('category_model', category_model),
        ('tfidf_model', tfidf_model),
    ],
)

In [None]:
# Fit the ensemble (and therefore both sub-pipelines) on the training split.
combined_model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out test split with the fitted ensemble.
predictions = combined_model.predict(X=X_test)

In [None]:
# Side-by-side view of the last 50 test rows: item name, description,
# actual value from y_test, and the ensemble's prediction.
pd.DataFrame(
    {
        "item name": X_test['name'],
        'desc': X_test['item_description'],
        "actual price": y_test,
        "pred price": predictions,
    }
).tail(50)