In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from get_sample import get_sample
from get_tfidf_df import get_tfidf_df
from get_tfidf_df import normalize
from get_category_df import get_category_df
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tyler\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tyler\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
X_train, X_test, y_train, y_test = get_sample(cutoff=10000, test_size=0.33)

In [3]:
tfidf =  ColumnTransformer([
    ('tfidf', TfidfVectorizer(analyzer='word',stop_words= 'english'), 'item_description')
], sparse_threshold=0)

In [4]:
ct =  ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['item_condition_id', 'category_name', 'brand_name']),
])

In [5]:
# transformer = Pipeline(
#     steps=[
#         ("categorical", ct),
#         ('normalize', FunctionTransformer(thing)),
#         ("tfidf", TfidfVectorizer(analyzer='word',stop_words= 'english', min_df=3, max_df=0.1))
#     ]
# )

In [6]:
categ = Pipeline([
    ('ct', ct),
    ('model', KNeighborsRegressor(n_neighbors=21))
])

In [7]:
categ.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['item_condition_id',
                                                   'category_name',
                                                   'brand_name'])])),
                ('model', KNeighborsRegressor(n_neighbors=21))])

In [8]:
categ.predict(X_test)

array([21.95238095, 23.85714286, 13.76190476, ..., 20.52380952,
       40.23809524, 22.        ])

In [9]:
tfidf_model = Pipeline([
    ('tfidf', tfidf),
    ('model', KNeighborsRegressor(n_neighbors=21))
])

In [10]:
tfidf_model.fit(X_train[['item_description']], y_train)

Pipeline(steps=[('tfidf',
                 ColumnTransformer(sparse_threshold=0,
                                   transformers=[('tfidf',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'item_description')])),
                ('model', KNeighborsRegressor(n_neighbors=21))])

In [11]:
tfidf_model.predict(X_test)

array([20.9047619 , 28.33333333, 17.85714286, ..., 24.38095238,
       32.33333333, 19.        ])

In [12]:
from sklearn.ensemble import VotingRegressor

In [22]:
vr = VotingRegressor(estimators=[('categ', categ), ('tfidf_model', tfidf_model)])

In [23]:
vr.fit(X_train, y_train)

VotingRegressor(estimators=[('tfidf_model',
                             Pipeline(steps=[('tfidf',
                                              ColumnTransformer(sparse_threshold=0,
                                                                transformers=[('tfidf',
                                                                               TfidfVectorizer(stop_words='english'),
                                                                               'item_description')])),
                                             ('model',
                                              KNeighborsRegressor(n_neighbors=21))]))])

In [24]:
preds = vr.predict(X_test)

In [25]:
pd.DataFrame({"item name": X_test['name'], 'desc': X_test['item_description'], "actual price": y_test, "pred price": preds}).tail(50)

Unnamed: 0,item name,desc,actual price,pred price
3001,WHITE TOMMY BAHAMA DRESS W/ POCKETS [rm],"white tommy bahama dress with pocket, stretchy...",16.0,26.047619
167,My Pure by Karen Low perfume,Never been used - I will ship with the perfume...,7.0,24.142857
6111,Adidas track pants,Size small in women's! Great condition. Soccer...,9.0,36.904762
4622,OS NWOT Lularoe Leggings,Worn once. Multi color. Smoke free home.,18.0,25.0
2865,Instyler Blu Mini Blow dryer,Never Used Travel Companion Really cute! Will ...,17.0,19.0
8544,Nike sports bra,Price for 2 Both size large Like new,34.0,19.952381
4702,NEW Lancôme Renergie Lift Multi-action,NEW Lancôme Renergie Lift Multi-action bundle....,26.0,23.904762
2883,Ray Ban Round sunglasses NEW!,"Black frames black lenses! Brand new, they jus...",64.0,27.714286
3031,SCE SDCC Thumbs Up Bb-8 Funko Pop!,I have a Star Wars Thumbs Up Bb-8 for sale for...,12.0,26.047619
8183,BNIP Chawa rainbow cake squishy,BNIP 27$ 3$ shipping,30.0,31.095238
