In [59]:
import pandas as pd
import py7zr
import RAKE

In [60]:
# Extract tsv files
with py7zr.SevenZipFile('data/train.tsv.7z', mode='r') as z:
    z.extract(path="data")
with py7zr.SevenZipFile('data/test.tsv.7z', mode='r') as z:
    z.extract(path="data")

In [61]:
data_df = pd.read_csv("data/train.tsv", sep='\t')

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_attribs = []
cat_attribs = ['name', 'category_name', 'brand_name', 'item_description']

num_pipeline = Pipeline([
    ('label', OrdinalEncoder(handle_unknown='ignore')),
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    #('num', num_pipeline, num_attribs),   #returns a dense matrix
    ('cat', OneHotEncoder(), cat_attribs) #returns a sparse matrix
])

In [63]:
data_df = data_df.drop('shipping',axis=1)
data_df = data_df.dropna()
data_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,item_description
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,Adorable top with a hint of lace and a key hol...
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,You get three pairs of Sophie cheer shorts siz...
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,Girls Size small Plus green. Three shorts total.


In [64]:
from rake_nltk import Rake

r = Rake()

def rake_implement(x):
    string = ""
    r.extract_keywords_from_text(x)
    # temp is list of keywords from the above function
    temp = r.get_ranked_phrases()

    # concats the list items to a string
    for i in temp:
        string = string + i + " "
    return string

for column in cat_attribs:
    data_df[column] = data_df[column].apply(lambda x: rake_implement(x))
data_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,item_description
1,1,razer blackwidow chroma keyboard,3,tablets parts electronics computers components,razer,52.0,razer synapse app works like work perfectly gr...
2,2,viv blouse ava,1,women tops blouses blouse,target,10.0,pale pink key hole adorable top 3x available w...
6,6,acacia pacific tides santorini top,3,women two swimwear piece,acacia swimwear,64.0,straps slightly shortened size small perfect c...
7,7,tumbling bundle girls cheer 7,3,sports outdoors girls apparel,soffe,6.0,boy shorts spandex matching sets sophie cheer ...
8,8,girls nike pro shorts,3,sports outdoors girls apparel,nike,19.0,girls size small plus green three shorts total


In [86]:
X_train = data_df[:100000].copy()
y_train = X_train['price'].copy()
X_train = X_train.drop('price', axis=1)

X_test = data_df[50000:52000].copy()
y_test = X_test['price'].copy()
X_test = X_test.drop('price', axis=1)

In [87]:
full_pipeline = full_pipeline.fit(X_train)
X_train_prep = full_pipeline.transform(X_train)
X_test_prep = full_pipeline.transform(X_test)

In [88]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_prep, y_train)

KNeighborsRegressor(n_neighbors=10)

In [89]:
predictions = knn.predict(X_test_prep)

In [90]:
X_test['actual_price'] = y_test
X_test['new_price'] = predictions
X_test

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,item_description,actual_price,new_price
87825,87825,shirt size xs pink victoria secret,1,women tops shirts blouses,pink,shirt size xs pink victoria secret shirt size ...,27.0,20.6
87826,87826,gathering seize magic day,3,toys kids games,wizards coast,gathering seize free shipping odyssey magic day,4.0,9.2
87828,87828,nwt micheal kors wallet gansevoort,1,women women wallets accessories,michael kors,pearl grey fits fold msrp brand new bill witho...,70.0,38.8
87829,87829,large puma tank,2,women tops tank cami blouses,puma,like new size large grey 65 polyester 35 cotton,10.0,15.7
87830,87830,time vintage toys plush land,3,action figures toys statues kids,gund,new unopened table cloth large gund little foo...,34.0,37.6
...,...,...,...,...,...,...,...,...
91326,91326,air jordan 7 raptors,3,shoes men athletic,jordan,box condition come authentic 8 10,50.0,67.7
91332,91332,hello kitty pajama bottoms,3,women pants,hello kitty,soft fleece like feel hello kitty pajama botto...,5.0,22.2
91335,91335,bnwt br seamless show socks 10,1,athletic apparel women socks,banana republic,show mini socks set tags banana republic one s...,16.0,15.6
91336,91336,original apple iphone charger,1,cell phones electronics cradles chargers acces...,apple,authentic apple iphone cable adapter iphones 5...,10.0,14.9
