In [1]:
import pandas as pd
import py7zr

In [2]:
# Extract tsv files
with py7zr.SevenZipFile('data/train.tsv.7z', mode='r') as z:
    z.extract(path="data")
with py7zr.SevenZipFile('data/test.tsv.7z', mode='r') as z:
    z.extract(path="data")

In [3]:
data_df = pd.read_csv("data/train.tsv", sep='\t')

In [4]:
data_df = data_df.drop('shipping',axis=1)
data_df = data_df.dropna()
# Only using the instances reuired for test/train
data_df = data_df[:60000]
data_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,item_description
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,Adorable top with a hint of lace and a key hol...
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,You get three pairs of Sophie cheer shorts siz...
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,Girls Size small Plus green. Three shorts total.


In [5]:
# Identifying each attribute
num_attribs = ['item_condition_id']
cat_attribs = ['name', 'category_name', 'brand_name', 'item_description']

In [6]:
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer

# Rakes categories that exist in cat_attribs, outputs a String
def rake_implement(x):
    r = Rake()
    string = ""
    r.extract_keywords_from_text(x)
    # temp is list of keywords from the above function
    temp = r.get_ranked_phrases()

    # concats the list items to a string
    for i in temp:
        string = string + i + " "
    return string

# Uses bag of words to obtain all keywords, then merges both dataframes
def bag_of_words(df, cat):
    bow = CountVectorizer(strip_accents='ascii')
    for column in cat:
        # Applying rake algorithm to string
        df[column] = df[column].apply(lambda x: rake_implement(x))
        # Fitting and transforming the raked string
        bow.fit(df[column])
        X = bow.transform(df[column])
        # Drop the old column (since df will be merged)
        df = df.drop([column], axis=1)

    X_df = pd.DataFrame(X.toarray(), columns=sorted(bow.vocabulary_))
    return pd.merge(df, X_df, left_index=True, right_index=True)

In [7]:
data_df = bag_of_words(data_df, cat_attribs)
data_df

Unnamed: 0,train_id,item_condition_id,price_x,00,000,000278,001,0018,002,003,...,zumba,zumies,zumiez,zur,zurich,zutano,zx,zx110nc,zxcvbn,zzz
1,1,3,52.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,10.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,3,64.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7,3,6.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8,3,19.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59991,59991,2,17.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59992,59992,2,21.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59995,59995,1,16.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,59997,1,30.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

num_pipeline = Pipeline([
    ('label', OrdinalEncoder()),
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),   #returns a dense matrix
    #('str', OneHotEncoder(), cat_attribs) #returns a sparse matrix
])

In [9]:
X_train = data_df[:50000].copy()
y_train = X_train['price_x'].copy()
X_train = X_train.drop('price_x', axis=1)

In [10]:
X_test = data_df[52000:56000].copy()
y_test = X_test['price_x'].copy()
X_test = X_test.drop('price_x', axis=1)

In [11]:
X_test

Unnamed: 0,train_id,item_condition_id,00,000,000278,001,0018,002,003,004,...,zumba,zumies,zumiez,zur,zurich,zutano,zx,zx110nc,zxcvbn,zzz


In [12]:
full_pipeline = full_pipeline.fit(X_train)
X_train_prep = full_pipeline.transform(X_train)
X_test_prep = full_pipeline.transform(X_test)
X_train_prep

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_prep, y_train)

KNeighborsRegressor(n_neighbors=10)

In [None]:
predictions = knn.predict(X_test_prep)

In [None]:
X_test['actual_price'] = y_test
X_test['new_price'] = predictions
X_test.head(50)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,item_description,actual_price,new_price
262892,262892,flower dress cute zip,2,women mini knee dresses,material girl,short dress zips also fit waist size size stre...,11.0,26.9
262895,262895,plants vs zombies xbox one,2,video games games electronics consoles,xbox,excellent condition brand new pretty much like...,10.0,38.7
262896,262896,sparkly ankle cut heels,3,women shoes pumps,charlotte russe,mac victoria secret adidas nike bape guess guc...,14.0,62.5
262900,262900,body shop white musk smoky rose trio,1,women fragrance beauty,body shop,brand new eau de toilette body wash body lotio...,16.0,47.4
262901,262901,men jean size 40 32,3,straight leg men jeans classic,claiborne,new home looking closet cleaning,5.0,44.0
262902,262902,white maxi skirt,3,women skirts maxi,victoria secret,secret website victoria purchase,14.0,44.8
262904,262904,elf duo lip primer plumper,1,makeup lips beauty,l f e,☆ always authentic ☆ creating long lasting lip...,11.0,65.0
262905,262905,puma sneakers size 6 women,3,women shoes athletic,puma,tazon 5 running shoes white style teal accents...,35.0,63.1
262907,262907,reserved bundle holly,1,athletic apparel women,lululemon,lulu swiftly blu sportbra,51.0,39.4
262908,262908,platform royal blue heels lace,2,women shoes pumps,bebe,please feel free electric royal blue size 5 pl...,36.0,49.9
