In [10]:
import pandas as pd
import py7zr

In [11]:
# Extract tsv files
with py7zr.SevenZipFile('data/train.tsv.7z', mode='r') as z:
    z.extract(path="data")
with py7zr.SevenZipFile('data/test.tsv.7z', mode='r') as z:
    z.extract(path="data")

In [12]:
data_df = pd.read_csv("data/train.tsv", sep='\t')

In [13]:
data_df = data_df.drop('shipping',axis=1)
data_df = data_df.dropna()
# Only using the instances reuired for test/train
data_df = data_df[:2000]
data_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,item_description
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,Adorable top with a hint of lace and a key hol...
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,You get three pairs of Sophie cheer shorts siz...
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,Girls Size small Plus green. Three shorts total.


In [14]:
# Combining descriptive data to one column
data_df['description'] = data_df[['name', 'brand_name', 'item_description']].agg(' '.join, axis=1)
data_df = data_df.drop(['name', 'brand_name', 'item_description', 'train_id'], axis=1)
data_df

Unnamed: 0,item_condition_id,category_name,price,description
1,3,Electronics/Computers & Tablets/Components & P...,52.0,Razer BlackWidow Chroma Keyboard Razer This ke...
2,1,Women/Tops & Blouses/Blouse,10.0,AVA-VIV Blouse Target Adorable top with a hint...
6,3,Women/Swimwear/Two-Piece,64.0,Acacia pacific tides santorini top Acacia Swim...
7,3,Sports & Outdoors/Apparel/Girls,6.0,Girls cheer and tumbling bundle of 7 Soffe You...
8,3,Sports & Outdoors/Apparel/Girls,19.0,Girls Nike Pro shorts Nike Girls Size small Pl...
...,...,...,...,...
3474,2,Women/Underwear/Bras,9.0,Victoria Secret Bra Victoria's Secret Push-up ...
3478,3,Men/Tops/Button-Front,53.0,Two Burberry Shirts One Money Burberry Both ge...
3481,3,Women/Sweaters/Crewneck,31.0,PINK varsity crew PINK I've gotten so many com...
3482,2,"Women/Athletic Apparel/Pants, Tights, Leggings",10.0,Lot of 2 Aeropostale Work out crops Aeropostal...


In [15]:
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Rakes categories that exist in cat_attribs, outputs a String
def rake_implement(x):
    r = Rake()
    string = ""
    r.extract_keywords_from_text(x)
    # temp is list of keywords from the above function
    temp = r.get_ranked_phrases()

    # concats the list items to a string
    for i in temp:
        string = string + i + " "
    return string

# Uses bag of words to obtain all keywords, then merges both dataframes
def bag_of_words(df, cat, feat=8000):
    tfidf = TfidfVectorizer(strip_accents='ascii', max_features=feat, max_df=0.95, min_df=1)
    # Applying rake algorithm to string
    df[cat] = df[cat].apply(lambda x: rake_implement(x))
    # Fitting and transforming the raked string
    tfidf.fit(df[cat])
    X = tfidf.transform(df[cat])
    # Drop the old column (since df will be merged)
    df = df.drop([cat], axis=1)

    X_df = pd.DataFrame(X.toarray(), columns=sorted(tfidf.vocabulary_))
    return pd.merge(df, X_df, left_index=True, right_index=True)

In [16]:
data_df = bag_of_words(data_df, 'description')
data_df = bag_of_words(data_df, 'category_name')
data_df

Unnamed: 0,item_condition_id,price_x,00,00g,00gauge,00gb,01,015,020,034oz,...,vest_y,video_y,vintage_y,wallets,watches_y,windbreaker_y,women_y,wrap_y,wraps_y,zip_y
1,3,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.227785,0.0,0.0,0.0
2,1,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.177848,0.0,0.0,0.0
6,3,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.280270,0.0,0.0,0.0
7,3,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.200598,0.0,0.0,0.0
8,3,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142,1,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.311442,0.0,0.0,0.0
1145,3,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.209605,0.0,0.0,0.0
1146,1,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1148,1,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [17]:
label = data_df['price_x'].copy()
features = data_df.drop('price_x', axis=1)
features

Unnamed: 0,item_condition_id,00,00g,00gauge,00gb,01,015,020,034oz,05,...,vest_y,video_y,vintage_y,wallets,watches_y,windbreaker_y,women_y,wrap_y,wraps_y,zip_y
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.227785,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.177848,0.0,0.0,0.0
6,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.280270,0.0,0.0,0.0
7,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.200598,0.0,0.0,0.0
8,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.311442,0.0,0.0,0.0
1145,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.209605,0.0,0.0,0.0
1146,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1148,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Identifying each attribute for pipeline use
num_attribs = ['item_condition_id']
cat_attribs = []

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),   #returns a dense matrix
    ('cat', OrdinalEncoder(), cat_attribs) #returns a sparse matrix
])

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.15, random_state=42)

In [20]:
full_pipeline = full_pipeline.fit(X_train)
X_train_prep = full_pipeline.transform(X_train)
X_test_prep = full_pipeline.transform(X_test)

In [21]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_prep, y_train)

KNeighborsRegressor(n_neighbors=10)

In [22]:
predictions = knn.predict(X_test_prep)

In [23]:
X_test['actual_price'] = y_test
X_test['new_price'] = predictions
X_test.tail(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['actual_price'] = y_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['new_price'] = predictions


Unnamed: 0,item_condition_id,00,00g,00gauge,00gb,01,015,020,034oz,05,...,vintage_y,wallets,watches_y,windbreaker_y,women_y,wrap_y,wraps_y,zip_y,actual_price,new_price
347,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,23.3
126,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,22.8
967,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,24.2
131,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,24.2
1091,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.189409,0.0,0.0,0.0,17.0,24.2
513,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,23.3
384,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,24.2
778,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.247937,0.0,0.0,0.0,14.0,24.2
302,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131.0,22.8
267,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.273197,0.0,0.0,0.0,23.0,22.8


In [24]:
# Testing to whether predictions are good(True) or bad(False)
X_test['valid'] = X_test.apply(lambda x: True if abs(x['pred price'] -
                     x['actual price']) < 5 else False, axis=1)

KeyError: 'pred price'

In [None]:
X_test['valid'].value_counts()