In [13]:
import pandas as pd
import py7zr

In [14]:
# Extract tsv files
with py7zr.SevenZipFile('data/train.tsv.7z', mode='r') as z:
    z.extract(path="data")
with py7zr.SevenZipFile('data/test.tsv.7z', mode='r') as z:
    z.extract(path="data")

In [15]:
data_df = pd.read_csv("data/train.tsv", sep='\t')

In [16]:
data_df = data_df.drop('shipping',axis=1)
data_df = data_df.dropna()
# Only using the instances reuired for test/train
data_df = data_df[:150000]
data_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,item_description
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,Adorable top with a hint of lace and a key hol...
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,You get three pairs of Sophie cheer shorts siz...
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,Girls Size small Plus green. Three shorts total.


In [17]:
# Identifying each attribute
num_attribs = ['item_condition_id']
cat_attribs = ['category_name']

data_df['description'] = data_df[['name', 'brand_name', 'item_description']].agg('-'.join, axis=1)
data_df = data_df.drop(['name', 'brand_name', 'item_description'], axis=1)
data_df

Unnamed: 0,train_id,item_condition_id,category_name,price,description
1,1,3,Electronics/Computers & Tablets/Components & P...,52.0,Razer BlackWidow Chroma Keyboard-Razer-This ke...
2,2,1,Women/Tops & Blouses/Blouse,10.0,AVA-VIV Blouse-Target-Adorable top with a hint...
6,6,3,Women/Swimwear/Two-Piece,64.0,Acacia pacific tides santorini top-Acacia Swim...
7,7,3,Sports & Outdoors/Apparel/Girls,6.0,Girls cheer and tumbling bundle of 7-Soffe-You...
8,8,3,Sports & Outdoors/Apparel/Girls,19.0,Girls Nike Pro shorts-Nike-Girls Size small Pl...
...,...,...,...,...,...
262884,262884,2,Kids/Girls (4+)/Dresses,24.0,Polo dress iE small (7)-Ralph Lauren-Polo swea...
262886,262886,1,Women/Tops & Blouses/T-Shirts,18.0,NWT Victorias Secret Pink Long Tee-Victoria's ...
262888,262888,3,Women/Shoes/Athletic,31.0,Nike Zoom Structure 19 shoes 9.5-Nike-Like new...
262889,262889,2,"Women/Athletic Apparel/Pants, Tights, Leggings",14.0,TWO PAIR DANSKIN WORK OUT LEGGINGS-Danskin-Bot...


In [84]:
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer

# Rakes categories that exist in cat_attribs, outputs a String
def rake_implement(x):
    r = Rake()
    string = ""
    r.extract_keywords_from_text(x)
    # temp is list of keywords from the above function
    temp = r.get_ranked_phrases()

    # concats the list items to a string
    for i in temp:
        string = string + i + " "
    return string

# Uses bag of words to obtain all keywords, then merges both dataframes
def bag_of_words(df, cat):
    bow = CountVectorizer(strip_accents='ascii', max_features=8000)
    # Applying rake algorithm to string
    df[cat] = df[cat].apply(lambda x: rake_implement(x))
    # Fitting and transforming the raked string
    bow.fit(df[cat])
    X = bow.transform(df[cat])
    # Drop the old column (since df will be merged)
    df = df.drop([cat], axis=1)

    X_df = pd.DataFrame(X.toarray(), columns=sorted(bow.vocabulary_))
    return pd.merge(df, X_df, left_index=True, right_index=True)

In [85]:
data_df = bag_of_words(data_df, 'description')
data_df

MemoryError: Unable to allocate 5.10 GiB for an array with shape (85499, 8000) and data type int64

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

num_pipeline = Pipeline([
    ('label', OrdinalEncoder()),
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),   #returns a dense matrix
    ('cat', OneHotEncoder(), cat_attribs) #returns a sparse matrix
])

In [None]:
label = data_df['price_x'].copy()
features = data_df.drop('price_x', axis=1)
features

Unnamed: 0,train_id,item_condition_id,00,000,001,002,003,004,007,00am,...,zombie,zombies,zone,zoo,zoom,zumba,zumies,zumiez,zurich,zx
1,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149994,149994,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
149996,149996,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
149997,149997,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
149998,149998,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.15, random_state=42)

In [None]:
full_pipeline = full_pipeline.fit(X_train)
X_train_prep = full_pipeline.transform(X_train)
X_test_prep = full_pipeline.transform(X_test)
X_train_prep

array([[-0.04021152],
       [-0.04021152],
       [-1.14883524],
       ...,
       [-1.14883524],
       [ 1.0684122 ],
       [-1.14883524]])

In [None]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_prep, y_train)

KNeighborsRegressor(n_neighbors=10)

In [None]:
predictions = knn.predict(X_test_prep)

In [None]:
X_test['actual_price'] = y_test
X_test['new_price'] = predictions
X_test.head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['actual_price'] = y_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['new_price'] = predictions


Unnamed: 0,train_id,item_condition_id,00,000,001,002,003,004,007,00am,...,zone,zoo,zoom,zumba,zumies,zumiez,zurich,zx,actual_price,new_price
31927,31927,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35.0,36.7
25591,25591,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19.0,41.0
47704,47704,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19.0,28.9
149867,149867,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,20.0,28.9
109597,109597,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,22.0,28.9
37949,37949,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,25.0,36.7
59305,59305,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9.0,28.9
10127,10127,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.0,41.0
7705,7705,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9.0,41.0
75422,75422,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,10.0,36.7


In [None]:
X_test

Unnamed: 0,train_id,item_condition_id,00,000,001,002,003,004,007,00am,...,zone,zoo,zoom,zumba,zumies,zumiez,zurich,zx,actual_price,new_price
31927,31927,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35.0,49.619048
25591,25591,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19.0,43.761905
47704,47704,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19.0,23.619048
149867,149867,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,20.0,23.619048
109597,109597,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,22.0,23.619048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122141,122141,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,38.0,23.619048
140759,140759,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.0,43.761905
22016,22016,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.0,49.619048
66224,66224,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9.0,49.619048
