In [1]:
import sys
print(sys.executable)

import pandas as pd
import numpy as np
import tpot as tp

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline 

import math
from keras.preprocessing.text import Tokenizer

from subprocess import check_output

/home/faisal/anaconda3/envs/gpu/bin/python3


Using TensorFlow backend.


In [2]:
train = pd.read_csv('../data/train.tsv', sep='\t')
test = pd.read_csv('../data/test.tsv', sep='\t')

train = train.set_index('train_id')

train.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
train.columns

Index(['name', 'item_condition_id', 'category_name', 'brand_name', 'price',
       'shipping', 'item_description'],
      dtype='object')

In [4]:
train.dtypes

name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object

In [5]:
category_split = train['category_name'].str.split('/', expand=True, n=2)

In [6]:
category_split[0].value_counts()

Women                     664385
Beauty                    207828
Kids                      171689
Electronics               122690
Men                        93680
Home                       67871
Vintage & Collectibles     46530
Other                      45351
Handmade                   30842
Sports & Outdoors          25342
Name: 0, dtype: int64

In [7]:
# Percentage missing values per column

(train.isnull().sum() / train.shape[0]) * 100

name                  0.000000
item_condition_id     0.000000
category_name         0.426769
brand_name           42.675687
price                 0.000000
shipping              0.000000
item_description      0.000270
dtype: float64

In [8]:
train.item_condition_id = train.item_condition_id.astype('str')

item_condition_dummies = pd.get_dummies(train.item_condition_id, prefix='item_condition')

In [9]:
item_condition_dummies.head()

Unnamed: 0_level_0,item_condition_1,item_condition_2,item_condition_3,item_condition_4,item_condition_5
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,1,0,0
1,0,0,1,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0


In [10]:
#Source: https://www.kaggle.com/marknagelberg/rmsle-function
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5

In [11]:
#HANDLE MISSING VALUES
print("Handling missing values...")
def handle_missing(dataset):
    dataset.category_name.fillna(value="missing", inplace=True)
    dataset.brand_name.fillna(value="missing", inplace=True)
    dataset.item_description.fillna(value="missing", inplace=True)
    return (dataset)

train = handle_missing(train)
test = handle_missing(test)

print(train.shape)
print(test.shape)

Handling missing values...
(1482535, 7)
(693359, 7)


In [12]:
#PROCESS CATEGORICAL DATA
print("Handling categorical variables...")
le = LabelEncoder()

le.fit(np.hstack([train.category_name, test.category_name]))
train.category_name = le.transform(train.category_name)
test.category_name = le.transform(test.category_name)

le.fit(np.hstack([train.brand_name, test.brand_name]))
train.brand_name = le.transform(train.brand_name)
test.brand_name = le.transform(test.brand_name)
del le

train.head(3)

Handling categorical variables...


Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,829,5265,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,86,3889,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,1277,4588,10.0,1,Adorable top with a hint of lace and a key hol...


In [13]:
#PROCESS TEXT: RAW

print("Text to seq process...")
raw_text = np.hstack([train.item_description.str.lower(), train.name.str.lower()])

print("   Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)
print("   Transforming text to seq...")

# TODO: clean the data further, including stemming and lemmatization
train["seq_item_description"] = tok_raw.texts_to_sequences(train.item_description.str.lower())
test["seq_item_description"] = tok_raw.texts_to_sequences(test.item_description.str.lower())
train["seq_name"] = tok_raw.texts_to_sequences(train.name.str.lower())
test["seq_name"] = tok_raw.texts_to_sequences(test.name.str.lower())
train.head(3)

Text to seq process...
   Fitting tokenizer...
   Transforming text to seq...


Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,seq_item_description,seq_name
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,829,5265,10.0,1,No description yet,"[12, 68, 79]","[3852, 8823, 6896, 208, 84, 6, 155]"
1,Razer BlackWidow Chroma Keyboard,3,86,3889,52.0,0,This keyboard is in great condition and works ...,"[29, 2627, 10, 7, 39, 17, 1, 207, 51, 19, 1113...","[10760, 25565, 16369, 2627]"
2,AVA-VIV Blouse,1,1277,4588,10.0,1,Adorable top with a hint of lace and a key hol...,"[604, 60, 9, 4, 5347, 11, 192, 1, 4, 886, 1290...","[7634, 10563, 666]"


In [14]:
#SEQUENCES VARIABLES ANALYSIS
max_name_seq = np.max([np.max(train.seq_name.apply(lambda x: len(x))), np.max(test.seq_name.apply(lambda x: len(x)))])
max_seq_item_description = np.max([np.max(train.seq_item_description.apply(lambda x: len(x)))
                                   , np.max(test.seq_item_description.apply(lambda x: len(x)))])
print("max name seq "+str(max_name_seq))
print("max item desc seq "+str(max_seq_item_description))

max name seq 17
max item desc seq 269


In [15]:
train.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,seq_item_description,seq_name
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,829,5265,10.0,1,No description yet,"[12, 68, 79]","[3852, 8823, 6896, 208, 84, 6, 155]"
1,Razer BlackWidow Chroma Keyboard,3,86,3889,52.0,0,This keyboard is in great condition and works ...,"[29, 2627, 10, 7, 39, 17, 1, 207, 51, 19, 1113...","[10760, 25565, 16369, 2627]"
2,AVA-VIV Blouse,1,1277,4588,10.0,1,Adorable top with a hint of lace and a key hol...,"[604, 60, 9, 4, 5347, 11, 192, 1, 4, 886, 1290...","[7634, 10563, 666]"
3,Leather Horse Statues,1,503,5265,35.0,1,New with tags. Leather horses. Retail for [rm]...,"[5, 9, 61, 178, 6528, 230, 3, 21, 166, 1085, 2...","[178, 2610, 14248]"
4,24K GOLD plated rose,1,1204,5265,44.0,0,Complete with certificate of authenticity,"[807, 9, 6888, 11, 1997]","[4884, 104, 1032, 280]"


In [16]:
train["target"] = np.log(train.price+1)
target_scaler = MinMaxScaler(feature_range=(-1, 1))

In [17]:
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 75
MAX_TEXT = np.max([np.max(train.seq_name.max())
                   , np.max(test.seq_name.max())
                  , np.max(train.seq_item_description.max())
                  , np.max(test.seq_item_description.max())])+2
MAX_CATEGORY = np.max([train.category_name.max(), test.category_name.max()])+1
MAX_BRAND = np.max([train.brand_name.max(), test.brand_name.max()])+1
MAX_CONDITION = 5

#EXTRACT DEVELOPTMENT TEST
dtrain, dvalid = train_test_split(train, random_state=123, train_size=0.99)
print(dtrain.shape)
print(dvalid.shape)



(1467709, 10)
(14826, 10)


In [18]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

# model.fit(X_train, dtrain.target)

# train_score = model.score([X_train, dtrain.target])
# val_score = model.score([X_valid, dvalid.target])

# model.export('./_tpot/pipeline.export.py')

# print("Training score:", train_score)
# print("Validation score:", val_score)
# print("Testing score:", test_scote)

def create_dummies(inputs, col, dummy_na):
    """
    INPUT:
    inputs - the dataframe containing the data
    col - the column to transform
    dummy_na - a boolean to indicate weather or not to create a dummy column for na values
    OUTPUT:
    inputs - the dataframe along with the new dummy columns
    Creates dummy vars for a specified column in a dataframe.
    The original categorical column is dropped.
    """
    dummies_df = pd.get_dummies(inputs[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)
    
    return pd.concat([inputs.drop(col, axis=1), dummies_df], axis=1)


def pad_sequence_col(dataset, col, maxlen, prefix='', drop=True):
    # if prefix is not set, use column name
    if len(prefix) == 0:
        prefix = col
    
    padded = pd.DataFrame(pad_sequences(dataset[col], maxlen=maxlen), index=dataset.index).add_prefix(col)
    
    if drop == True:
        dataset = dataset.drop(labels=[col], axis=1)
    
    return pd.concat([dataset, padded], axis=1)


def create_other_category(inputs, col, n_top=5):
    """
    INPUT:
    inputs - the dataframe containing the data
    col - the column to transform
    n_top - the number of top categories to keep, below which all categories become 'other'
    OUTPUT:
    inputs - the transformed dataframe
    Given a categorial column, replace all non-common values with 'other'
    The threshold to determine which values are common is provided by n_top argument.
    """
    common_types = inputs[col].value_counts().head(n_top).index.values
    inputs.loc[~inputs[col].isin(common_types), col] = 'other'
    
    return inputs


dtrain = create_other_category(dtrain, 'brand_name', n_top=250)
dtrain = create_other_category(dtrain, 'category_name', n_top=50)

dtrain_binarized = pad_sequence_col(dtrain, 'seq_name', MAX_NAME_SEQ, prefix='name_seq_')
dtrain_binarized = pad_sequence_col(dtrain, 'seq_item_description', MAX_ITEM_DESC_SEQ, prefix='item_desc_seq_')

dtrain_binarized = dtrain_binarized.drop(labels=['name', 'price', 'target', 'item_description', 'seq_name'], axis=1)

dtrain_binarized = create_dummies(dtrain_binarized, 'item_condition_id', False)
dtrain_binarized = create_dummies(dtrain_binarized, 'brand_name', False)
dtrain_binarized = create_dummies(dtrain_binarized, 'category_name', False)

dtrain_binarized.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0_level_0,shipping,seq_item_description0,seq_item_description1,seq_item_description2,seq_item_description3,seq_item_description4,seq_item_description5,seq_item_description6,seq_item_description7,seq_item_description8,...,category_name_1283,category_name_1284,category_name_1285,category_name_1288,category_name_1291,category_name_1299,category_name_1305,category_name_1308,category_name_1309,category_name_other
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
407853,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
748699,1,0,0,0,1232,2593,1292,643,152,6,...,0,0,0,0,0,0,0,0,0,0
320295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1094144,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1310650,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
dtrain_binarized.to_csv('../data/X_train.csv')
dtrain.target.to_csv('../data/Y_train.csv')

In [None]:
X_train = pd.read_csv('../data/X_train.csv')
Y_train = pd.read_csv('../data/Y_train.csv')

model = tp.TPOTRegressor(scoring=rmsle, verbosity=2, memory='_tpot')

model.fit(X_train, Y_train)

# #EVLUEATE THE MODEL ON DEV TEST: What is it doing?
# val_preds = model.predict(X_valid)
# # val_preds = target_scaler.inverse_transform(val_preds)
# val_preds = np.exp(val_preds)+1

# #mean_absolute_error, mean_squared_log_error
# y_true = np.array(dvalid.price.values)
# y_pred = val_preds[:,0]
# v_rmsle = rmsle(y_true, y_pred)
# print(" RMSLE error on dev test: "+str(v_rmsle))