In [1]:
#Loading required Libraries
import nltk
import string
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Loading the data

In [3]:
# Both inputs are tab-separated files; read them with an explicit tab delimiter.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test_stg2.tsv', sep='\t')

In [4]:
# Report the size of both frames, then preview the first training rows.
n_train_rows, n_train_cols = train.shape
print ('Number of data points in train: ', n_train_rows)
print ('Number of features/variables:', n_train_cols)

print ('Number of data points in test: ', len(test))

train.head()

Number of data points in train:  1482535
Number of features/variables: 8
Number of data points in test:  3460725


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
# Keep only listings priced strictly above 3; rows with price <= 3 are dropped.
# NOTE(review): the message printed below says "price<3.00", but the strict
# comparison also removes rows priced exactly 3 -- confirm which is intended.
above_threshold = train['price'] > 3
train = train[above_threshold]

print('Number of data points in train data after eliminating price<3.00 :', train.shape[0])

Number of data points in train data after eliminating price<3.00 : 1462958


### Handling the missing values

In [6]:
def handle_missing_inplace(dataset):
    """Fill missing text fields of `dataset` with the placeholder 'missing'.

    The columns 'category_name', 'brand_name' and 'item_description' are
    updated on the passed DataFrame itself; nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the three text columns listed above.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        # Assign the filled column back to the frame instead of calling
        # fillna(inplace=True) on the selected column: the latter is a chained
        # in-place update, which leaves the frame untouched when pandas
        # Copy-on-Write is active.
        dataset[column] = dataset[column].fillna('missing')

In [7]:
handle_missing_inplace(train)

In [8]:
handle_missing_inplace(test)

In [9]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,44.0,0,Complete with certificate of authenticity


### Feature Engineering

In [10]:
#Function to get name and description lengths
def wordCount(text):
    """Return the number of whitespace-separated words in `text`.

    Returns 0 when `text` is not a string (for example a NaN from a missing
    value) or when it equals the placeholder 'No description yet'.

    Words are split on any run of whitespace, so repeated spaces, tabs or
    newlines do not produce empty "words", and an empty string counts as 0.
    """
    # Explicit type check instead of a bare `except:`, which would also
    # swallow KeyboardInterrupt and hide unrelated errors.
    if not isinstance(text, str):
        return 0
    if text == 'No description yet':
        return 0
    return len(text.split())

# Add word-count columns for the description and the name to both frames.
for frame in (train, test):
    frame['desc_len'] = frame['item_description'].apply(wordCount)
    frame['name_len'] = frame['name'].apply(wordCount)
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,desc_len,name_len
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,10.0,1,No description yet,0,7
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,36,4
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,29,2
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,35.0,1,New with tags. Leather horses. Retail for [rm]...,32,3
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,44.0,0,Complete with certificate of authenticity,5,4


### Featurizing the text features

In [11]:
y_true = train['price'].values

In [12]:
import warnings

# Suppress all warnings from here on.
warnings.filterwarnings("ignore")

# Stack the training rows on top of the test rows so that both are
# vectorized together; the training rows come first.
frames_to_stack = [train, test]
data = pd.concat(frames_to_stack)

In [15]:
data.shape

(4923683, 11)

In [16]:
data[4923678:]

Unnamed: 0,brand_name,category_name,desc_len,item_condition_id,item_description,name,name_len,price,shipping,test_id,train_id
3460720,missing,Beauty/Fragrance/Women,118,1,It cosmetics Bye Bye Foundation Full Coverage ...,DARK SAMPLE BYE FOUNDATION MOISTURIZER,5,,1,3460720.0,
3460721,missing,Women/Pants/Other,14,2,♡2 pairs of omighty trackiez. one m and one s....,bundle for @brandystash,3,,1,3460721.0,
3460722,Nike,Kids/Girls 0-24 Mos/Shoes,29,3,Size 5 (toddler). Pink high top converse shoes...,Toddler High-top Converse,3,,0,3460722.0,
3460723,Sanuk,Women/Shoes/Sandals,14,1,super super comfy. i have a pair but ordered t...,Yoga Sling Sandals,3,,1,3460723.0,
3460724,missing,Women/Coats & Jackets/Other,51,3,100% Genuine leather and faux fur insulated co...,Stylish ladies M/M insulated brown coat,6,,1,3460724.0,


In [17]:
print("Vectorizing data...")
# Preprocessor produced by a default-configured CountVectorizer; every
# per-column preprocessor below delegates to it.
default_preprocessor = CountVectorizer().build_preprocessor()


def build_preprocessor(field):
    """Return a callable that preprocesses one column of a raw data row.

    The position of `field` is looked up in the columns of the global `data`
    frame when this function is called. The returned callable takes a row
    (indexable by position), picks the value at that position and passes it
    through `default_preprocessor`.
    """
    column_position = list(data.columns).index(field)

    def preprocess_row(row):
        return default_preprocessor(row[column_position])

    return preprocess_row

# One text vectorizer per column. Each receives whole rows and uses its own
# preprocessor to pick out the column it is responsible for.
# token_pattern='.+' makes the tokenizer match the field value as a whole
# rather than splitting it into words.
text_transformers = [
    ('name', CountVectorizer(
        max_features=15000,
        preprocessor=build_preprocessor('name'))),
    ('category_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('category_name'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('brand_name'))),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=30000,
        preprocessor=build_preprocessor('item_description'))),
]
vectorizer = FeatureUnion(text_transformers)

# NOTE: `data` is rebound here from the stacked DataFrame to the transformed
# feature matrix, so this cell cannot be re-run without first re-running the
# cell that builds `data`.
data = vectorizer.fit_transform(data.values)

Vectorizing data...


In [19]:
# The training rows were stacked first, and y_true holds one price per
# training row, so its length marks the train/test boundary. Deriving the
# boundary avoids a hard-coded row count that goes stale when filtering changes.
train = data[:y_true.shape[0]]

In [20]:
train.shape

(1462958, 52656)

In [24]:
# Everything after the training rows belongs to the test set. y_true holds
# one price per training row, so its length is the train/test boundary
# (instead of a hard-coded row count).
test = data[y_true.shape[0]:]

In [25]:
test.shape

(3460725, 52656)

### Splitting the data

In [28]:
x_train, x_cv, y_train, y_cv = train_test_split(train, y_true, test_size=0.2,  random_state=0)

In [31]:
x_train.shape, x_cv.shape

((1170366, 52656), (292592, 52656))

In [32]:
x_test = test
x_test.shape

(3460725, 52656)

In [33]:
# Move the training target to log scale, log(1 + price), as a column vector.
# This cell rebinds y_train, so running it twice applies the log twice.
y_train = np.log1p(y_train).reshape(-1, 1)

In [34]:
# Same log(1 + price) transform for the validation target, as a column vector.
y_cv = np.log1p(y_cv).reshape(-1, 1)

In [35]:
#Defining function for calculating RMSLE
#Since, Y_true and Y_pred are already in log scale, there is no need to log them in the function.

def rmsle(Y_true, Y_pred):
    """Root of the mean squared difference between two same-shaped arrays.

    No logarithm is taken here; callers pass values that are already on the
    log scale. Raises AssertionError when the two shapes differ.
    """
    assert Y_true.shape == Y_pred.shape
    residuals = Y_pred - Y_true
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

## Machine Learning Models

### Ridge Regression

In [40]:
#References:
#1. https://towardsdatascience.com/ridge-regression-for-better-usage-2f19b3a202db
#2. https://stats.stackexchange.com/questions/52653/what-is-ridge-regression

print("Fitting Ridge model...")
# `normalize` is intentionally not passed: False was its default, and the
# parameter no longer exists in scikit-learn >= 1.2 (passing it raises TypeError).
ridge_model = Ridge(solver='auto', fit_intercept=True, alpha=1.0, max_iter=100, tol=0.05, random_state=1)
ridge_model.fit(x_train, y_train)
# Predictions and y_cv are both on the log scale, so rmsle needs no further log.
y_preds_ridge = ridge_model.predict(x_cv)
print("Ridge RMSL error on cv set:", rmsle(y_cv, y_preds_ridge))

Fitting Ridge model...
Ridge RMSL error on cv set: 0.46463591056241244


### RidgeCV Regression

In [41]:
print("Fitting RidgeCV model..")
# `normalize` is intentionally not passed: False was its default, and the
# parameter no longer exists in scikit-learn >= 1.2 (passing it raises TypeError).
ridgeCV_model = RidgeCV(fit_intercept=True, alphas=[5.0], cv=2, scoring='neg_mean_squared_error')
ridgeCV_model.fit(x_train, y_train)
# Predictions and y_cv are both on the log scale, so rmsle needs no further log.
y_preds_ridgeCV = ridgeCV_model.predict(x_cv)
print("RidgeCV RMSL error on CV data:", rmsle(y_cv, y_preds_ridgeCV))

Fitting RidgeCV model..
RidgeCV RMSL error on CV data: 0.46383529845505655


In [43]:
# The model was trained on log1p(price); expm1 is its exact inverse and is
# more accurate than exp(x) - 1 for values close to zero.
ridge_preds = np.expm1(ridge_model.predict(x_test))

In [44]:
# The model was trained on log1p(price); expm1 is its exact inverse and is
# more accurate than exp(x) - 1 for values close to zero.
ridgeCV_preds = np.expm1(ridgeCV_model.predict(x_test))

In [45]:
print(ridge_preds)

[[10.4648652 ]
 [11.42134661]
 [52.07151396]
 ...
 [15.94261092]
 [18.55083114]
 [24.8209926 ]]


In [46]:
print(ridgeCV_preds)

[[10.43858862]
 [11.63016207]
 [50.39251729]
 ...
 [15.98535331]
 [18.21889264]
 [24.67385955]]


In [48]:
test = pd.read_table('test_stg2.tsv')

In [49]:
# Write the Ridge predictions next to their test ids as a CSV file.
submission_ridge = test[["test_id"]].assign(price=ridge_preds)
submission_ridge.to_csv("ridgesubmission.csv", index=False)

In [50]:
# Write the RidgeCV predictions next to their test ids as a CSV file.
submission_ridgeCV = test[["test_id"]].assign(price=ridgeCV_preds)
submission_ridgeCV.to_csv("ridgeCVsubmission.csv", index=False)

In [54]:
#Loading the LSTM predictions
preds = pd.read_csv('rnnsubmission.csv')
rnn_preds = preds[['price']]

## LSTM + Ridge + RidgeCV

In [56]:
def aggregate_predicts3(P1, P2, P3, ratio1, ratio2):
    """Blend three sets of predictions as a weighted sum.

    Parameters
    ----------
    P1, P2, P3 : array-like objects exposing `.shape`
        Predictions to combine; all three must have the same shape.
    ratio1, ratio2 : float
        Weights applied to P1 and P2. P3 receives the remaining weight,
        1.0 - ratio1 - ratio2.

    Returns
    -------
    P1 * ratio1 + P2 * ratio2 + P3 * (1.0 - ratio1 - ratio2)

    Raises
    ------
    AssertionError
        If the three inputs do not all share the same shape.
    """
    # Check P3 as well: a mismatched P3 could otherwise broadcast silently
    # and produce a result of the wrong shape.
    assert P1.shape == P2.shape == P3.shape
    ratio3 = 1.0 - ratio1 - ratio2
    return P1 * ratio1 + P2 * ratio2 + P3 * ratio3

In [57]:
preds = aggregate_predicts3(rnn_preds, ridgeCV_preds, ridge_preds, 0.6, 0.2)

In [60]:
# Write the blended predictions next to their test ids as the final submission.
final_submission = test[["test_id"]].assign(price=preds)
final_submission.to_csv("./rnn_ridge_submission.csv", index=False)

## Conclusion

1. Initially, we had the data in the tsv format with 7 features and 1.4M data points in train data and 3.4M in test data.
2. I've performed basic exploratory data analysis to understand the top categories in each feature.
3. After EDA, for applying the DL models, I've initially transformed the text data to sequences.
4. Later, I've processed the categorical feature using the label encoder.
5. I then split the whole data into train and CV sets and applied padding to the X_train, X_CV and X_test data for all the required features.
6. Now, I've applied scaling on target variable to log values since we need log values to calculate the RMSLE value, which is our metric.
7. Then I've applied LSTM with 3 layers which gave me a CV RMSLE value of 0.46 and applied the LSTM on test data and predicted the prices and stored the predictions in a CSV file.
8. Now, I've reloaded the train and test data again and this time, I've featurized the text features using BOW.
9. After vectorizing the text features, I've applied Ridge Regression and RidgeCV Regression on top of it and calculated the RMSLE on the CV data, which was 0.464 and 0.463 respectively, and applied both regression models to predict on the test data.
10. I've reloaded the LSTM predictions and aggregated the LSTM + Ridge + RidgeCV predictions by giving a weight to each model to get the final predictions, which gave me an RMSLE of 0.47 on the test data.

In [1]:
from prettytable import PrettyTable

# Summary table of the cross-validation RMSLE obtained by each model.
x = PrettyTable()
x.field_names = ["Model", "CV RMSLE"]

for model_name, cv_rmsle in (
    ("LSTM - 3 Layers", 0.460),
    ("Ridge Regression", 0.464),
    ("RidgeCV Regression", 0.463),
):
    x.add_row([model_name, cv_rmsle])

print(x)

+--------------------+----------+
|       Model        | CV RMSLE |
+--------------------+----------+
|  LSTM - 3 Layers   |   0.46   |
|  Ridge Regression  |  0.464   |
| RidgeCV Regression |  0.463   |
+--------------------+----------+


<img src='merc-final.jpg'>