All the necessary imports:

### Ridge Implementation

In [1]:
from datetime import datetime 
start_real = datetime.now()
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [2]:
import math
# set seed
np.random.seed(123)

# Define RMSL Error Function

This is for checking the predictions at the end. 

In [3]:
def rmsle(Y, Y_pred):
    assert Y.shape == Y_pred.shape
    return np.sqrt(np.mean(np.square(Y_pred - Y )))

# Load train and test data¶

In [17]:
train_df = pd.read_csv('D:\\Classes\\train.tsv', sep='\t')
test_df = pd.read_csv('D:\\Classes\\test.tsv', sep='\t')

Preprossing the data for Ridge model.
Remove low prices, anything below 3. Mercari does not allow postings below 3 so below that is an error. Removing them helps the models.

In [18]:
# remove low prices
train_df = train_df.drop(train_df[(train_df.price < 3.0)].index)
train_df.shape

(1481661, 8)

Mercari also does not allow postings over 2000.So removing those values as well.

In [19]:
train_df = train_df.drop(train_df[(train_df.price > 2000)].index)
train_df.shape


(1481658, 8)

# attempt to find missing brand names

In [20]:
data_set = pd.concat([train_df,test_df])
all_brands = set(data_set['brand_name'].values)
train_df.brand_name.fillna(value="missing", inplace=True)
test_df.brand_name.fillna(value="missing", inplace=True)

# get to finding!
premissing = len(train_df.loc[train_df['brand_name'] == 'missing'])
def brandfinder(line):
    brand = line[0]
    name = line[1]
    namesplit = name.split(' ')
    if brand == 'missing':
        for x in namesplit:
            if x in all_brands:
                return name
    if name in all_brands:
        return name
    return brand

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [21]:
train_df['brand_name'] = train_df[['brand_name','name']].apply(brandfinder, axis = 1)
test_df['brand_name'] = test_df[['brand_name','name']].apply(brandfinder, axis = 1)
found = premissing-len(train_df.loc[train_df['brand_name'] == 'missing'])
print(found)

137342


In [22]:
def split_cat(text):
   try: return text.split("/")
   except: return ("No Label", "No Label", "No Label")
train_df['subcat_0'], train_df['subcat_1'], train_df['subcat_2'] = \
zip(*train_df['category_name'].apply(lambda x: split_cat(x)))
test_df['subcat_0'], test_df['subcat_1'], test_df['subcat_2'] = \
zip(*test_df['category_name'].apply(lambda x: split_cat(x)))

Standard split the train test for validation and log the price

In [27]:
# get name and description lengths
def wordCount(text):
    try:
        if text == 'No description yet':
            return 0
        else:
            text = text.lower()
            words = [w for w in text.split(" ")]
            return len(words)
    except: 
        return 0
train_df['desc_len'] = train_df['item_description'].apply(lambda x: wordCount(x))
test_df['desc_len'] = test_df['item_description'].apply(lambda x: wordCount(x))
train_df['name_len'] = train_df['name'].apply(lambda x: wordCount(x))
test_df['name_len'] = test_df['name'].apply(lambda x: wordCount(x))
train_df.head()

Wall time: 22 s


In [23]:
# Scale target variable to log.
train_df["target"] = np.log1p(train_df.price)

# Split training examples into train/dev examples.
train_df, dev_df = train_test_split(train_df, random_state=123, train_size=0.99)

# Calculate number of train/dev/test examples.
n_trains = train_df.shape[0]
n_devs = dev_df.shape[0]
n_tests = test_df.shape[0]
print("Training on", n_trains, "examples")
print("Validating on", n_devs, "examples")
print("Testing on", n_tests, "examples")

Training on 1466841 examples
Validating on 14817 examples
Testing on 693359 examples


# Ridge model
faster than the RNN.

In [29]:
 ## Concatenate train - dev - test data .
complete_df = pd.concat([train_df, dev_df, test_df])


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [25]:
train_df.columns

Index(['train_id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description', 'subcat_0', 'subcat_1',
       'subcat_2', 'target'],
      dtype='object')

Handle missing data and convert data type to string
All inputs must be strings in a ridge model.

In [30]:
%%time

print("Handling missing values...")
complete_df['category_name'] = complete_df['category_name'].fillna('missing').astype(str)
complete_df['subcat_0'] = complete_df['subcat_0'].astype(str)
complete_df['subcat_1'] = complete_df['subcat_1'].astype(str)
complete_df['subcat_2'] = complete_df['subcat_2'].astype(str)
complete_df['brand_name'] = complete_df['brand_name'].fillna('missing').astype(str)
complete_df['shipping'] = complete_df['shipping'].astype(str)
complete_df['item_condition_id'] = complete_df['item_condition_id'].astype(str)
complete_df['desc_len'] = complete_df['desc_len'].astype(str)
complete_df['name_len'] = complete_df['name_len'].astype(str)
complete_df['item_description'] = complete_df['item_description'].fillna('No description yet').astype(str)

Handling missing values...
Wall time: 12.2 s


Vectorizing all the data

In [31]:
%%time

print("Vectorizing data...")
default_preprocessor = CountVectorizer().build_preprocessor()
def build_preprocessor(field):
    field_idx = list(complete_df.columns).index(field)
    return lambda x: default_preprocessor(x[field_idx])

vectorizer = FeatureUnion([
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        max_features=50000,
        preprocessor=build_preprocessor('name'))),
#     ('category_name', CountVectorizer(
#         token_pattern='.+',
#         preprocessor=build_preprocessor('category_name'))),
    ('subcat_0', CountVectorizer(token_pattern='.+',
        preprocessor=build_preprocessor('subcat_0'))),
    ('subcat_1', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('subcat_1'))),
    ('subcat_2', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('subcat_2'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('brand_name'))),
    ('shipping', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('shipping'))),
    ('item_condition_id', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    ('desc_len', CountVectorizer(token_pattern='\d+',
        preprocessor=build_preprocessor('desc_len'))),
    ('name_len', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('name_len'))),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=100000,
        preprocessor=build_preprocessor('item_description'))),
])

X = vectorizer.fit_transform(complete_df.values)

X_train = X[:n_trains]
Y_train = train_df.target.values.reshape(-1, 1)

X_dev = X[n_trains:n_trains+n_devs]
Y_dev = dev_df.target.values.reshape(-1, 1)

X_test = X[n_trains+n_devs:]
print(X.shape, X_train.shape, X_dev.shape, X_test.shape)

Vectorizing data...
(2175017, 323858) (1466841, 323858) (14817, 323858) (693359, 323858)
Wall time: 13min 31s


Fitting Ridge model on training data

In [32]:
print("Fitting Ridge model on training examples...")
ridge_model = Ridge(
    solver='auto', fit_intercept=True, alpha=1.0,
    max_iter=100, normalize=False, tol=0.05, random_state = 1,
)
ridge_modelCV = RidgeCV(
    fit_intercept=True, alphas=[5.0],
    normalize=False, cv = 2, scoring='neg_mean_squared_error',
)
ridge_model.fit(X_train, Y_train)
ridge_modelCV.fit(X_train, Y_train)

Fitting Ridge model on training examples...


RidgeCV(alphas=[5.0], cv=2, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring='neg_mean_squared_error',
    store_cv_values=False)

Evaluating Ridge model on dev data¶

In [33]:
Y_dev_preds_ridge = ridge_model.predict(X_dev)
Y_dev_preds_ridge = Y_dev_preds_ridge.reshape(-1, 1)
print("RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridge))

RMSL error on dev set: 0.6359953770508623


In [37]:
from sklearn.metrics import mean_squared_error

y_pred_train=ridge_model.predict(X_train)
rmse = np.sqrt(mean_squared_error(Y_train, y_pred_train))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 0.4131545453525329


### Random Forest model implementation

In [None]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=3)
regressor.fit(X_train, Y_train)
y_test_pred = regressor.predict(X_test)
#print(predictions)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_test_pred)