In [228]:
%matplotlib inline

from path import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt
from dmba import stepwise_selection
from dmba import classificationSummary, gainsChart, liftChart
from dmba import AIC_score

In [229]:
ereg = pd.read_csv('eBayAuctions.csv')

In [230]:
ereg2 = ereg.copy()

In [231]:
ereg.head()

Unnamed: 0,Category,currency,sellerRating,Duration,endDay,ClosePrice,OpenPrice,Competitive?
0,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
1,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
2,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
3,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
4,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0


In [232]:
ereg.shape

(1972, 8)

In [233]:
ereg.dtypes

Category         object
currency         object
sellerRating      int64
Duration          int64
endDay           object
ClosePrice      float64
OpenPrice       float64
Competitive?      int64
dtype: object

In [234]:
ereg.Category.unique()

array(['Music/Movie/Game', 'Automotive', 'SportingGoods', 'Home/Garden',
       'Books', 'Collectibles', 'Business/Industrial', 'Toys/Hobbies',
       'Electronics', 'Computer', 'EverythingElse',
       'Clothing/Accessories', 'Coins/Stamps', 'Antique/Art/Craft',
       'Jewelry', 'Pottery/Glass', 'Health/Beauty', 'Photography'],
      dtype=object)

In [235]:
ereg.currency.unique()

array(['US', 'GBP', 'EUR'], dtype=object)

In [236]:
ereg.endDay.unique()

array(['Mon', 'Tue', 'Fri', 'Thu', 'Sat', 'Sun', 'Wed'], dtype=object)

In [237]:
ereg['Duration'] = ereg['Duration'].astype('category')

In [238]:
ereg.dtypes

Category          object
currency          object
sellerRating       int64
Duration        category
endDay            object
ClosePrice       float64
OpenPrice        float64
Competitive?       int64
dtype: object

In [239]:
ereg2 = pd.get_dummies(ereg2)

In [240]:
ereg2.columns

Index(['sellerRating', 'Duration', 'ClosePrice', 'OpenPrice', 'Competitive?',
       'Category_Antique/Art/Craft', 'Category_Automotive', 'Category_Books',
       'Category_Business/Industrial', 'Category_Clothing/Accessories',
       'Category_Coins/Stamps', 'Category_Collectibles', 'Category_Computer',
       'Category_Electronics', 'Category_EverythingElse',
       'Category_Health/Beauty', 'Category_Home/Garden', 'Category_Jewelry',
       'Category_Music/Movie/Game', 'Category_Photography',
       'Category_Pottery/Glass', 'Category_SportingGoods',
       'Category_Toys/Hobbies', 'currency_EUR', 'currency_GBP', 'currency_US',
       'endDay_Fri', 'endDay_Mon', 'endDay_Sat', 'endDay_Sun', 'endDay_Thu',
       'endDay_Tue', 'endDay_Wed'],
      dtype='object')

In [241]:
table1 = pd.pivot_table(ereg, values = 'Competitive?', columns = ['Category'], aggfunc = np.mean)
table1


Category,Antique/Art/Craft,Automotive,Books,Business/Industrial,Clothing/Accessories,Coins/Stamps,Collectibles,Computer,Electronics,EverythingElse,Health/Beauty,Home/Garden,Jewelry,Music/Movie/Game,Photography,Pottery/Glass,SportingGoods,Toys/Hobbies
Competitive?,0.564972,0.353933,0.5,0.666667,0.504202,0.297297,0.577406,0.666667,0.8,0.235294,0.171875,0.656863,0.365854,0.602978,0.846154,0.35,0.725806,0.529915


In [242]:
table2 = pd.pivot_table(ereg, values = 'Competitive?', columns = ['currency'], aggfunc = np.mean)
table2

currency,EUR,GBP,US
Competitive?,0.551595,0.687075,0.51935


Use the information in the tables to reduce the number of dummies that will be used in the model. For example, categories that appear most similar with respect to the distribution of competitive auctions could be combined.

In [243]:
#FULL MODEL

predictors = ['sellerRating', 'Duration', 'ClosePrice', 'OpenPrice',
       'Category_Antique/Art/Craft', 'Category_Automotive', 'Category_Books',
       'Category_Business/Industrial', 'Category_Clothing/Accessories',
       'Category_Coins/Stamps', 'Category_Collectibles', 'Category_Computer',
       'Category_Electronics', 'Category_EverythingElse',
       'Category_Health/Beauty', 'Category_Home/Garden', 'Category_Jewelry',
       'Category_Music/Movie/Game', 'Category_Photography',
       'Category_Pottery/Glass', 'Category_SportingGoods',
       'Category_Toys/Hobbies', 'currency_EUR', 'currency_GBP', 'currency_US',
       'endDay_Fri', 'endDay_Mon', 'endDay_Sat', 'endDay_Sun', 'endDay_Thu',
       'endDay_Tue', 'endDay_Wed']

outcome = ['Competitive?']

X = ereg2[predictors]
y = ereg2[outcome]

#partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.4, random_state = 1)

#fit a logistic regression (set penalty=l2 and C=1e42 to avoid regularization)
logit_reg_a = LogisticRegression(penalty='l2', C=1e42, solver = 'liblinear')
logit_reg_a.fit(train_X, train_y)
print('intercept ', logit_reg_a.intercept_[0])
print(pd.DataFrame({'coefficient': logit_reg_a.coef_[0]}, index=X.columns).transpose())

intercept  -0.18697984601993767
             sellerRating  Duration  ClosePrice  OpenPrice  \
coefficient     -0.000045   0.01315    0.089081  -0.106752   

             Category_Antique/Art/Craft  Category_Automotive  Category_Books  \
coefficient                    0.228771            -0.223756        0.629146   

             Category_Business/Industrial  Category_Clothing/Accessories  \
coefficient                      1.154953                      -0.906596   

             Category_Coins/Stamps  ...  currency_EUR  currency_GBP  \
coefficient              -1.580442  ...     -0.774538      0.794849   

             currency_US  endDay_Fri  endDay_Mon  endDay_Sat  endDay_Sun  \
coefficient    -0.207291    0.233032    0.704958   -0.356875   -0.162761   

             endDay_Thu  endDay_Tue  endDay_Wed  
coefficient    -0.23391    0.048305    -0.41973  

[1 rows x 32 columns]


  y = column_or_1d(y, warn=True)


In [244]:
classificationSummary(train_y, logit_reg_a.predict(train_X))

Confusion Matrix (Accuracy 0.7853)

       Prediction
Actual   0   1
     0 445 108
     1 146 484


In [245]:
#STEPWISE SELECTION FOR TRAINING SET WITH HIGHEST ACCURACY
#BEST PREDICTIVE MODEL
def train_model(variables):
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(train_X[variables], train_y)
    return model

def score_model(model, variables):
    if len(variables) == 0:
        return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)
    return AIC_score(train_y, model.predict(train_X[variables]), model)

best_model, best_variables = stepwise_selection(train_X.columns, train_model, score_model, verbose=True)

print(best_variables)

Variables: sellerRating, Duration, ClosePrice, OpenPrice, Category_Antique/Art/Craft, Category_Automotive, Category_Books, Category_Business/Industrial, Category_Clothing/Accessories, Category_Coins/Stamps, Category_Collectibles, Category_Computer, Category_Electronics, Category_EverythingElse, Category_Health/Beauty, Category_Home/Garden, Category_Jewelry, Category_Music/Movie/Game, Category_Photography, Category_Pottery/Glass, Category_SportingGoods, Category_Toys/Hobbies, currency_EUR, currency_GBP, currency_US, endDay_Fri, endDay_Mon, endDay_Sat, endDay_Sun, endDay_Thu, endDay_Tue, endDay_Wed
Start: score=1716.20, constant
Step: score=1676.05, add endDay_Mon
Step: score=1643.10, add ClosePrice
Step: score=1595.18, add OpenPrice
Step: score=1565.92, add Category_Health/Beauty
Step: score=1543.14, add currency_GBP
Step: score=1526.20, add Category_Coins/Stamps
Step: score=1512.50, add Category_Automotive
Step: score=1506.68, add Category_EverythingElse
Step: score=1500.57, add Catego

In [246]:
classificationSummary(valid_y, logit_reg_a.predict(valid_X))

Confusion Matrix (Accuracy 0.7554)

       Prediction
Actual   0   1
     0 279  74
     1 119 317


In [247]:
#STEPWISE SELECTION FOR VALIDATION SET WITH HIGHEST ACCURACY
#BEST FITTING MODEL
def valid_model(variables):
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(valid_X[variables], valid_y)
    return model

def score_model(model, variables):
    if len(variables) == 0:
        return AIC_score(valid_y, [valid_y.mean()] * len(valid_y), model, df=1)
    return AIC_score(valid_y, model.predict(valid_X[variables]), model)

best_model, best_variables = stepwise_selection(valid_X.columns, valid_model, score_model, verbose=True)

print(best_variables)

Variables: sellerRating, Duration, ClosePrice, OpenPrice, Category_Antique/Art/Craft, Category_Automotive, Category_Books, Category_Business/Industrial, Category_Clothing/Accessories, Category_Coins/Stamps, Category_Collectibles, Category_Computer, Category_Electronics, Category_EverythingElse, Category_Health/Beauty, Category_Home/Garden, Category_Jewelry, Category_Music/Movie/Game, Category_Photography, Category_Pottery/Glass, Category_SportingGoods, Category_Toys/Hobbies, currency_EUR, currency_GBP, currency_US, endDay_Fri, endDay_Mon, endDay_Sat, endDay_Sun, endDay_Thu, endDay_Tue, endDay_Wed
Start: score=1140.52, constant
Step: score=1111.14, add ClosePrice
Step: score=1075.15, add OpenPrice
Step: score=1062.75, add endDay_Mon
Step: score=1048.69, add sellerRating
Step: score=1041.58, add Category_Health/Beauty
Step: score=1035.83, add Category_Jewelry
Step: score=1029.53, add Category_Automotive
Step: score=1024.28, add Category_EverythingElse
Step: score=1019.26, add endDay_Sat


In [248]:
#MODEL WITHOUT CLOSING PRICE

predictors = ['sellerRating', 'Duration', 'OpenPrice',
       'Category_Antique/Art/Craft', 'Category_Automotive', 'Category_Books',
       'Category_Business/Industrial', 'Category_Clothing/Accessories',
       'Category_Coins/Stamps', 'Category_Collectibles', 'Category_Computer',
       'Category_Electronics', 'Category_EverythingElse',
       'Category_Health/Beauty', 'Category_Home/Garden', 'Category_Jewelry',
       'Category_Music/Movie/Game', 'Category_Photography',
       'Category_Pottery/Glass', 'Category_SportingGoods',
       'Category_Toys/Hobbies', 'currency_EUR', 'currency_GBP', 'currency_US',
       'endDay_Fri', 'endDay_Mon', 'endDay_Sat', 'endDay_Sun', 'endDay_Thu',
       'endDay_Tue', 'endDay_Wed']

outcome = ['Competitive?']

X = ereg2[predictors]
y = ereg2[outcome]

#partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.4, random_state = 1)

#fit a logistic regression (set penalty=l2 and C=1e42 to avoid regularization)
logit_reg_b = LogisticRegression(penalty='l2', C=1e42, solver = 'liblinear')
logit_reg_b.fit(train_X, train_y)
print('intercept ', logit_reg_b.intercept_[0])
print(pd.DataFrame({'coefficient': logit_reg_b.coef_[0]}, index=X.columns).transpose())

intercept  0.528857876704452
             sellerRating  Duration  OpenPrice  Category_Antique/Art/Craft  \
coefficient     -0.000046 -0.041606  -0.004612                    0.182549   

             Category_Automotive  Category_Books  \
coefficient            -0.522079        0.057692   

             Category_Business/Industrial  Category_Clothing/Accessories  \
coefficient                      1.592844                      -0.197247   

             Category_Coins/Stamps  Category_Collectibles  ...  currency_EUR  \
coefficient               -1.96865               0.289225  ...         -0.33   

             currency_GBP  currency_US  endDay_Fri  endDay_Mon  endDay_Sat  \
coefficient      1.140313    -0.281455    0.147357    1.017021   -0.239297   

             endDay_Sun  endDay_Thu  endDay_Tue  endDay_Wed  
coefficient   -0.073222   -0.360292    0.134311    -0.09702  

[1 rows x 31 columns]


  y = column_or_1d(y, warn=True)


In [249]:
classificationSummary(train_y, logit_reg_b.predict(train_X))

Confusion Matrix (Accuracy 0.6796)

       Prediction
Actual   0   1
     0 343 210
     1 169 461


In [250]:
classificationSummary(valid_y, logit_reg_b.predict(valid_X))

Confusion Matrix (Accuracy 0.6375)

       Prediction
Actual   0   1
     0 206 147
     1 139 297


How does this model compare to the full model with respect to predictive accuracy?

The model without closing price has a lower accuracy compared with the full one with closing price.

The closing price has a coefficient of 0.089081. This means closing price has a slight linear relationship with competitiveness.Since 0.09 is less than 0.1,closing price is not statistically significant for predicting competitiveness of auctions.

Use stepwise regression as described in Section 6.4 to find the model with the best fit to the training data (highest accuracy). Which predictors are used:

endDay_Mon, ClosePrice, OpenPrice, Category_Health/Beauty, currency_GBP, Category_Coins/Stamps, Category_Automotive, Category_EverythingElse, Category_Jewelry, Category_Clothing/Accessories, sellerRating, Category_SportingGoods, Category_Business/Industrial, Category_Pottery/Glass, endDay_Sat, Category_Electronics, Duration, Category_Photography, endDay_Tue, endDay_Fri, Category_Music/Movie/Game, Category_Books, Category_Collectibles, endDay_Thu, Category_Computer, currency_EUR, Category_Antique/Art/Craft, Category_Home/Garden, endDay_Sun

29 variables were selected out of the initial 32

Use stepwise regression to find the model with the highest accuracy on the validation data. Which predictors are used? 

ClosePrice, OpenPrice, endDay_Mon, sellerRating, Category_Health/Beauty, Category_Jewelry, Category_Automotive, Category_EverythingElse, endDay_Sat, Category_Electronics, Category_Books, currency_GBP, endDay_Thu, Duration, Category_Home/Garden, Category_Collectibles, endDay_Tue, Category_Toys/Hobbies, currency_EUR, Category_Business/Industrial, endDay_Sun, Category_SportingGoods, Category_Clothing/Accessories, endDay_Fri, Category_Pottery/Glass, Category_Coins/Stamps, Category_Computer, Category_Photography, Category_Antique/Art/Craft, endDay_Wed

30 variables were selected out of the initial 32

What is the danger of using the best predictive model that you found?

The best predictive model might not have the best accuracy for a new test data that it has never seen before.

In [251]:
#Explain how and why the best-fitting model and the best predictive models are the same or different.

In [252]:
#FULL MODEL

predictors = ['sellerRating', 'Duration', 'ClosePrice', 'OpenPrice',
       'Category_Antique/Art/Craft', 'Category_Automotive', 'Category_Books',
       'Category_Business/Industrial', 'Category_Clothing/Accessories',
       'Category_Coins/Stamps', 'Category_Collectibles', 'Category_Computer',
       'Category_Electronics', 'Category_EverythingElse',
       'Category_Health/Beauty', 'Category_Home/Garden', 'Category_Jewelry',
       'Category_Music/Movie/Game', 'Category_Photography',
       'Category_Pottery/Glass', 'Category_SportingGoods',
       'Category_Toys/Hobbies', 'currency_EUR', 'currency_GBP', 'currency_US',
       'endDay_Fri', 'endDay_Mon', 'endDay_Sat', 'endDay_Sun', 'endDay_Thu',
       'endDay_Tue', 'endDay_Wed']

outcome = ['Competitive?']

X = ereg2[predictors]
y = ereg2[outcome]

#partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.4, random_state = 1)

#fit a logistic regression (set penalty=l1 and C=1e42 to avoid regularization)
logit_reg_c = LogisticRegression(penalty='l1', C=1e42, solver = 'liblinear')
logit_reg_c.fit(train_X, train_y)
print('intercept ', logit_reg_c.intercept_[0])
print(pd.DataFrame({'coefficient': logit_reg_c.coef_[0]}, index=X.columns).transpose())

intercept  0.055748179413974246
             sellerRating  Duration  ClosePrice  OpenPrice  \
coefficient     -0.000045  0.013246    0.089937  -0.106971   

             Category_Antique/Art/Craft  Category_Automotive  Category_Books  \
coefficient                   -0.046153             -0.58478        0.231917   

             Category_Business/Industrial  Category_Clothing/Accessories  \
coefficient                      1.262923                      -1.299447   

             Category_Coins/Stamps  ...  currency_EUR  currency_GBP  \
coefficient              -2.044963  ...     -0.693713      1.019494   

             currency_US  endDay_Fri  endDay_Mon  endDay_Sat  endDay_Sun  \
coefficient    -0.128058    0.209239    0.633636   -0.433081   -0.241845   

             endDay_Thu  endDay_Tue  endDay_Wed  
coefficient   -0.375965    0.019495   -0.446275  

[1 rows x 32 columns]


  y = column_or_1d(y, warn=True)


In [253]:
#STEPWISE SELECTION FOR TRAINING SET WITH HIGHEST ACCURACY using l1 PENALTY
def train_model(variables):
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(train_X[variables], train_y)
    return model

def score_model(model, variables):
    if len(variables) == 0:
        return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)
    return AIC_score(train_y, model.predict(train_X[variables]), model)

best_model, best_variables = stepwise_selection(train_X.columns, train_model, score_model, verbose=True)

print(best_variables)

Variables: sellerRating, Duration, ClosePrice, OpenPrice, Category_Antique/Art/Craft, Category_Automotive, Category_Books, Category_Business/Industrial, Category_Clothing/Accessories, Category_Coins/Stamps, Category_Collectibles, Category_Computer, Category_Electronics, Category_EverythingElse, Category_Health/Beauty, Category_Home/Garden, Category_Jewelry, Category_Music/Movie/Game, Category_Photography, Category_Pottery/Glass, Category_SportingGoods, Category_Toys/Hobbies, currency_EUR, currency_GBP, currency_US, endDay_Fri, endDay_Mon, endDay_Sat, endDay_Sun, endDay_Thu, endDay_Tue, endDay_Wed
Start: score=1716.20, constant
Step: score=1676.05, add endDay_Mon
Step: score=1643.10, add ClosePrice
Step: score=1595.18, add OpenPrice
Step: score=1565.92, add Category_Health/Beauty
Step: score=1543.14, add currency_GBP
Step: score=1526.20, add Category_Coins/Stamps
Step: score=1512.50, add Category_Automotive
Step: score=1506.68, add Category_EverythingElse
Step: score=1500.57, add Catego

In [254]:
classificationSummary(valid_y, logit_reg_c.predict(valid_X))

Confusion Matrix (Accuracy 0.7579)

       Prediction
Actual   0   1
     0 276  77
     1 114 322


The training data with L1 penalty has 29 variables with 75.8% accuracy. This accuracy is higher than that of the training data with L2 penalty with 75.5% accuracy.

TRAINING SET WITH L1 PENALTY: endDay_Mon, ClosePrice, OpenPrice, Category_Health/Beauty, currency_GBP, Category_Coins/Stamps, Category_Automotive, Category_EverythingElse, Category_Jewelry, Category_Clothing/Accessories, sellerRating, Category_SportingGoods, Category_Business/Industrial, Category_Pottery/Glass, endDay_Sat, Category_Electronics, Duration, Category_Photography, endDay_Tue, endDay_Fri, Category_Music/Movie/Game, Category_Books, Category_Collectibles, endDay_Thu, Category_Computer, currency_EUR, Category_Antique/Art/Craft, Category_Home/Garden, endDay_Sun


BEST PREDICTIVE(TRAINING SET WITH L2 PENALTY): endDay_Mon, ClosePrice, OpenPrice, Category_Health/Beauty, currency_GBP, Category_Coins/Stamps, Category_Automotive, Category_EverythingElse, Category_Jewelry, Category_Clothing/Accessories, sellerRating, Category_SportingGoods, Category_Business/Industrial, Category_Pottery/Glass, endDay_Sat, Category_Electronics, Duration, Category_Photography, endDay_Tue, endDay_Fri, Category_Music/Movie/Game, Category_Books, Category_Collectibles, endDay_Thu, Category_Computer, currency_EUR, Category_Antique/Art/Craft, Category_Home/Garden, endDay_Sun


BEST FITTING(VALIDATION SET WITH L2 PENALTY): ClosePrice, OpenPrice, endDay_Mon, sellerRating, Category_Health/Beauty, Category_Jewelry, Category_Automotive, Category_EverythingElse, endDay_Sat, Category_Electronics, Category_Books, currency_GBP, endDay_Thu, Duration, Category_Home/Garden, Category_Collectibles, endDay_Tue, Category_Toys/Hobbies, currency_EUR, Category_Business/Industrial, endDay_Sun, Category_SportingGoods, Category_Clothing/Accessories, endDay_Fri, Category_Pottery/Glass, Category_Coins/Stamps, Category_Computer, Category_Photography, Category_Antique/Art/Craft, endDay_Wed

If the major objective is accurate classification, what cutoff value should be used? 

0.75 cutoff

Based on these data, what auction settings set by the seller (duration, opening price, ending day, currency) would you recommend as being most likely to lead to a competitive auction

Answer: ending day