In [1]:
import numpy
import scipy.optimize
from sklearn import datasets, linear_model

import parseData
import analysis

pathProducts = r"C:\Users\Yijun\Desktop\Amazon\meta_shoes_unique+fifteen_review+price.txt"
pathReviews = r"C:\Users\Yijun\Desktop\Amazon\reviews_shoes_unique+fifteen_review+price.txt"
pathLabels = r"C:\Users\Yijun\Desktop\Amazon\labels.txt"

shoes = parseData.parse(pathProducts)
reviews = parseData.parse(pathReviews)
labels = parseData.loadLabels(pathLabels)

for key, value in labels.items():
  if value == 0:
    del labels[key]

print len(labels)

sample_shoes = dict()
for i in shoes:
  if i['asin'] in labels.keys():
    sample_shoes[i['asin']] = i

sample_reviews = list()
for i in reviews:
  if i['asin'] in labels.keys():
    sample_reviews.append(i)

1579


In [2]:
# extract sub-categories

subcategories = set()
for key,value in sample_shoes.iteritems():
    for i in value['categories']:
      for j in range(0, len(i)):
        if i[j]== 'Shoes':
          if j+1 < len(i):
            subcategories.add(i[j+1])
            break

print subcategories

set(['Athletic & Outdoor', 'Outdoor', 'Loafers', 'Athletic', 'Men', 'Boots', 'Clogs & Mules', 'Fashion Sneakers', 'Women', 'Pumps', 'Oxfords', 'Flats', 'Sneakers', 'Sandals', 'Kids & Baby', 'Loafers & Slip-Ons', 'Slippers', 'Mules & Clogs'])


In [3]:
# Split data into training set, validation set, and test set

import random

training_set = random.sample(sample_shoes.items(), 1179)

for i in training_set:
  del sample_shoes[i[0]]

validation_set = random.sample(sample_shoes.items(), 200)

for i in validation_set:
  del sample_shoes[i[0]]

test_set = random.sample(sample_shoes.items(), 200)

# print training_set[0][1]

In [32]:
# Regularize price feature

min_price = 999999
max_price = 0

for i,j in sample_shoes.items():
  if j['price'] > max_price:
    max_price = j['price']
  elif j['price'] < min_price:
    min_price = j['price']
    
print min_price
print max_price

8.75
324.95


In [33]:
# Regularize popularity feature

min_reviews = 999999
max_reviews = 0

for i in sample_shoes:
  count = 0
  for j in sample_reviews:
    if i == j['asin']:
      count += 1
  if count > max_reviews:
    max_reviews = count
  elif count < min_reviews:
    min_reviews = count

print min_reviews
print max_reviews

16
1513


In [34]:
# Common words dictionary

reviewList = parseData.tokenize(sample_reviews)
words = analysis.commonWords(reviewList, 800)

wordID = dict(zip(words, range(len(words))))
wordSet = set(words)

In [35]:
# Regularize word counts

word_count = [0] * len(wordID)
for r in reviewList:
  for w in r:
    if w in words:
      word_count[wordID[w]] += 1

min_occurrence = min(word_count)
max_occurrence = max(word_count)

print min_occurrence
print max_occurrence

371
169991


In [36]:
# feature: price, popularity, keywords and subcategory

def feature(key):
  feat = [0] * len(words)
  for i in range(0, len(sample_reviews)):
    if sample_reviews[i]['asin'] == key[0]:
      review = reviewList[i]
      for w in review:
        if w in words:
          feat[wordID[w]] += 1    # keyword feature
  for i in range(0, len(wordID)):
    feat[i] = 1.0*feat[i]/word_count[i]
  feat.append(key[1]['price']/max_price)  # price feature
  count = 0
  for i in sample_reviews:
    if i['asin'] == key[0]:
      count += 1
  feat.append(count/max_reviews)   # popularity feature
  cat = 0
  for i in key[1]['categories']:
    for j in range(0, len(i)):
      if i[j] == 'Shoes' and j+1 < len(i):
        cat = i[j+1]
        break
  for i in subcategories:
    if i != cat:
      feat.append(0)
    else:
      feat.append(1)
  feat.append(1)  #offset
  return feat

In [37]:
# Look at theta and residuals on training set

#wordID = dict((y,x) for x,y in wordID.iteritems())

y = [labels[i[0]] for i in training_set]
X = [feature(i) for i in training_set]

# theta,residuals,rank,s = numpy.linalg.lstsq(X, y)
# print theta
# print residuals

In [38]:
# perform ridge regression


from sklearn.linear_model import Ridge

max_score = 0
lbda = 0.0001

for a in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]:
  clf = Ridge(alpha=a)
  clf.fit(X, y) 

  y_validation = [labels[i[0]] for i in validation_set]
  X_validation = [feature(i) for i in validation_set]

  p = clf.predict(X_validation)
  print "Sum of suquared error: ", sum((b-c)**2 for (b,c) in zip(p, y_validation))
  print "Coefficient of determination R^2: ", clf.score(X, y)
  score = clf.score(X_validation, y_validation)
  print "Coefficient of determination R^2: ", score

  if score > max_score:
    max_score = score
    lbda = a

Sum of suquared error:  470.227183752
Coefficient of determination R^2:  0.792618185898
Coefficient of determination R^2:  -0.202780876714
Sum of suquared error:  367.893320403
Coefficient of determination R^2:  0.684910866792
Coefficient of determination R^2:  0.0589760317112
Sum of suquared error:  236.887221243
Coefficient of determination R^2:  0.548038085199
Coefficient of determination R^2:  0.394072845012
Sum of suquared error:  232.445742944
Coefficient of determination R^2:  0.447578849979
Coefficient of determination R^2:  0.405433577328
Sum of suquared error:  246.723926296
Coefficient of determination R^2:  0.413854185482
Coefficient of determination R^2:  0.368911814054
Sum of suquared error:  250.408922085
Coefficient of determination R^2:  0.401789327708
Coefficient of determination R^2:  0.359486067054
Sum of suquared error:  270.122329987
Coefficient of determination R^2:  0.32513489967
Coefficient of determination R^2:  0.309061695903
Sum of suquared error:  348.62235

In [39]:
# Find out the most status/utility words

clf = Ridge(alpha=lbda)
clf.fit(X, y)

utility = []
index = 0
for t in range(0, 800):
  utility.append((clf.coef_[t], index))
  index += 1

status = []
index = 0
for t in range(0, 800):
  status.append((clf.coef_[t],index))
  index += 1

utility.sort()
utility_words = utility[-101:]

status.sort()
status_words = status[:100]

wordID = dict((y,x) for x,y in wordID.iteritems())

for i in utility_words:
  if i[1] < len(wordID):
    print wordID[i[1]],
    print clf.coef_[i[1]],

print 

for i in status_words:
  print wordID[i[1]],
  print clf.coef_[i[1]],

etc 0.703791040928 recommended 0.706773990434 others 0.710903288905 summer 0.718522023086 rub 0.731500538475 third 0.733347376493 brands 0.739386333685 stretch 0.754331081484 several 0.757248150861 decent 0.758042220024 sole 0.760499593923 expensive 0.76149846112 seem 0.769677406724 owned 0.776638828947 months 0.784657792389 sore 0.785946414263 across 0.795120381223 arch 0.799483547767 gel 0.800186793444 break 0.800396481118 toe 0.801171431746 his 0.814021431701 arches 0.851504861441 difference 0.856638966939 hold 0.857271362912 value 0.862067565341 gone 0.871140973538 pairs 0.873642758734 lasted 0.877989823736 wet 0.883730435876 likes 0.884877665968 laces 0.887779784159 holding 0.89319940623 waterproof 0.893999683866 become 0.896973136704 work 0.898390522545 period 0.89871070511 pay 0.899482114937 durable 0.902907798177 place 0.904603406977 trip 0.921720517047 sides 0.936333980857 replacement 0.937107485971 help 0.938821721697 slip 0.95203015383 traction 0.958493008645 product 0.97379

In [40]:
from sklearn import metrics

wordID = dict((y,x) for x,y in wordID.iteritems())

y_test = numpy.array([labels[i[0]] for i in test_set])
y = []
for i in y_test:
  if i > 3:
    y.append(1)
  else:
    y.append(0)


X_test = [feature(i) for i in test_set]
p = clf.predict(X_test)
p = numpy.array(p)
score = []
for i in p:
  if i > 3:
    score.append(1)
  else:
    score.append(0)

fpr, tpr, thresholds = metrics.roc_curve(y, score)

print fpr
print tpr

[ 0.   0.5  1. ]
[ 0.          0.89795918  1.        ]


In [41]:
import matplotlib.pyplot as plt

# This is the ROC curve
plt.plot(fpr,tpr)
plt.show() 

In [26]:
import matplotlib.pyplot as plt

#wordID = dict((y,x) for x,y in wordID.iteritems())

y_test = numpy.array([labels[i[0]] for i in test_set])
X_test = [feature(i) for i in test_set]
p = clf.predict(X_test)
p = numpy.array(p)
colors = []
for i in range(0, 200):
  if y_test[i] > 3 and p[i] > 3:
    colors.append('blue')
  elif y_test[i] < 3 and p[i] < 3:
    colors.append('red')
  else:
    colors.append('purple')

plt.scatter(y_test, p, c=colors, alpha=0.5)
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.title('Scatter plot of prediction result')
#plt.grid(True)
plt.text(5.1, 5.4, 'uility goods', fontsize=10, color='blue', alpha = 0.7) 
plt.text(5.1, 5.2, 'true positive', fontsize=10, color='blue', alpha = 0.7)
plt.text(0.1, 0.3, 'status goods', fontsize=10, color='red', alpha = 0.7) 
plt.text(0.1, 0.1, 'false negative', fontsize=10, color='red', alpha = 0.7)
plt.show()

In [23]:
# the histogram of the data

y_test = numpy.array([labels[i[0]] for i in test_set])
X_test = [feature(i) for i in test_set]
p = clf.predict(X_test)
p = numpy.array(p)

correct_utility_counts = 0
correct_status_counts = 0
correct_middle_counts = 0
total_utility = 0
total_status = 0
total_middle = 0

for i in range(0,200):
  if y_test[i] > 3:
    total_utility += 1
    if p[i] > 3:
      correct_utility_counts += 1
  elif y_test[i] < 3:
    total_status += 1
    if p[i] < 3:
      correct_status_counts += 1
  elif y_test[i] == 3:
    total_middle += 1
    if p[i] == 3:
      correct_middle_counts += 1

print correct_utility_counts
print correct_status_counts
print correct_middle_counts
print total_utility
print total_status
print total_middle

a = 1.0 * correct_status_counts/total_status
print a
b = 1.0 * (total_status-correct_status_counts)/total_status
print b
print a+b

c = 1.0 * correct_utility_counts/total_utility
d = 1.0 * (total_utility-correct_utility_counts)/total_utility
print c
print d
print c+d

print 1.0*(correct_utility_counts + correct_status_counts)/(total_status+total_utility)

#n, bins, patches = plt.hist(x, 50, facecolor='g', alpha=0.7)

#plt.xlabel('Labels')
#plt.ylabel('Probability')
#plt.title('Histogram of IQ')
#plt.grid(True)
#plt.show()

88
51
0
98
94
8
0.542553191489
0.457446808511
1.0
0.897959183673
0.102040816327
1.0
0.723958333333
