In [13]:
import numpy
import scipy.optimize
from sklearn import datasets, linear_model

import parseData
import analysis

pathProducts = r"C:\Users\Yijun\Desktop\Amazon\meta_shoes_unique+fifteen_review+price.txt"
pathReviews = r"C:\Users\Yijun\Desktop\Amazon\reviews_shoes_unique+fifteen_review+price.txt"
pathLabels = r"C:\Users\Yijun\Desktop\Amazon\labels.txt"

shoes = parseData.parse(pathProducts)
reviews = parseData.parse(pathReviews)
labels = parseData.loadLabels(pathLabels)

for key, value in labels.items():
  if value == 0:
    del labels[key]

print len(labels)

sample_shoes = dict()
for i in shoes:
  if i['asin'] in labels.keys():
    sample_shoes[i['asin']] = i

sample_reviews = list()
for i in reviews:
  if i['asin'] in labels.keys():
    sample_reviews.append(i)

1579


In [14]:
# extract sub-categories

subcategories = set()
for key,value in sample_shoes.iteritems():
    for i in value['categories']:
      for j in range(0, len(i)):
        if i[j]== 'Shoes':
          if j+1 < len(i):
            subcategories.add(i[j+1])
            break

print subcategories

set(['Athletic & Outdoor', 'Outdoor', 'Loafers', 'Athletic', 'Men', 'Boots', 'Clogs & Mules', 'Fashion Sneakers', 'Women', 'Pumps', 'Oxfords', 'Flats', 'Sneakers', 'Sandals', 'Kids & Baby', 'Loafers & Slip-Ons', 'Slippers', 'Mules & Clogs'])


In [15]:
# Split data into training set, validation set, and test set

import random

training_set = random.sample(sample_shoes.items(), 1179)

for i in training_set:
  del sample_shoes[i[0]]

validation_set = random.sample(sample_shoes.items(), 200)

for i in validation_set:
  del sample_shoes[i[0]]

test_set = random.sample(sample_shoes.items(), 200)

# print training_set[0][1]

In [16]:
# Regularize price feature

min_price = 999999
max_price = 0

for i,j in sample_shoes.items():
  if j['price'] > max_price:
    max_price = j['price']
  elif j['price'] < min_price:
    min_price = j['price']
    
print min_price
print max_price

3.95
364.95


In [17]:
# Regularize popularity feature

min_reviews = 999999
max_reviews = 0

for i in sample_shoes:
  count = 0
  for j in sample_reviews:
    if i == j['asin']:
      count += 1
  if count > max_reviews:
    max_reviews = count
  elif count < min_reviews:
    min_reviews = count

print min_reviews
print max_reviews

16
444


In [18]:
# Common words dictionary

reviewList = parseData.tokenize(sample_reviews)
words = analysis.commonWords(reviewList, 800)

wordID = dict(zip(words, range(len(words))))
wordSet = set(words)

In [19]:
# Regularize word counts

word_count = [0] * len(wordID)
for r in reviewList:
  for w in r:
    if w in words:
      word_count[wordID[w]] += 1

min_occurrence = min(word_count)
max_occurrence = max(word_count)

print min_occurrence
print max_occurrence

371
169991


In [20]:
# feature: price, popularity, keywords and subcategory

def feature(key):
  feat = [0] * len(words)
  for i in range(0, len(sample_reviews)):
    if sample_reviews[i]['asin'] == key[0]:
      review = reviewList[i]
      for w in review:
        if w in words:
          feat[wordID[w]] += 1    # keyword feature
  for i in range(0, len(wordID)):
    feat[i] = 1.0*feat[i]/word_count[i]
  feat.append(key[1]['price']/max_price)  # price feature
  count = 0
  for i in sample_reviews:
    if i['asin'] == key[0]:
      count += 1
  feat.append(count/max_reviews)   # popularity feature
  cat = 0
  for i in key[1]['categories']:
    for j in range(0, len(i)):
      if i[j] == 'Shoes' and j+1 < len(i):
        cat = i[j+1]
        break
  for i in subcategories:
    if i != cat:
      feat.append(0)
    else:
      feat.append(1)
  feat.append(1)  #offset
  return feat

In [21]:
# Look at theta and residuals on training set

#wordID = dict((y,x) for x,y in wordID.iteritems())

y = [labels[i[0]] for i in training_set]
X = [feature(i) for i in training_set]

# theta,residuals,rank,s = numpy.linalg.lstsq(X, y)
# print theta
# print residuals

In [22]:
# perform ridge regression


from sklearn.linear_model import Ridge

max_score = 0
lbda = 0.0001

for a in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]:
  clf = Ridge(alpha=a)
  clf.fit(X, y) 

  y_validation = [labels[i[0]] for i in validation_set]
  X_validation = [feature(i) for i in validation_set]

  p = clf.predict(X_validation)
  print "Sum of suquared error: ", sum((b-c)**2 for (b,c) in zip(p, y_validation))
  print "Coefficient of determination R^2: ", clf.score(X, y)
  score = clf.score(X_validation, y_validation)
  print "Coefficient of determination R^2: ", score

  if score > max_score:
    max_score = score
    lbda = a

Sum of suquared error:  493.574450316
Coefficient of determination R^2:  0.787190577261
Coefficient of determination R^2:  -0.213823674416
Sum of suquared error:  260.488916363
Coefficient of determination R^2:  0.673763814281
Coefficient of determination R^2:  0.359392259057
Sum of suquared error:  212.292093711
Coefficient of determination R^2:  0.534045320807
Coefficient of determination R^2:  0.477920364246
Sum of suquared error:  232.940765266
Coefficient of determination R^2:  0.438934764492
Coefficient of determination R^2:  0.427140089129
Sum of suquared error:  240.092755777
Coefficient of determination R^2:  0.405665331884
Coefficient of determination R^2:  0.409551546211
Sum of suquared error:  241.46085715
Coefficient of determination R^2:  0.392009557499
Coefficient of determination R^2:  0.40618704095
Sum of suquared error:  272.813425137
Coefficient of determination R^2:  0.315162981478
Coefficient of determination R^2:  0.329083193189
Sum of suquared error:  362.5338760

In [41]:
# Find out the most status/utility words

clf = Ridge(alpha=lbda)
clf.fit(X, y)

utility = []
index = 0
for t in range(0, 800):
  utility.append((clf.coef_[t], index))
  index += 1

status = []
index = 0
for t in range(0, 800):
  status.append((clf.coef_[t],index))
  index += 1

utility.sort()
utility_words = utility[-101:]

status.sort()
status_words = status[:100]

wordID = dict((y,x) for x,y in wordID.iteritems())

for i in utility_words:
  if i[1] < len(wordID):
    print wordID[i[1]],
    #print clf.coef_[i[1]],

print 

for i in status_words:
  print wordID[i[1]],
  #print clf.coef_[i[1]],

use comfortably care kept expect brand flat product longer pleased pairs help slipping went told easy find item needed held inserts using comes shape mine their ground toe easily nothing ones poor higher cheaper seller solid house sent insoles trouble given money work provide flexible period type stores average lasted clarks saucony things rather sandal broken sandals stay hoping waterproof across daily decent based red break blue clean match inches husband water keeps pay arch slip others insole green likes plantar spend arches broke local weeks customer office flats rubber stitching finding keen job slipper flops plastic flip hiking velcro crocs
fun sexy white upper cute compliments beautiful cool skirts dress picture fur adorable awesome absolutely plan hit smell footbed dont gift clogs dance leg twice gotten christmas jeans cozy saw im zipper legs height forward lots birkenstock gorgeous calves fantastic please described show taking fabric wedding loved five skinny exchange oh anyw

In [42]:
from sklearn import metrics

wordID = dict((y,x) for x,y in wordID.iteritems())

y_test = numpy.array([labels[i[0]] for i in test_set])
y = []
for i in y_test:
  if i > 3:
    y.append(1)
  else:
    y.append(0)


X_test = [feature(i) for i in test_set]
p = clf.predict(X_test)
p = numpy.array(p)
score = []
for i in p:
  if i > 3:
    score.append(1)
  else:
    score.append(0)

fpr, tpr, thresholds = metrics.roc_curve(y, score)

print fpr
print tpr

[ 0.          0.31182796  1.        ]
[ 0.          0.87850467  1.        ]


In [43]:
import matplotlib.pyplot as plt

# This is the ROC curve
plt.plot(fpr,tpr)
plt.show() 

In [27]:
import matplotlib.pyplot as plt

wordID = dict((y,x) for x,y in wordID.iteritems())

y_test = numpy.array([labels[i[0]] for i in test_set])
X_test = [feature(i) for i in test_set]
p = clf.predict(X_test)
p = numpy.array(p)
colors = []
for i in range(0, 200):
  if y_test[i] > 3 and p[i] > 3:
    colors.append('blue')
  elif y_test[i] < 3 and p[i] < 3:
    colors.append('red')
  else:
    colors.append('purple')

plt.scatter(y_test, p, c=colors, alpha=0.5)
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.title('Scatter plot of prediction result')
#plt.grid(True)
plt.text(5.1, 5.4, 'uility goods', fontsize=10, color='blue', alpha = 0.7) 
plt.text(5.1, 5.2, 'true positive', fontsize=10, color='blue', alpha = 0.7)
plt.text(0.1, 0.3, 'status goods', fontsize=10, color='red', alpha = 0.7) 
plt.text(0.1, 0.1, 'false negative', fontsize=10, color='red', alpha = 0.7)
plt.show()

In [28]:
# the histogram of the data

y_test = numpy.array([labels[i[0]] for i in test_set])
X_test = [feature(i) for i in test_set]
p = clf.predict(X_test)
p = numpy.array(p)

correct_utility_counts = 0
correct_status_counts = 0
total_utility = 0
total_status = 0

for i in range(0,200):
  if y_test[i] > 3:
    total_utility += 1
    if p[i] > 3:
      correct_utility_counts += 1
  elif y_test[i] < 3:
    total_status += 1
    if p[i] < 3:
      correct_status_counts += 1

print correct_utility_counts
print correct_status_counts
print total_utility
print total_status

print 1.0 * correct_status_counts/total_status
print 1.0 * (total_status-correct_status_counts)/total_status
print 1.0 * correct_utility_counts/total_utility
print 1.0 * (total_utility-correct_utility_counts)/total_status

#n, bins, patches = plt.hist(x, 50, facecolor='g', alpha=0.7)

#plt.xlabel('Labels')
#plt.ylabel('Probability')
#plt.title('Histogram of IQ')
#plt.grid(True)
#plt.show()

94
62
107
88
0.704545454545
0.295454545455
0.878504672897
0.147727272727
