In [94]:
import numpy
import urllib
import scipy.optimize
import random
from math import *

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
data = list(parseData("file:beer_50000.json"))
print "done"

def feature(datum):
  text = datum['review/text'].lower().replace(',',' ').replace('?',' ')\
          .replace('!',' ').replace(':',' ').replace('"',' ').replace('.',' ')\
          .replace('(',' ').replace(')',' ').split()
  num_lactic = 0
  num_tart = 0
  num_sour = 0
  num_citric = 0
  num_sweet = 0
  num_acid = 0
  num_hop = 0
  num_fruit = 0
  num_salt = 0
  num_spicy = 0
  for word in text:
    if word == 'lactic':  num_lactic += 1
    if word == 'tart':  num_tart += 1
    if word == 'sour':  num_sour += 1
    if word == 'citric':  num_citric += 1
    if word == 'sweet':  num_sweet += 1
    if word == 'acid':  num_acid += 1
    if word == 'hop':  num_hop += 1
    if word == 'fruit':  num_fruit += 1
    if word == 'salt':  num_salt += 1
    if word == 'spicy':  num_spicy += 1
        
  feat = [1, num_lactic, num_tart, num_sour, \
         num_citric, num_sweet, num_acid, num_hop, \
         num_fruit, num_salt, num_spicy] 
  return feat

X = [feature(d) for d in data]
y = [d['beer/ABV'] >= 6.5 for d in data]

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  res = 1.0 / (1 + exp(-x))
  return res

Reading data...
done


In [95]:
length = int(len(data)/3)

X_train = X[:length+1]
y_train = y[:length+1]

X_validation = X[length+1:2*length+1]
y_validation = y[length+1:2*length+1]

X_test = X[2*length+1:]
y_test = y[2*length+1:]

print len(X_train)
print len(X_validation)
print len(X_test)

16667
16666
16667


In [96]:
# Count for number of total data, y=0 and y=1
num_total = len(y_train)
num_y0 = y_train.count(0)
num_y1 = y_train.count(1)

In [97]:
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    if y[i]:
      loglikelihood -= log(1 + exp(-logit)) * num_total / (2 * num_y1)
    if not y[i]:
      loglikelihood -= (log(1 + exp(-logit)) + logit ) * num_total / (2 * num_y0)
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  # for debugging
  # print("ll =" + str(loglikelihood))
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      if y[i]:
        dl[k] += X[i][k] * (1 - sigmoid(logit)) * num_total / (2 * num_y1)
      if not y[i]:
        dl[k] -= X[i][k] * (1 - sigmoid(logit)) * num_total / (2 * num_y0)
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])

def train(lam):
  theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
  return theta

In [98]:
X_data = [X_train, X_validation, X_test]
y_data = [y_train, y_validation, y_test]
symbol = ['train', 'valid', 'test']
print 'λ\tDataset\t\tTruePositive\tFalsePositive\tTrueNegative\tFalseNegative\tAccuracy\tBER'
for lam in [0, 0.01, 0.1, 1, 100]:
    theta = train(lam)
    for i in range(3):
        def TP(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==1) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
          tp = sum(correct) * 1.0
          return tp

        def TN(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==0) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
          tn = sum(correct) * 1.0
          return tn

        def FP(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==1) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
          fp = sum(correct) * 1.0
          return fp

        def FN(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==0) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
          fn = sum(correct) * 1.0
          return fn
        
        if i == 1 :
            tp = TP(theta)
            fp = FP(theta)
            tn = TN(theta)
            fn = FN(theta)
            TPR = tp / (tp + fn)
            TNR = tn / (tn + fp)
            BER = 1 - 0.5 * (TPR + TNR)
            accuracy = (tp+tn)/(tp+tn+fp+fn)
            print str(lam)+'\t'+symbol[i]+'\t\t'+str(tp)+'\t\t'+str(fp)+'\t\t'+str(tn)+'\t\t'+str(fn)+'\t\t'+str(accuracy)+'\t'+str(BER)

λ	Dataset		TruePositive	FalsePositive	TrueNegative	FalseNegative	Accuracy	BER
0	valid		7289.0		210.0		548.0		8619.0		0.470238809552	0.409423860682
0.01	valid		7289.0		210.0		548.0		8619.0		0.470238809552	0.409423860682
0.1	valid		7289.0		210.0		548.0		8619.0		0.470238809552	0.409423860682
1	valid		7289.0		210.0		548.0		8619.0		0.470238809552	0.409423860682
100	valid		7276.0		209.0		549.0		8632.0		0.469518780751	0.409172829522
1000	valid		7357.0		213.0		545.0		8551.0		0.474138965559	0.409265463088
10000	valid		9676.0		444.0		314.0		6232.0		0.599423976959	0.488752278106


In [99]:
X_data = [X_train, X_validation, X_test]
y_data = [y_train, y_validation, y_test]
symbol = ['train', 'valid', 'test']
print 'λ\tDataset\t\tTruePositive\tFalsePositive\tTrueNegative\tFalseNegative\tAccuracy\tBER'
for lam in [100]:
    theta = train(lam)
    for i in range(3):
        def TP(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==1) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
          tp = sum(correct) * 1.0
          return tp

        def TN(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==0) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
          tn = sum(correct) * 1.0
          return tn

        def FP(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==1) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
          fp = sum(correct) * 1.0
          return fp

        def FN(theta):
          scores = [inner(theta,x) for x in X_data[i]]
          predictions = [s > 0 for s in scores]
          correct = [((a==0) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
          fn = sum(correct) * 1.0
          return fn

        tp = TP(theta)
        fp = FP(theta)
        tn = TN(theta)
        fn = FN(theta)
        TPR = tp / (tp + fn)
        TNR = tn / (tn + fp)
        BER = 1 - 0.5 * (TPR + TNR)
        accuracy = (tp+tn)/(tp+tn+fp+fn)
        print str(lam)+'\t'+symbol[i]+'\t\t'+str(tp)+'\t\t'+str(fp)+'\t\t'+str(tn)+'\t\t'+str(fn)+'\t\t'+str(accuracy)+'\t'+str(BER)

λ	Dataset		TruePositive	FalsePositive	TrueNegative	FalseNegative	Accuracy	BER
100	train		4371.0		2480.0		4838.0		4978.0		0.552528949421	0.43567688614
100	valid		7276.0		209.0		549.0		8632.0		0.469518780751	0.409172829522
100	test		2753.0		3811.0		6944.0		3159.0		0.581808363833	0.444341878624
