In [201]:
import csv
import math

# Read the tab-separated training file into `content`, a list of rows where
# each row is a list of string fields.
# newline='' hands line-ending handling to the csv module, which is how the
# csv documentation says files passed to csv.reader should be opened.
with open('formatted_train.tsv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    content = list(reader)

In [188]:
# Build the vocabulary from dict.txt: every non-blank line holds a word and
# an integer, separated by whitespace.
word_dict = dict()
with open('dict.txt','r') as f:
    for line in f:                  # stream the file instead of readlines()
        fields = line.split()
        if not fields:              # skip blank lines rather than crash on unpack
            continue
        word, num_str = fields
        word_dict[word] = int(num_str)

# Vocabulary size. The parsing cells also use `dim` as the index of the
# bias feature.
dim = len(word_dict)
dim

39176

In [189]:
# Weight vector, all zeros: one slot per vocabulary index, plus a final slot
# at index `dim` for the bias term.
W = [0] * (dim + 1)

In [202]:
# Turn the raw training rows into parallel lists:
#   label_train[i]   - integer label taken from the first field of row i
#   feature_train[i] - sparse {feature_index: value} dict built from the
#                      remaining "index:value" fields of row i
num_feature = len(content)

label_train = [int(row[0]) for row in content]

feature_train = []
for row in content:
    sparse_row = {}
    for token in row[1:]:
        index, count = token.split(':')
        sparse_row[int(index)] = int(count)
    sparse_row[dim] = 1             # constant input for the bias term, stored last
    feature_train.append(sparse_row)
    

In [196]:
def sparse_dot(X,W):
    """Dot product of a sparse vector with a dense one.

    X maps index -> value; W is indexed by those same indices. Only the
    entries present in X contribute. The accumulator starts at 0.0, so an
    empty X yields 0.0.
    """
    total = 0.0
    for index, value in X.items():
        total += value * W[index]
    return total

def cal_loss_i(theta,x,y):
    """Logistic-regression loss for a single example.

    Computes -y*z + log(1 + exp(z)) where z = sparse_dot(x, theta).

    theta: dense weights, indexable by the keys of x.
    x:     sparse feature dict (index -> value).
    y:     label, converted with float(); presumably 0 or 1 - confirm with caller.

    Returns the loss as a float.
    """
    z = sparse_dot(x, theta)        # score, computed once
    # log(1 + exp(z)) rewritten as max(z, 0) + log1p(exp(-|z|)).
    # exp() now only ever sees a non-positive argument, so large scores no
    # longer raise OverflowError.
    softplus = max(z, 0.0) + math.log1p(math.exp(-abs(z)))
    return -z * float(y) + softplus

def cal_loss(feature,label,W):
    """Total loss over a dataset.

    Adds up cal_loss_i(W, feature[i], label[i]) for every position i in
    feature and returns the sum as a float (0.0 when feature is empty).
    """
    total = 0.0
    for idx, example in enumerate(feature):
        total += cal_loss_i(W, example, label[idx])
    return total

def cal_gradient(theta,x,y):
    """Sparse gradient of the single-example loss with respect to theta.

    With z = sparse_dot(x, theta) and p = exp(z) / (1 + exp(z)), the entry
    for each index k present in x is -x[k] * (y - p). Indices absent from x
    are not included in the result.

    theta: dense weights, indexable by the keys of x.
    x:     sparse feature dict (index -> value).
    y:     numeric label.

    Returns a dict index -> gradient component.
    """
    z = sparse_dot(x, theta)
    # Evaluate the sigmoid without ever exponentiating a large positive
    # number; the direct form exp(z)/(1+exp(z)) raises OverflowError once z
    # is large enough.
    if z >= 0:
        prob = 1.0 / (1.0 + math.exp(-z))
    else:
        exp_z = math.exp(z)
        prob = exp_z / (1.0 + exp_z)
    residual = y - prob
    return {key: -value * residual for key, value in x.items()}

def update(W,gradient,learning_rate):
    """Apply one gradient-descent step to W in place and return W.

    Only the indices present in the sparse gradient dict are modified:
    W[k] decreases by learning_rate * gradient[k].
    """
    for index, grad_value in gradient.items():
        W[index] -= learning_rate * grad_value
    return W

def train(feature,label,W,num_epoch, learning_rate):
    """Train by per-example gradient descent and return the loss history.

    For each of num_epoch passes, the examples are visited in the order
    given and W is stepped once per (x, y) pair through cal_gradient and
    update.

    feature:       sequence of sparse feature dicts.
    label:         labels paired with feature via zip.
    W:             weight list; passed to update(), which modifies it in
                   place, so the caller's list holds the trained weights.
    num_epoch:     number of full passes over the data.
    learning_rate: step size handed to update().

    Returns a list with one cal_loss value per epoch, each measured BEFORE
    that epoch's updates; the loss after the final epoch is not included.
    """
    loss = []
    for _ in range(num_epoch):
        loss.append(cal_loss(feature, label, W))
        for x, y in zip(feature, label):
            W = update(W, cal_gradient(W, x, y), learning_rate)
    return loss
 
    

In [197]:
# Fit on the training set: 60 passes with step size 0.1. The call is the
# cell's last expression, so the returned loss list is displayed.
train(feature_train, label_train, W, num_epoch=60, learning_rate=0.1)

[0.25777604938075427,
 0.24967104290248354,
 0.2421207642477205,
 0.23506810908281778,
 0.22846361676803417,
 0.2222642355820034,
 0.21643231824519915,
 0.21093479956544914,
 0.20574251910596358,
 0.2008296600948488,
 0.1961732820922456,
 0.19175292973164457,
 0.18755030353828098,
 0.18354898167806485,
 0.17973418370761526,
 0.17609256913308985,
 0.1726120649522925,
 0.16928171743917478,
 0.1660915642921377,
 0.1630325239584613,
 0.16009629950382614,
 0.15727529484528374,
 0.15456254153119248,
 0.1519516345504225,
 0.14943667589628895,
 0.14701222481344786,
 0.14467325382033658,
 0.14241510973896904,
 0.14023347907777775,
 0.13812435720940563,
 0.13608402086552937,
 0.13410900353847977,
 0.1321960734364544,
 0.13034221368729942,
 0.12854460452672792,
 0.12680060724219072,
 0.12510774967255456,
 0.1234637130903902,
 0.12186632031413809,
 0.12031352491791235,
 0.11880340142121894,
 0.11733413635629199,
 0.11590402012238896,
 0.11451143954653784,
 0.11315487108039703,
 0.11183287456994451

In [193]:
# Read the tab-separated test file. newline='' hands line-ending handling to
# the csv module, which is how the csv documentation says files passed to
# csv.reader should be opened.
# NOTE(review): this rebinds `content`, the same name the training-load cell
# uses, so the training rows are no longer reachable through it after this
# cell runs.
with open('formatted_test.tsv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    content = list(reader)

num_feature = len(content)

# Parallel lists, built the same way as for the training data:
#   label_test[i]   - integer label taken from the first field of row i
#   feature_test[i] - sparse {feature_index: value} dict built from the
#                     remaining "index:value" fields of row i
label_test = []
feature_test = []
for row in content:
    label_test.append(int(row[0]))
    sparse_row = {}
    for token in row[1:]:
        index, count = token.split(':')
        sparse_row[int(index)] = int(count)
    sparse_row[dim] = 1             # constant input for the bias term, stored last
    feature_test.append(sparse_row)

In [184]:
def predict(feature,label,W):
    """Classify every example, print the accuracy, and return the predictions.

    feature: sequence of sparse feature dicts.
    label:   true labels, paired with the predictions via zip.
    W:       weights passed to sparse_dot together with each example.

    An example is predicted 1 when its score z = sparse_dot(x, W) satisfies
    z >= 0, otherwise 0.

    Prints "accu:" followed by the fraction of examples whose prediction
    equals its label ("accu:nan" when feature is empty).

    Returns the list of predicted labels (0/1), one per example.
    """
    pred_label = []
    for x in feature:
        # sigmoid(z) >= 0.5 exactly when z >= 0, so threshold the score
        # directly: no exp() call, hence no OverflowError on large scores.
        pred_label.append(1 if sparse_dot(x, W) >= 0 else 0)

    num_data = len(feature)
    correct_num = sum(1 for y1, y2 in zip(pred_label, label) if y1 == y2)

    # Count matches, not mismatches: the previous code divided the number of
    # wrong predictions by num_data and so printed the error rate as "accu:".
    # Guard the division so an empty dataset does not raise ZeroDivisionError.
    accuracy = correct_num / num_data if num_data else float('nan')
    print("accu:" + str(accuracy))

    return pred_label
    
    

SyntaxError: invalid syntax (<ipython-input-199-05061865a1d9>, line 2)