In [53]:
import csv
import math

with open('formatted_train.tsv','r') as f:
    reader = csv.reader(f,delimiter = '\t')
    content = list(reader)

In [59]:
word_dict = dict()
with open('dict.txt','r') as f:
    for line in f.readlines():
        [word, num_str] = line.split()
        word_dict[word] = int( num_str )

dim = len(word_dict)
dim

39176

In [138]:
# Initialize W 
W = [0]*dim + [0]    # dim + 1 for the bias term

In [139]:
label_train = []
feature_train = []

num_feature = len(content)

for line in content:
    label_train.append( int(line[0]) )
    comment = line[1:]
    feature_dict = dict()
    for ele in comment:
        [key,val] = ele.split(':')
        feature_dict[ int(key) ] = int(val)
    feature_dict[ dim ] = 1        # bias term in the end
    feature_train.append( feature_dict )
    

In [153]:
def sparse_dot(X,W):
    product = 0.0
    for key in X.keys():
        product += X[key]*W[key]
    return product

def cal_loss_i(theta,x,y):
    loss_i = -sparse_dot(x, theta)*float(y) + math.log(  1 + math.exp(sparse_dot(x, theta))  )
    return loss_i

def cal_loss(feature,label,W):
    
    num_data = len(feature)
    loss = 0.0
    
    for i in range(num_data):
        loss += cal_loss_i(W, feature[i], label[i])
    return loss

def cal_gradient(theta,x,y):
    gradient = {}
    exp_term = math.exp( sparse_dot(x,theta) )
    for key in x.keys():
        
        gradient[key] = -x[key]*(y - exp_term/( 1 + exp_term ) )
    return gradient   

def update(W,gradient,learning_rate):
    
    for key in gradient.keys():
        W[key] -= learning_rate*gradient[key]
    return W

def train(feature,label,W,num_epoch, learning_rate):
    
    num_data = len(feature)
    loss = []
    
    for i in range(num_epoch):
        seq_epoch = np.arange(num_data)
        random.shuffle(seq_epoch)
        loss.append( cal_loss(feature,label,W) )
        for epoch in seq_epoch:
            gradient = cal_gradient( W, feature[epoch], label[epoch])
            W = update(W,gradient,learning_rate)
            
        
    return loss
 
    

In [154]:
train(feature_train,label_train,W, 60, 0.1)

[69.31471805599459,
 65.60096839542086,
 1.22658368053563,
 0.21746094533933047,
 0.1648003689501009,
 0.13921169191519928,
 0.12174987258242352,
 0.10883260052605662,
 0.098754591186915,
 0.09063190250594039,
 0.08389944807943109,
 0.07820719945145117,
 0.07332752423312809,
 0.06908822688818253,
 0.06536584252456691,
 0.06205078514043322,
 0.05909498796341051,
 0.05643904364781495,
 0.05403444074435937,
 0.0518431486818573,
 0.04984029073176899,
 0.047998923208343806,
 0.04630199027742845,
 0.0447305810225166,
 0.04327163286556911,
 0.04191327387349355,
 0.04064488139199807,
 0.03945689574648521,
 0.038342528512167066,
 0.03729490322400784,
 0.036307024469008256,
 0.035374025086895855,
 0.03449185710354277,
 0.033656160795302856,
 0.032862832236373984,
 0.032109138963451626,
 0.03139200275404292,
 0.030708090143230317,
 0.030056066356957185,
 0.029433132655375623,
 0.028837344074368602,
 0.028267160836230765,
 0.02772048753298678,
 0.027196069569026635,
 0.026692600516005766,
 0.02620

In [162]:
with open('formatted_test.tsv','r') as f:
    reader = csv.reader(f,delimiter = '\t')
    content = list(reader)

label_test = []
feature_test = []

num_feature = len(content)

for line in content:
    label_test.append( int(line[0]) )
    comment = line[1:]
    feature_dict = dict()
    for ele in comment:
        [key,val] = ele.split(':')
        feature_dict[ int(key) ] = int(val)
    feature_dict[ dim ] = 1        # bias term in the end
    feature_test.append( feature_dict )    

In [163]:
def predict(feature,label,W):
    num_data = len(feature)
    pred_label = []
    error_num = 0
    
    for i in range(num_data):
        x = feature[i]
        #y = 1/( 1 + math.exp(-sparse_dot(x,W)) )
        y = math.exp( sparse_dot(x,W) )/( 1 + math.exp( sparse_dot(x,W) ) )
        if y >= 0.5:
            pred_label.append(1)
        else:
            pred_label.append(0)
            
    for y1,y2 in zip(pred_label,label):
        if y1 != y2:
            error_num += 1
    
    accuracy = error_num/num_data
    

    return accuracy
    
    

In [164]:
predict(feature_test,label_test,W)

0.0

In [165]:
predict(feature_train,label_train,W)

0.0