In [18]:
import numpy as np
import os
import pickle
import random
import time
from collections import Counter

In [2]:
def folder_list(path , label):
    """Load every entry of a directory as a labelled review.

    Parameters
    ----------
    path : str
        Directory whose entries are each passed to ``read_data``.
    label : object
        Value attached to every review read from ``path``.

    Returns
    -------
    list
        One ``[words, label]`` pair per directory entry, ordered by file name.
    """
    review = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any seeded shuffle applied to it afterwards) reproducible across
    # machines and file systems.
    for infile in sorted(os.listdir(path)) :
        file = os.path.join(path , infile)
        review.append([read_data(file) , label])
    return review

In [3]:
def read_data(file):
    """Read a text file and return its space-separated tokens.

    Each line is stripped and split on single spaces. Tokens that occur as a
    substring of the symbol string below are discarded.

    Parameters
    ----------
    file : str
        Path of the file to read (opened with the platform default encoding).

    Returns
    -------
    list of str
        The remaining tokens, in file order.
    """
    symbols = '${}()[].,:;+-*/&|<>=~" '
    words = []
    # The context manager closes the handle even if reading raises.
    with open(file) as f :
        for line in f :
            tokens = line.strip().split(' ')
            # `w not in symbols` is a substring test against a string, so it
            # removes single punctuation tokens and also the empty strings
            # produced by consecutive spaces or blank lines.
            # extend() appends in place instead of copying the list per line.
            words.extend(w for w in tokens if w not in symbols)
    return words

In [4]:
def split_review(review , train_size = 1500 , valid_size = 500) :
    """Split labelled reviews into training and validation sets.

    The first ``train_size`` items form the training set and the following
    ``valid_size`` items form the validation set. Each item's first element
    (its word list) is converted to a ``Counter`` as a sparse bag-of-words
    representation; its second element is taken as the label.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)``.
    """
    def to_features(rows) :
        # Sparse representation: word -> occurrence count.
        return [Counter(row[0]) for row in rows]

    def to_labels(rows) :
        return [row[1] for row in rows]

    valid_end = train_size + valid_size
    train_rows = review[ : train_size]
    valid_rows = review[train_size : valid_end]
    return (
        to_features(train_rows) ,
        to_labels(train_rows) ,
        to_features(valid_rows) ,
        to_labels(valid_rows) ,
    )

In [5]:
# Directories read by folder_list, relative to the working directory.
pos_path = 'data/pos'
neg_path = 'data/neg'
# Entries under pos_path are labelled +1, entries under neg_path are labelled -1.
pos_review = folder_list(pos_path , 1)
neg_review = folder_list(neg_path , -1)
review = pos_review + neg_review
# Seed the RNG so the shuffle is repeatable for a given input order.
random.seed(2020)
random.shuffle(review)


In [6]:
# Uses split_review's defaults: the first 1500 shuffled reviews for training,
# the next 500 for validation.
X_train , y_train , X_valid , y_valid  = split_review(review)

In [7]:
def dotProduct(X , w) :
    """Return the dot product of two sparse vectors stored as dicts.

    Only the keys of ``X`` are visited; a key missing from ``w`` contributes
    zero.
    """
    total = 0
    for feature , value in X.items() :
        total = total + w.get(feature , 0) * value
    return total

In [8]:
def update(w , X , scale) :
    """Add ``scale * X`` to the sparse vector ``w`` in place.

    Keys of ``X`` that are absent from ``w`` are created starting from zero.
    ``w`` and ``X`` may be the same dict, in which case every entry is
    multiplied by ``1 + scale``.
    """
    for key in X :
        w[key] = w.get(key , 0) + scale * X[key]

In [19]:
def pegasos(X_train , y_train , X_valid , y_valid , weight , num_iter = 1 , lambda_reg = 0.01) :
    """Train a linear classifier with the Pegasos update rule, in place.

    At step ``t`` the step size is ``eta = 1 / (lambda_reg * t)``. Every step
    shrinks the whole weight vector by ``(1 - eta * lambda_reg)``; when the
    example's margin ``y * <w, x>`` is below 1, ``eta * y * x`` is added too.

    Parameters
    ----------
    X_train : sequence of dict
        Sparse feature vectors (feature -> value).
    y_train : sequence
        Labels aligned with ``X_train``.
    X_valid , y_valid :
        Not used by this function; kept so existing callers keep working.
    weight : dict
        Initial sparse weight vector; it is mutated and returned.
    num_iter : int
        Number of passes over the training data.
    lambda_reg : float
        Regularisation strength.

    Returns
    -------
    dict
        The same ``weight`` object after training.
    """
    num_instances = len(X_train)
    t = 0
    for d in range(num_iter) :
        for i in range(num_instances) :
            t += 1
            eta = 1 / (lambda_reg * t)
            # The margin must be evaluated on the weights *before* this
            # step's shrink is applied.
            violates_margin = y_train[i] * dotProduct(X_train[i] , weight) < 1
            # The shrink happens on every step; it touches every stored
            # weight, so each step costs time proportional to len(weight).
            update(weight , weight , - lambda_reg * eta)
            if violates_margin :
                update(weight , X_train[i] , eta * y_train[i])
    # The former post-training hinge-loss computation was removed: its result
    # was never used, it cost an extra full pass, and it divided by zero for
    # an empty training set.
    return weight

In [20]:
def pegasos_tricks(X_train , y_train , X_valid , y_valid , weight , num_iter = 10 , lambda_reg = 0.01) :
    """Pegasos training that keeps the weights as ``factor * weight``.

    Instead of shrinking every stored weight on each step, the shrink is
    folded into a single scalar; only the features of a margin-violating
    example are touched. The scalar is multiplied back into ``weight`` once,
    after the last pass.

    ``X_valid`` and ``y_valid`` are accepted but not used. ``weight`` is
    mutated in place and returned.
    """
    n_samples = len(X_train)
    # The counter starts at 2, so the first step uses 3; with a value of 1 the
    # shrink factor (1 - eta * lambda_reg) would be zero and the division
    # below would fail.
    step = 2
    factor = 1
    for _ in range(num_iter) :
        for idx , features in enumerate(X_train[ : n_samples]) :
            step += 1
            eta = 1 / (lambda_reg * step)
            factor = (1 - eta * lambda_reg) * factor
            margin = y_train[idx] * factor * dotProduct(features , weight)
            if margin < 1 :
                # Divide by the factor so that factor * weight receives the
                # plain eta * y * x increment.
                update(weight , features , eta * y_train[idx] / factor)
    # Materialise the true weights by applying the accumulated factor.
    for key in weight :
        weight[key] *= factor
    return weight

In [22]:
def validate(X_valid , y_valid , weight) :
    """Print the fraction of examples whose label agrees in sign with the score.

    An example counts as correct when ``y * <x, weight>`` is strictly
    positive. Nothing is returned.
    """
    hits = sum(
        1
        for idx in range(len(y_valid))
        if y_valid[idx] * dotProduct(X_valid[idx] , weight) > 0
    )
    print (hits / len(y_valid))

In [23]:
# Time the plain Pegasos implementation, starting from an empty weight dict.
start_time = time.time()
weight = {}
weight = pegasos(X_train , y_train , X_valid , y_valid , weight , num_iter = 50 , lambda_reg = 0.1)
end_time = time.time()
print (end_time - start_time)
# Time the scaled-weight variant with the same hyper-parameters.
start_time = time.time()
w = {}
w = pegasos_tricks(X_train , y_train , X_valid , y_valid , w , num_iter = 50 , lambda_reg = 0.1)
end_time = time.time()
print (end_time - start_time)

0.06303697777777541
678.3316600322723
7.65860390663147


In [15]:
# Print the accuracy of the scaled-weight model on the validation set, then
# on the training set.
validate(X_valid , y_valid , w)
validate(X_train , y_train , w)

In [16]:
#validate(X_valid , y_valid , weight)


0.77
0.96


In [17]:
# Look up the learned weight of two individual words. Direct indexing raises
# KeyError if a word never received an update during training.
print (w['disappointment'])
print (w['happy'])

-0.11998400213304815
0.37994934008798564


In [None]:
# Displays the entire weight dict as the cell output.
# NOTE(review): this can be very long; consider showing only a few entries.
w