In [3]:
import numpy as np
import scipy as sp
from collections import defaultdict
from collections import Counter
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from scipy import linalg
import re
import copy
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from porter2stemmer import Porter2Stemmer
stemmer = Porter2Stemmer()
#print(stemmer.stem('conspicuous'))si
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents,
    each of which is a list of word/terms in the document.
    """
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    # Remove all ratings
    for d in docs:
        #d = d[1:]
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
    print nrows
    print ncols
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        #d = d[1:]
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat

#@profile
def filterLen(docs, minlen):
    return [ [t for t in d if len(t) >= minlen ] for d in docs ]

def stemDoc(docs):
    """ automatically removes suffixes (and in some cases prefixes) in order to
    find the root word or stem of a given word
    """
    return [ [stemmer.stem(t) for t in d ] for d in docs]

def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf.
    Returns scaling factors as dict. If copy is True,
    returns scaled matrix and scaling factors.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]

    return df if copy is False else mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm.
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum

    if copy is True:
        return mat


def grouper(input_list, n = 2):
    for i in xrange(len(input_list) - (n - 1)):
        yield input_list[i:i+n]

def get_k_mers(input_list):
    new_list = []
    #new_list.extend(input_list)
    for first, second in grouper(input_list, 2):
        st = first + " "+second
        new_list.append(st)

    for first, second, third in grouper(input_list, 3):
        st = first + " "+second + " "+third
        new_list.append(st)

    return new_list


k = [5]

k_values = k
train_file = "train.dat"
test_file="test.dat"
result_file = "format.dat"


print 'Working on training set'
print train_file
i=0
with open(train_file, "r") as fh:
	lines = fh.readlines()
	i = i + 1
	print i

train_labels = [int(l[:2]) for l in lines]
print "train_labels"
train_docs = [re.sub(r'[^\w]', ' ',l[2:].lower()).split() for l in lines]
print "train_docs"

train_reviews_stem = [[t for t in d if len(t) >= 2 ] for d in train_docs ]
train_reviews = [[stemmer.stem(t) for t in d ] for d in train_reviews_stem]
print "LOL"
for t in train_reviews:
    new_list = get_k_mers(t)
    t.extend(new_list)
num_train_samples = len(train_reviews)

print 'Working on  test file'
with open(test_file, "r") as fh:
    test_lines = fh.readlines()
        
    #print "reached end of file"
test_docs = [re.sub(r'[^\w]', ' ',l.lower()).split() for l in test_lines]
#print "test_docs done"
test_reviews_stem = [[t for t in d if len(t) >= 2 ] for d in test_docs]
test_reviews = [[stemmer.stem(t) for t in d ] for d in test_reviews_stem]
for t in test_reviews:
    new_list = get_k_mers(t)
    t.extend(new_list)
num_test_samples = len(test_reviews)

train_reviews.extend(test_reviews)

   
print 'Building CSR matrix'
    # 7. Build csr_matrix with train and test reviews

csr_mat = build_matrix(train_reviews)

mat1 = csr_idf(csr_mat, copy=True)
mat = csr_l2normalize(mat1, copy=True)

print 'Calculate Cosine Similarity'
similarities_sparse = cosine_similarity(mat,dense_output=False)

print 'Finally, caluclating nearest neighbours'
   
all_test_labels = []
for k in k_values:
    test_labels = []
    for test_review_index in range(num_train_samples, num_train_samples +num_test_samples):
        similarity = similarities_sparse[test_review_index, :num_train_samples].toarray().tolist()[0]
        similarity_with_labels = zip(similarity, train_labels, range(len(train_labels)))

        sorted_similarity_with_labels = sorted(similarity_with_labels, key=lambda (val, k, l): val, reverse=True)
           
        tmp = 0

        for j in range(k):
            if sorted_similarity_with_labels[j][0] != 0:
                tmp += int(sorted_similarity_with_labels[j][1])
            if tmp == 0:
                while tmp == 0:
                    tmp = np.random.randint(-1,2)
        if tmp > 0:
            test_labels.append(1)
            tst = 1
        else:
            test_labels.append(-1)
            tst = -1

    
    all_test_labels.append(test_labels)


result = open(result_file, 'w')
for t in all_test_labels[0]:
    if t == 1:
        result.write("+1")
    else:
        result.write("-1")
    result.write("\n")
result.close()

Working on training set
train.dat
1
train_labels
train_docs
LOL
Working on  test file
Building CSR matrix
50000
8425213
Calculate Cosine Similarity


MemoryError: 