### Load the text data and do the preprocessing

In [165]:
import pandas as pd
import numpy as np
from collections import defaultdict
import string
import operator 

# Text preprocessing pipeline for the train / test corpora.
#
# Both files are tab-separated without a header row; 'train.dat' carries a
# class label in its first column, 'test.dat' holds text only.
# Steps, applied in this order to each frame independently:
#   lowercase -> strip punctuation and digits -> drop English stop words ->
#   drop the frame's 10 most frequent words -> drop its last 10 words in
#   frequency order -> Porter stemming -> lemmatization.
#
# NOTE(review): the common / rare word lists are derived from each frame
# separately, so train and test may have different words removed - confirm
# this is intended.

# The stop-word list requires the NLTK 'stopwords' corpus to be downloaded:
#   import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import Word

data_df = pd.read_csv('train.dat', header=None, names=['Class', 'Text'], sep='\t')
test_df = pd.read_csv('test.dat', header=None, names=['Text'], sep='\t')

# Built once (not once per row): deletes punctuation and digits in one pass.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)


def _without(text, banned):
    """Return `text` re-joined with single spaces, minus tokens in `banned`."""
    return " ".join(tok for tok in text.split() if tok not in banned)


def _per_token(text, transform):
    """Return `text` with `transform` applied to every whitespace token."""
    return " ".join(transform(tok) for tok in text.split())


def _extreme_words(texts, most_common):
    """Return the first 10 (or last 10) words of `texts` in frequency order."""
    counts = pd.Series(' '.join(texts).split()).value_counts()
    picked = counts.index[:10] if most_common else counts.index[-10:]
    return list(picked)


# convert to lowercase, then remove punctuation and digits
data_df['Text'] = data_df['Text'].apply(lambda t: t.lower().translate(_STRIP_TABLE))
test_df['Text'] = test_df['Text'].apply(lambda t: t.lower().translate(_STRIP_TABLE))

# remove stop words (a frozenset gives O(1) membership tests per token)
stop = stopwords.words('english')
_stop_set = frozenset(stop)
data_df['Text'] = data_df['Text'].apply(lambda t: _without(t, _stop_set))
test_df['Text'] = test_df['Text'].apply(lambda t: _without(t, _stop_set))

# remove common words
freq_common1 = _extreme_words(data_df['Text'], most_common=True)
_common1 = frozenset(freq_common1)
data_df['Text'] = data_df['Text'].apply(lambda t: _without(t, _common1))

freq_common2 = _extreme_words(test_df['Text'], most_common=True)
_common2 = frozenset(freq_common2)
test_df['Text'] = test_df['Text'].apply(lambda t: _without(t, _common2))

# remove rare words (computed after the common words are gone)
freq_rare1 = _extreme_words(data_df['Text'], most_common=False)
_rare1 = frozenset(freq_rare1)
data_df['Text'] = data_df['Text'].apply(lambda t: _without(t, _rare1))

freq_rare2 = _extreme_words(test_df['Text'], most_common=False)
_rare2 = frozenset(freq_rare2)
test_df['Text'] = test_df['Text'].apply(lambda t: _without(t, _rare2))

# remove suffixes (stemming greatly reduces the number of distinct words)
st = PorterStemmer()
data_df['Text'] = data_df['Text'].apply(lambda t: _per_token(t, st.stem))
test_df['Text'] = test_df['Text'].apply(lambda t: _per_token(t, st.stem))

# Lemmatization
data_df['Text'] = data_df['Text'].apply(
    lambda t: _per_token(t, lambda w: Word(w).lemmatize()))
test_df['Text'] = test_df['Text'].apply(
    lambda t: _per_token(t, lambda w: Word(w).lemmatize()))

# Each document becomes a list of tokens.
docs = [t.split() for t in data_df['Text']]
docs_test = [t.split() for t in test_df['Text']]

### Filter out words shorter than a minimum length

In [166]:
def filterLen(docs, minlen):
    r""" Drop the terms that are shorter than minlen characters.
    docs is a list of documents, each document being a list of words.
    Returns a new list of new word lists, one per input document, in the
    same order; the input is left unmodified.
    """
    kept = []
    for doc in docs:
        long_enough = [term for term in doc if not len(term) < minlen]
        kept.append(long_enough)
    return kept


### Convert the text data to a CSR (compressed sparse row) matrix

In [167]:
from collections import Counter
from scipy.sparse import csr_matrix
def build_matrix(docs, ID):
    r""" Build a sparse term-count matrix from a list of documents,
    each of which is a list of words/terms in the document.

    docs: list of documents (each a list of hashable terms).
    ID:   dict mapping term -> column ID. It is extended IN PLACE with any
          term of docs it does not contain yet; pass the dict returned by
          an earlier call to keep the columns of several matrices aligned.

    Returns (mat, idx): mat is a CSR matrix with one row per document whose
    entry [row, idx[term]] is the number of times term occurs in that
    document; idx is the (updated) ID dict.
    """
    nrows = len(docs)
    idx = ID
    # New terms continue after the largest ID already handed out, so they
    # can never collide with the columns of a pre-filled vocabulary.
    next_id = max(idx.values(), default=-1) + 1
    nnz = 0
    for d in docs:
        nnz += len(set(d))    # one stored entry per distinct term of a row
        for w in d:
            if w not in idx:
                idx[w] = next_id
                next_id += 1
    ncols = next_id

    # set up memory (the builtin int replaces the removed np.int alias)
    ind = np.zeros(nnz, dtype=int)          # column IDs
    val = np.zeros(nnz, dtype=np.double)    # term counts
    ptr = np.zeros(nrows + 1, dtype=int)    # row i occupies ptr[i]:ptr[i+1]
    n = 0  # number of entries written so far
    # transfer values
    for i, d in enumerate(docs):
        cnt = Counter(d)
        for j, (term, times) in enumerate(cnt.items()):
            ind[n + j] = idx[term]
            val[n + j] = times
        n += len(cnt)
        ptr[i + 1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()    # entry order within a row is made canonical here

    return (mat, idx)


def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix: name, number of rows,
    number of columns and number of stored values. With non_empy set, the
    line also reports how many rows and how many columns hold at least
    one stored value.
    """
    nrows, ncols = mat.shape[0], mat.shape[1]
    stored = len(mat.data)
    if not non_empy:
        print("%s [nrows %d, ncols %d, nnz %d]" % (name, nrows, ncols, stored))
        return

    ptr = mat.indptr
    filled_rows = 0
    for r in range(nrows):
        # a row is non-empty when its slice of the data array has length > 0
        if ptr[r + 1] > ptr[r]:
            filled_rows += 1
    filled_cols = len(np.unique(mat.indices))
    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, nrows, filled_rows, ncols, filled_cols, stored))

### Scale the dataset by IDF and normalize its rows with the L2 norm

In [168]:
# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r""" Scale the stored values of a CSR matrix by inverse document frequency.
    The factor of a column is log(nrows / number of stored entries in it).
    If copy is False, mat is scaled in place and the scaling factors are
    returned as a dict (column ID -> factor). If copy is True, mat is left
    untouched and only the scaled copy is returned.
    """
    if copy is True:
        mat = mat.copy()
    n_docs = mat.shape[0]
    cols, data = mat.indices, mat.data

    # document frequency: number of stored entries per column ID
    df = defaultdict(int)
    for col in cols:
        df[col] += 1

    # turn the counts into idf factors, reusing the same dict
    for col in df:
        df[col] = np.log(n_docs / float(df[col]))

    # scale every stored value by the factor of its column
    for pos in range(mat.nnz):
        data[pos] *= df[cols[pos]]

    if copy is False:
        return df
    return mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Divide every row of a CSR matrix by its L-2 norm.
    Rows whose squared sum is zero are left as they are. If copy is True,
    the input stays untouched and the normalized copy is returned;
    otherwise mat is modified in place and nothing is returned.
    """
    if copy is True:
        mat = mat.copy()
    data, ptr = mat.data, mat.indptr

    for row in range(mat.shape[0]):
        lo, hi = ptr[row], ptr[row + 1]
        sq_sum = 0.0
        for v in data[lo:hi]:
            sq_sum += v ** 2
        if sq_sum != 0.0:
            scale = 1.0 / np.sqrt(sq_sum)
            for pos in range(lo, hi):
                data[pos] *= scale

    return mat if copy is True else None

### Create the training model

In [169]:
from scipy.sparse.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

def getResult(trainingSet, testInstance, k, epsilon, trainWithClass):
    """Classify every test row by a majority vote of its nearest training rows.

    trainingSet:    matrix with one row per training document.
    testInstance:   matrix with one row per test document (same columns).
    k:              maximum number of neighbors taken per test row.
    epsilon:        a training row only counts as a neighbor when its cosine
                    similarity to the test row is strictly greater than this.
    trainWithClass: object passed through to getResponse to look up the
                    class of each neighbor by its row index in trainingSet.

    Returns the list of predicted classes, one per test row. If some test
    row has no training row above epsilon, prints "epsilon value too high"
    and returns False.

    Raises ValueError when k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    result_set = []
    # sim[t, i] is the similarity between training row t and test row i
    sim = cosine_similarity(trainingSet, testInstance)

    for i in range(sim.shape[1]):
        column = sim[:, i]
        # Sort once per test row; the slice also clamps k to the number of
        # training rows. Order: most similar first.
        ranked = column.argsort()[::-1][:k]
        neighbors = [index for index in ranked if column[index] > epsilon]
        if not neighbors:
            print("epsilon value too high")
            return False
        result_set.append(getResponse(neighbors, trainWithClass))

    return result_set

In [170]:
def getResponse(neighbors, train):
    """Return the class that occurs most often among the given neighbors.

    neighbors is a sequence of keys into train.Class; each lookup yields the
    class of one neighbor. When several classes share the highest count, the
    one met first in neighbors is returned.
    """
    tally = {}    # class -> number of votes
    for neighbor in neighbors:
        label = train.Class[neighbor]
        tally[label] = tally.get(label, 0) + 1

    # stable sort: classes with equal counts keep their first-seen order
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    winner = ranked[0]
    return winner[0]


### Apply functions to the dataset

In [172]:
# Keep only the words with at least 4 characters, in both corpora.
docs1, docs_test1 = (filterLen(corpus, 4) for corpus in (docs, docs_test))
# Train documents followed by test documents: used to give every word of
# either corpus an ID.
docs_sum = docs1 + docs_test1

In [173]:
# Build the word -> ID mapping from the combined corpus first, then reuse
# that same mapping for the train and the test matrix.
total_ID = {}
mat1, total_ID = build_matrix(docs_sum, total_ID)
train_mat1, total_ID = build_matrix(docs1, total_ID)
test_mat1, total_ID = build_matrix(docs_test1, total_ID)
for built in (train_mat1, test_mat1):
    csr_info(built)


 [nrows 14438, ncols 156433, nnz 1245863]
 [nrows 14442, ncols 156433, nnz 1270791]


In [160]:
# IDF-scale, then L2-normalize; copy=True keeps the count matrices intact.
# The idf factors are computed from each matrix on its own.
train_mat2 = csr_idf(train_mat1, copy=True)
train_mat3 = csr_l2normalize(train_mat2, copy=True)

test_mat2 = csr_idf(test_mat1, copy=True)
test_mat3 = csr_l2normalize(test_mat2, copy=True)

In [161]:
# Summaries of the normalized matrices, then the row counts (test first).
for normalized in (train_mat3, test_mat3):
    csr_info(normalized)
for normalized in (test_mat3, train_mat3):
    print(normalized.shape[0])

 [nrows 14438, ncols 46379, nnz 866351]
 [nrows 14442, ncols 46379, nnz 881093]
14442
14438


### Get the prediction of test data

In [162]:
import time

# neighbors per test row / minimum similarity for a neighbor to count
k = 5
epsilon = 0.02

start = time.time()
# arguments: train data; test data; k; epsilon; train data with class
ans = getResult(train_mat3, test_mat3, k, epsilon, data_df)
end = time.time()

print('time:', end - start)

time: 126.15117001533508


In [164]:
# Output label file: one predicted class per line, in test-row order.
import csv

# getResult returns False instead of a list when epsilon filtered out every
# neighbor of some test row; stop here rather than create an empty file.
if ans is False:
    raise ValueError("no predictions to write: getResult returned False")

# newline='' lets the csv module control line endings itself.
with open('prediction.dat', mode='w', newline='') as out:
    writer = csv.writer(out)
    writer.writerows([label] for label in ans)