# Question_1

### import lib

In [1]:
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
import numpy as np
import random
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.cluster import KMeans

### PreProcess data

In [2]:

# mapping picked up from https://github.com/pararthshah/qa-memnn/blob/master/nltk_utils.py

lemmatizer = WordNetLemmatizer()

def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']

def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']

def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

def penn_to_wn(tag):
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None


'''
takes in a list of tokens of length > 0
returns a list of the most likely part of speech for the token

'''
def get_pos(tokens):
    pos_list = nltk.pos_tag(tokens)
    #print(pos_list)
    pos_pegs = []
    for i, (token, pos) in enumerate(pos_list):        
        pos_peg = penn_to_wn(pos)
        if pos_peg is not None:
            pos_pegs.append(pos_peg)
        else:
            pos_pegs.append(None)
    return list(zip(tokens, pos_pegs))

def lemmatize(pos_tagged_tokens):
    lemmatized_tokens = []
    for i, (token, pos) in enumerate(pos_tagged_tokens):
        #print(token, pos)
        lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=(pos if pos is not None else 'n')))
    return lemmatized_tokens

In [3]:
"""
train_data includes 2400 samples, where each sample is a list including the
elements which are the words in reviews.

train_label includes 2400 samples which belongs to {0,1}, which is the label 
of train_data.

test_data has the same form as the train_data, while it has 600 sample.

test_label is the same as train_label.
"""
def Split(filenames):

    train_data = []
    train_label = []
    test_data = []
    test_label = []
    root = "sentiment labelled sentences/"
    for filename in filenames:
        path = root + filename
        count = [1,1]
        punctuation = ["!","%","&","(",")","+",".",":",";","<","=",">","?","*",",","\t",""]
        meanless = ["and", "or", "the"]
        for line in open(path):
            if line[-1] == "\n":
                line = line[:-1]
            a = int(line[-1])
            b=[]
            for word in line[:-1].split(' '):
                ##while word and word[-1] in punctuation:
                    ##word = word[:-1]
                ##b.append(wordnet_lemmatizer.lemmatize(word.lower()))
                i = 0
                while i < len(word):
                    if word[i] in punctuation:
                        word = word[:i]+word[i+1:]
                    else:
                        i+=1
                if not word:
                    continue
                pos_tagged_tokens = get_pos([word.lower()])
                c = lemmatize(pos_tagged_tokens)[0]
                if c in meanless:
                    continue
                b.append(c)
            if count[a] > 400:
                test_label.append(a)
                test_data.append(b)
            else:
                train_label.append(a)
                train_data.append(b)
            count[a]+=1
    return [train_data, train_label, test_data, test_label]

In [4]:
[train_data, train_label, test_data, test_label] = Split(["yelp_labelled.txt","amazon_cells_labelled.txt","imdb_labelled.txt"])

In [5]:
train_data

[['wow', 'love', 'this', 'place'],
 ['crust', 'be', 'not', 'good'],
 ['not', 'tasty', 'texture', 'be', 'just', 'nasty'],
 ['stop',
  'by',
  'during',
  'late',
  'may',
  'bank',
  'holiday',
  'off',
  'rick',
  'steve',
  'recommendation',
  'love',
  'it'],
 ['selection', 'on', 'menu', 'be', 'great', 'so', 'be', 'price'],
 ['now', 'i', 'be', 'get', 'angry', 'i', 'want', 'my', 'damn', 'pho'],
 ['honeslty', 'it', "didn't", 'taste', 'that', 'fresh'],
 ['potato',
  'be',
  'like',
  'rubber',
  'you',
  'could',
  'tell',
  'they',
  'have',
  'be',
  'make',
  'up',
  'ahead',
  'of',
  'time',
  'be',
  'kept',
  'under',
  'a',
  'warmer'],
 ['fry', 'be', 'great', 'too'],
 ['a', 'great', 'touch'],
 ['service', 'be', 'very', 'prompt'],
 ['would', 'not', 'go', 'back'],
 ['cashier',
  'have',
  'no',
  'care',
  'what',
  'so',
  'ever',
  'on',
  'what',
  'i',
  'have',
  'to',
  'say',
  'it',
  'still',
  'end',
  'up',
  'be',
  'wayyy',
  'overprice'],
 ['i', 'try', 'cape', 'cod'

## Bag of Words 

In [6]:
"""
dic is a dictionary where key is the word shows in train_data and the items
of is a list with two elements, first one is the frequency of the key and 
second element is the index of the key in feature vector, which we will use
after.
"""
def bagOfWord(data):
    dic = {}
    t = 0
    n = 0
    for dataset in data:
        for line in dataset:
            for word in line:
                if word in dic:
                    dic[word][0] += 1
                elif t == 0:
                    dic[word] = [1,n]
                    n+=1
        t = 1
    return dic

In [7]:
Dic = bagOfWord([train_data, test_data])
len(Dic)

4160

In [8]:
"""Build feature vector."""
def buildB(data, dic):
    data_b = []
    size_dic = len(dic)
    for line in data:
        temp = [0]*size_dic
        for word in line:
            if word in dic:
                temp[dic[word][1]]+=1.0
        data_b.append(np.array(temp))
    return data_b
    

In [9]:
%time [train_data_b, test_data_b] = [buildB(train_data,Dic), buildB(test_data,Dic)]

CPU times: user 1.43 s, sys: 47.2 ms, total: 1.47 s
Wall time: 1.48 s


### postprocess feature vectors

In [10]:
"""
l^2 normalization
"""
def l2normalize(data):
    for vector in data:
        L = np.linalg.norm(vector)
        vector /= L
                
def standardize(data_b, size_dic):
    s = np.array([0.0]*size_dic)
    for bite in data_b:
        s += bite
    s_ = s/len(data_b)
    vec = []
    for bit in data_b:
            vec.append(bit - s_)
    return np.array(vec)

In [11]:
"""
train_vec and test_vec will be the feature vector to be used for future.
"""
l2normalize(train_data_b), l2normalize(test_data_b)
[train_vec, test_vec] = [standardize(train_data_b,len(Dic)), standardize(test_data_b,len(Dic))]

# K-means

In [12]:
"""
randomly pick two points in sample set to be initial points
label is a list indicate which cluster the vector is signed to.
p is the list including two mean point that the model converget to.
During the function, it first prints which two points function pick as
initial points and then how many time it iterates.
"""
def KMeans_2(data,size_dic):
    ##p = kmeans.cluster_centers_
    ##label = kmeans.labels_
    a = random.randint(0,len(data)-1)
    b = random.randint(0,len(data)-1)
    while a==b:
        b = random.randint(0,len(data)-1)
    p = np.array([data[a], data[b]])
    print("point_init1 is ",a)
    print("point_init2 is ",b)
    label = [0]*len(data)
    conver = False
    count = 0
    while not conver:
        count += 1
        conver = True
        for i in range(len(data)):
            d = [0]*2
            d[0] = np.linalg.norm(p[0]-data[i])
            d[1] = np.linalg.norm(p[1]-data[i])
            if d[label[i]] > d[1-label[i]]:
                conver = False
                label[i] = 1-label[i]
        if not conver:
            ##print("a")
            for j in [0,1]:
                n_p = 0
                s_p = np.array([0.0]*size_dic)
                for point in range(len(label)):
                    if label[point] == j:
                        s_p += data[point]
                        n_p += 1
                p[j] = s_p/n_p
    print("iterate time is ",count)
    return(label, p)

In [40]:
def n_kmeans(vec, k_train_label,kmeans_lib,size):
    [k_label, k_p]=KMeans_2(vec, size)
    n_bruce = 0
    n_python = 0
    for i in range(len(k_label)):
        if k_train_label[i] == kmeans_lib.labels_[i]:
            n_python+=1
        if k_train_label[i] == k_label[i]:
            n_bruce+=1
    print("self-designed accuracy is",n_bruce/len(k_label))
    print("          lib accuracy is", n_python/len(k_label))
    print("centers are:")
    print(k_p)
    print("higher than lib?: ",n_python/len(k_label)<n_bruce/len(k_label) )
    print("************************")

In [41]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(train_vec)

In [42]:
for i in range(1):
    n_kmeans(train_vec, train_label, kmeans,len(Dic))

point_init1 is  751
point_init2 is  2087
iterate time is  23
self-designed accuracy is 0.4841666666666667
          lib accuracy is 0.49375
centers are:
[[  2.24231216e-04  -3.52696566e-03  -1.06858425e-02 ...,   2.36947372e-05
    2.36947372e-05   2.36947372e-05]
 [ -7.19900218e-04   1.13234161e-02   3.43071785e-02 ...,  -7.60725774e-05
   -7.60725774e-05  -7.60725774e-05]]
higher than lib?:  False
************************


### Logistic Regression

In [16]:
lgr = LGR()
lgr.fit(train_vec ,train_label)
lgr.score(test_vec,test_label)

0.80000000000000004

# N-gram model

In [17]:
def Ngram(data):
    data_ng = []
    for line in data:
        line_new = []
        for i in range(len(line)-1):
            line_new.append(line[i]+' '+line[i+1])
        data_ng.append(line_new)
    return data_ng

In [18]:
train_data_ng = Ngram(train_data)
test_data_ng = Ngram(test_data)

In [19]:
Dic_ng = bagOfWord([train_data_ng, test_data_ng])
len(Dic_ng)

15538

In [20]:
%time [train_data_ng_b, test_data_ng_b] = [buildB(train_data_ng,Dic_ng), buildB(test_data_ng,Dic_ng)]

CPU times: user 5.41 s, sys: 171 ms, total: 5.58 s
Wall time: 5.51 s


In [21]:
##drop empty element
train_label_ng = train_label[:]
i = 0
while i < len(train_data_ng_b):
    if not np.linalg.norm(train_data_ng_b[i]):
        train_label_ng.pop(i)
        train_data_ng_b.pop(i)
        train_data_ng.pop(i)
    else:
        i+=1
        
test_label_ng = test_label[:]
i = 0
while i < len(test_data_ng_b):
    if not np.linalg.norm(test_data_ng_b[i]):
        test_label_ng.pop(i)
        test_data_ng_b.pop(i)
        test_data_ng.pop(i)
    else:
        i+=1

In [22]:
l2normalize(train_data_ng_b)
l2normalize(test_data_ng_b)
[train_vec_ng, test_vec_ng] = [standardize(train_data_ng_b, len(Dic_ng)), standardize(test_data_ng_b, len(Dic_ng))]

In [23]:
kmeans_ng = KMeans(n_clusters=2, random_state=0).fit(train_vec_ng)

In [24]:
for i in range(3):
    n_kmeans(train_vec_ng, train_label_ng, kmeans_ng, len(Dic_ng))

point_init1 is  882
point_init2 is  19
iterate time is  20
self-designed accuracy is 0.502928870292887
          lib accuracy is 0.497071129707113
higher than lib?:  True
************************
point_init1 is  902
point_init2 is  1944
iterate time is  22
self-designed accuracy is 0.4866108786610879
          lib accuracy is 0.497071129707113
higher than lib?:  False
************************
point_init1 is  1042
point_init2 is  1883
iterate time is  19
self-designed accuracy is 0.502928870292887
          lib accuracy is 0.497071129707113
higher than lib?:  True
************************


In [25]:
lgr = LGR()
lgr.fit(train_vec_ng ,train_label_ng)
lgr.score(test_vec_ng,test_label_ng)

0.74414414414414409

In [26]:
np.linalg.norm(sum(train_vec))

2.9034284604995552e-12

In [27]:
train_vec = np.array(train_vec)

## PCA

In [28]:
U, s, V = np.linalg.svd(train_vec, full_matrices=True)

In [29]:
s.shape, V.shape

((2400,), (4160, 4160))

In [30]:
train_vec[0].dot(V[:5].T)

array([ 0.18492564, -0.0728534 ,  0.0196917 , -0.32395015, -0.3565442 ])

In [43]:
def reduce(n):
    train_vec_n = train_vec.dot(V[:n].T)
    test_vec_n = test_vec.dot(V[:n].T)
    kmeans_n = KMeans(n_clusters=2, random_state=0).fit(train_vec_n)
    for i in range(1):
        n_kmeans(train_vec_n, train_label, kmeans_n, n)
    lgr = LGR()
    lgr.fit(train_vec_n ,train_label)
    print("Logistic Regression result is",lgr.score(test_vec_n,test_label))

In [44]:
reduce(10)

point_init1 is  1064
point_init2 is  861
iterate time is  14
self-designed accuracy is 0.4841666666666667
          lib accuracy is 0.5041666666666667
centers are:
[[-0.02011884  0.04862659 -0.05775276  0.01156296 -0.02373191  0.00324261
  -0.00641176  0.00356699  0.00370273  0.00427407]
 [ 0.06400174 -0.15469015  0.18372221 -0.0367839   0.07549558 -0.01031534
   0.020397   -0.01134726 -0.01177908 -0.01359662]]
higher than lib?:  False
************************
Logistic Regression result is 0.588333333333


In [45]:
reduce(50)

point_init1 is  103
point_init2 is  280
iterate time is  9
self-designed accuracy is 0.5066666666666667
          lib accuracy is 0.49416666666666664
centers are:
[[ -5.09167002e-04  -7.55143694e-02  -4.93855094e-02  -5.12474633e-03
   -7.58145484e-03  -1.62065610e-03   2.09011451e-03  -1.65430280e-03
    4.07030597e-04   1.70250237e-04  -3.28123723e-04   2.44817689e-03
   -2.91938455e-04   1.76165687e-03   4.59404946e-04  -8.16848836e-05
    9.96984186e-04   7.46765530e-04  -4.98899572e-04  -1.61519053e-04
   -7.22196816e-04   4.54410763e-04   1.25772344e-04   1.31497989e-04
    1.20915207e-04   4.42018034e-05  -1.81904859e-04  -2.51006866e-04
    3.84913098e-04   7.32335197e-04   5.19549174e-04  -3.93928393e-04
    2.90961639e-04   2.75890862e-04   4.30949002e-04   4.11459096e-05
   -4.33379119e-04  -2.09642613e-04   8.88812036e-05   4.61052117e-04
    1.07339755e-04   3.24489751e-04  -2.91770904e-04   2.40840476e-05
    4.95761730e-04   7.52866770e-06   4.89670442e-04  -4.12683752e-

In [46]:
reduce(100)

point_init1 is  1580
point_init2 is  243
iterate time is  13
self-designed accuracy is 0.5158333333333334
          lib accuracy is 0.5058333333333334
centers are:
[[  6.51711773e-02  -1.54826019e-01   1.84210066e-01  -3.67034872e-02
    7.54151151e-02  -9.84643476e-03   1.98465263e-02  -1.12511927e-02
   -1.19006778e-02  -1.33898688e-02   6.49760885e-04   1.87824071e-03
    7.30566318e-03  -1.37515820e-03   6.84624743e-06  -5.11229542e-03
   -3.41634068e-03   1.60439390e-03   1.75015698e-03   1.71268044e-04
    4.10898767e-03  -7.36809082e-04  -1.30689040e-03  -3.24133813e-03
   -2.52093369e-04  -3.88702858e-03  -1.45872331e-03   1.50682598e-03
   -1.29176157e-03  -1.56518486e-03   1.73732966e-03   4.92948399e-04
    1.79719542e-03  -8.84407393e-04  -3.18550720e-03   1.35290655e-03
    1.53325321e-03  -1.18744440e-03  -4.72772270e-04  -4.87504774e-04
    1.08385574e-03   1.22887874e-04   3.57670968e-04   2.23467099e-04
   -1.70878130e-04  -8.77478223e-04  -4.95410218e-04  -1.39434227e

In [35]:
##compare with PCA Lib
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
pca.fit(train_vec)
abs(abs(V[:100])-abs(pca.components_)) < 0.0000000000001

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ..., 
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)