# Question_1

### import lib

In [1]:
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
import numpy as np
import random
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.cluster import KMeans

In [2]:
'this' in nltk.corpus.stopwords.words("english")

True

### PreProcess data

In [3]:

# mapping picked up from https://github.com/pararthshah/qa-memnn/blob/master/nltk_utils.py

lemmatizer = WordNetLemmatizer()

def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']

def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']

def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

def penn_to_wn(tag):
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None


'''
takes in a list of tokens of length > 0
returns a list of the most likely part of speech for the token

'''
def get_pos(tokens):
    pos_list = nltk.pos_tag(tokens)
    #print(pos_list)
    pos_pegs = []
    for i, (token, pos) in enumerate(pos_list):        
        pos_peg = penn_to_wn(pos)
        if pos_peg is not None:
            pos_pegs.append(pos_peg)
        else:
            pos_pegs.append(None)
    return list(zip(tokens, pos_pegs))

def lemmatize(pos_tagged_tokens):
    lemmatized_tokens = []
    for i, (token, pos) in enumerate(pos_tagged_tokens):
        #print(token, pos)
        lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=(pos if pos is not None else 'n')))
    return lemmatized_tokens

In [4]:
"""
train_data includes 2400 samples, where each sample is a list including the
elements which are the words in reviews.

train_label includes 2400 samples which belongs to {0,1}, which is the label 
of train_data.

test_data has the same form as the train_data, while it has 600 sample.

test_label is the same as train_label.
"""
def Split(filenames):

    train_data = []
    train_label = []
    test_data = []
    test_label = []
    root = "sentiment labelled sentences/"
    for filename in filenames:
        path = root + filename
        count = [1,1]
        punctuation = ["!","%","&","(",")","+",".",":",";","<","=",">","?","*",",","\t",""]
        #meanless = ["and", "or", "the"]
        for line in open(path):
            if line[-1] == "\n":
                line = line[:-1]
            a = int(line[-1])
            b=[]
            for word in line[:-1].split(' '):
                ##while word and word[-1] in punctuation:
                    ##word = word[:-1]
                ##b.append(wordnet_lemmatizer.lemmatize(word.lower()))
                i = 0
                while i < len(word):
                    if word[i] in punctuation:
                        word = word[:i]+word[i+1:]
                    else:
                        i+=1
                if not word:
                    continue
                pos_tagged_tokens = get_pos([word.lower()])
                c = lemmatize(pos_tagged_tokens)[0]
                if c in nltk.corpus.stopwords.words("english"):
                    continue
                b.append(c)
            if count[a] > 400:
                test_label.append(a)
                test_data.append(b)
            else:
                train_label.append(a)
                train_data.append(b)
            count[a]+=1
    return [train_data, train_label, test_data, test_label]

In [5]:
[train_data, train_label, test_data, test_label] = Split(["yelp_labelled.txt","amazon_cells_labelled.txt","imdb_labelled.txt"])

In [6]:
train_data

[['wow', 'love', 'place'],
 ['crust', 'good'],
 ['tasty', 'texture', 'nasty'],
 ['stop',
  'late',
  'may',
  'bank',
  'holiday',
  'rick',
  'steve',
  'recommendation',
  'love'],
 ['selection', 'menu', 'great', 'price'],
 ['get', 'angry', 'want', 'damn', 'pho'],
 ['honeslty', "didn't", 'taste', 'fresh'],
 ['potato',
  'like',
  'rubber',
  'could',
  'tell',
  'make',
  'ahead',
  'time',
  'kept',
  'warmer'],
 ['fry', 'great'],
 ['great', 'touch'],
 ['service', 'prompt'],
 ['would', 'go', 'back'],
 ['cashier', 'care', 'ever', 'say', 'still', 'end', 'wayyy', 'overprice'],
 ['try', 'cape', 'cod', 'ravoli', 'chickenwith', 'cranberrymmmm'],
 ['disgust', 'pretty', 'sure', 'human', 'hair'],
 ['shock', 'sign', 'indicate', 'cash'],
 ['highly', 'recommend'],
 ['waitress', 'little', 'slow', 'service'],
 ['place', 'worth', 'time', 'let', 'alone', 'vega'],
 ['like'],
 ['burrittos', 'blah'],
 ['food', 'amaze'],
 ['service', 'also', 'cute'],
 ['could', 'care', 'less', 'interior', 'beautiful'],

## Bag of Words 

In [7]:
"""
dic is a dictionary where key is the word shows in train_data and the items
of is a list with two elements, first one is the frequency of the key and 
second element is the index of the key in feature vector, which we will use
after.
"""
def bagOfWord(data):
    dic = {}
    t = 0
    n = 0
    for dataset in data:
        for line in dataset:
            for word in line:
                if word in dic:
                    dic[word][0] += 1
                elif t == 0:
                    dic[word] = [1,n]
                    n+=1
        t = 1
    return dic

In [8]:
Dic = bagOfWord([train_data, test_data])
len(Dic)

4058

In [9]:
"""Build feature vector."""
def buildB(data, dic):
    data_b = []
    size_dic = len(dic)
    for line in data:
        temp = [0]*size_dic
        for word in line:
            if word in dic:
                temp[dic[word][1]]+=1.0
        data_b.append(np.array(temp))
    return data_b
    

In [10]:
%time [train_data_b, test_data_b] = [buildB(train_data,Dic), buildB(test_data,Dic)]

CPU times: user 1.41 s, sys: 49.8 ms, total: 1.46 s
Wall time: 1.48 s


In [11]:
for i in test_data_b:
    if not np.linalg.norm(i):
        print('yes')
        break

yes


### postprocess feature vectors

In [12]:
"""
l^2 normalization
"""
def l2normalize(data):
    for vector in data:
        L = np.linalg.norm(vector)
        if L:
            vector /= L
                
def standardize(data_b, size_dic):
    s = np.array([0.0]*size_dic)
    for bite in data_b:
        s += bite
    s_ = s/len(data_b)
    vec = []
    for bit in data_b:
            vec.append(bit - s_)
    return np.array(vec)

In [13]:
"""
train_vec and test_vec will be the feature vector to be used for future.
"""
l2normalize(train_data_b), l2normalize(test_data_b)
[train_vec, test_vec] = [standardize(train_data_b,len(Dic)), standardize(test_data_b,len(Dic))]

# K-means

In [14]:
"""
randomly pick two points in sample set to be initial points
label is a list indicate which cluster the vector is signed to.
p is the list including two mean point that the model converget to.
During the function, it first prints which two points function pick as
initial points and then how many time it iterates.
"""
def KMeans_2(data,size_dic):
    ##p = kmeans.cluster_centers_
    ##label = kmeans.labels_
    a = random.randint(0,len(data)-1)
    b = random.randint(0,len(data)-1)
    while a==b:
        b = random.randint(0,len(data)-1)
    p = np.array([data[a], data[b]])
    print("point_init1 is ",a)
    print("point_init2 is ",b)
    label = [0]*len(data)
    conver = False
    count = 0
    while not conver:
        count += 1
        conver = True
        for i in range(len(data)):
            d = [0]*2
            d[0] = np.linalg.norm(p[0]-data[i])
            d[1] = np.linalg.norm(p[1]-data[i])
            if d[label[i]] > d[1-label[i]]:
                conver = False
                label[i] = 1-label[i]
        if not conver:
            ##print("a")
            for j in [0,1]:
                n_p = 0
                s_p = np.array([0.0]*size_dic)
                for point in range(len(label)):
                    if label[point] == j:
                        s_p += data[point]
                        n_p += 1
                p[j] = s_p/n_p
    print("iterate time is ",count)
    return(label, p)

In [15]:
def n_kmeans(vec, k_train_label,kmeans_lib,size):
    [k_label, k_p]=KMeans_2(vec, size)
    n_bruce = 0
    n_python = 0
    for i in range(len(k_label)):
        if k_train_label[i] == kmeans_lib.labels_[i]:
            n_python+=1
        if k_train_label[i] == k_label[i]:
            n_bruce+=1
    print("self-designed accuracy is",n_bruce/len(k_label))
    print("          lib accuracy is", n_python/len(k_label))
    print("centers are:")
    print(k_p)
    print("higher than lib?: ",n_python/len(k_label)<n_bruce/len(k_label) )
    print("************************")

In [16]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(train_vec)

In [17]:
for i in range(1):
    n_kmeans(train_vec, train_label, kmeans,len(Dic))

point_init1 is  85
point_init2 is  2226
iterate time is  13
self-designed accuracy is 0.50875
          lib accuracy is 0.44125
centers are:
[[  1.16586891e-04   1.34438068e-03   5.12423086e-05 ...,   1.38135127e-05
    1.38135127e-05   1.38135127e-05]
 [ -8.79173034e-04  -1.01378743e-02  -3.86414420e-04 ...,  -1.04166667e-04
   -1.04166667e-04  -1.04166667e-04]]
higher than lib?:  True
************************


### Logistic Regression

In [18]:
lgr = LGR()
lgr.fit(train_vec ,train_label)
lgr.score(test_vec,test_label)

0.80000000000000004

# N-gram model

In [19]:
def Ngram(data):
    data_ng = []
    for line in data:
        line_new = []
        for i in range(len(line)-1):
            line_new.append(line[i]+' '+line[i+1])
        data_ng.append(line_new)
    return data_ng

In [20]:
train_data_ng = Ngram(train_data)
test_data_ng = Ngram(test_data)

In [21]:
Dic_ng = bagOfWord([train_data_ng, test_data_ng])
len(Dic_ng)

11372

In [22]:
%time [train_data_ng_b, test_data_ng_b] = [buildB(train_data_ng,Dic_ng), buildB(test_data_ng,Dic_ng)]

CPU times: user 3.85 s, sys: 94.8 ms, total: 3.94 s
Wall time: 3.98 s


In [23]:
##drop empty element
train_label_ng = train_label[:]
i = 0
while i < len(train_data_ng_b):
    if not np.linalg.norm(train_data_ng_b[i]):
        train_label_ng.pop(i)
        train_data_ng_b.pop(i)
        train_data_ng.pop(i)
    else:
        i+=1
        
test_label_ng = test_label[:]
i = 0
while i < len(test_data_ng_b):
    if not np.linalg.norm(test_data_ng_b[i]):
        test_label_ng.pop(i)
        test_data_ng_b.pop(i)
        test_data_ng.pop(i)
    else:
        i+=1

In [24]:
l2normalize(train_data_ng_b)
l2normalize(test_data_ng_b)
[train_vec_ng, test_vec_ng] = [standardize(train_data_ng_b, len(Dic_ng)), standardize(test_data_ng_b, len(Dic_ng))]

In [25]:
kmeans_ng = KMeans(n_clusters=2, random_state=0).fit(train_vec_ng)

In [33]:
for i in range(1):
    n_kmeans(train_vec_ng, train_label_ng, kmeans_ng, len(Dic_ng))

point_init1 is  1876
point_init2 is  2298
iterate time is  2
self-designed accuracy is 0.49654576856649396
          lib accuracy is 0.5043177892918825
centers are:
[[  3.95997158e-07   1.70272111e-06   5.60024551e-07 ...,   1.44597717e-07
    1.44597717e-07   1.44597717e-07]
 [ -3.05313809e-04  -1.31279798e-03  -4.31778929e-04 ...,  -1.11484840e-04
   -1.11484840e-04  -1.11484840e-04]]
higher than lib?:  False
************************


In [34]:
lgr = LGR()
lgr.fit(train_vec_ng ,train_label_ng)
lgr.score(test_vec_ng,test_label_ng)

0.76056338028169013

## PCA

In [28]:
U, s, V = np.linalg.svd(train_vec, full_matrices=True)

In [29]:
s.shape, V.shape

((2400,), (4058, 4058))

In [35]:
train_vec[0].dot(V[:5].T)

array([-0.0495421 ,  0.03947017, -0.07532169,  0.07609576, -0.03962555])

In [36]:
def reduce(n):
    train_vec_n = train_vec.dot(V[:n].T)
    test_vec_n = test_vec.dot(V[:n].T)
    kmeans_n = KMeans(n_clusters=2, random_state=0).fit(train_vec_n)
    for i in range(1):
        n_kmeans(train_vec_n, train_label, kmeans_n, n)
    lgr = LGR()
    lgr.fit(train_vec_n ,train_label)
    print("Logistic Regression result is",lgr.score(test_vec_n,test_label))

In [37]:
reduce(10)

point_init1 is  567
point_init2 is  1289
iterate time is  4
self-designed accuracy is 0.5025
          lib accuracy is 0.44208333333333333
centers are:
[[ 0.01862558 -0.09529385  0.04098942  0.11560459  0.14163764  0.0322151
   0.08448645  0.02488959 -0.03347525 -0.23618515]
 [-0.00098029  0.00501547 -0.00215734 -0.00608445 -0.00745461 -0.00169553
  -0.00444666 -0.00130998  0.00176186  0.0124308 ]]
higher than lib?:  True
************************
Logistic Regression result is 0.633333333333


In [38]:
reduce(50)

point_init1 is  708
point_init2 is  580
iterate time is  5
self-designed accuracy is 0.49916666666666665
          lib accuracy is 0.44166666666666665
centers are:
[[ -3.85958262e-02   2.98788553e-02   1.09260325e-03  -3.50402718e-02
   -6.03908006e-02   2.71285544e-02   9.24205680e-02  -2.42083049e-02
   -1.16878912e-02  -4.17813085e-02  -1.08842813e-03   9.82827612e-02
   -1.06257074e-01   7.95867231e-02   2.71005850e-01  -1.91636199e-02
    1.19027426e-01   6.27011539e-02   2.37796589e-02  -4.12539733e-02
    5.71128940e-02   4.97341417e-03   4.46629939e-02  -2.72566169e-02
   -2.55405800e-02  -6.76200807e-03   1.13661894e-02  -8.78335209e-03
   -4.75150784e-03   7.61861380e-03  -4.62395727e-03   4.67152023e-03
    2.84586015e-03   2.14300684e-03   1.18431878e-03  -6.56355824e-03
    1.43377025e-03   1.90586999e-03   5.79876858e-03  -1.32462988e-03
    4.29970564e-03  -7.10216944e-04  -9.32155332e-04   2.95507672e-04
   -5.11507905e-03  -2.94108678e-03  -8.33417749e-04   5.63885268e

In [39]:
reduce(100)

point_init1 is  1674
point_init2 is  739
iterate time is  5
self-designed accuracy is 0.5125
          lib accuracy is 0.44166666666666665
centers are:
[[ -2.58300899e-02   1.87944688e-02  -3.13997139e-02   5.03425740e-02
    1.25379216e-02  -5.98003149e-03  -9.05578367e-02   5.50654209e-02
    1.15977233e-01  -1.96191894e-01   1.35022180e-01   5.23498559e-02
   -1.72298261e-01   1.59890578e-03  -7.26419893e-02   2.85466726e-02
    2.68612741e-02  -4.02900039e-03   4.73988738e-03   3.10942854e-02
    4.57722255e-03  -4.20099517e-03  -1.87796410e-02   1.58850513e-03
   -3.95072934e-02   5.20537500e-02   2.05245987e-02  -1.09424503e-04
    7.63602357e-04   9.57360179e-03  -7.90333903e-03   7.59567835e-03
    8.61797566e-03   4.44212040e-05  -3.78065237e-03  -5.93526239e-03
   -1.24102836e-05  -2.39677943e-03  -3.81382343e-03   1.58868143e-03
   -2.21226722e-03  -2.20930982e-03  -1.32397568e-03  -2.76130301e-03
    2.10583604e-03   6.62133470e-04  -3.32544027e-03   1.16283299e-03
   -1.36

In [40]:
##compare with PCA Lib
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
pca.fit(train_vec)
abs(abs(V[:100])-abs(pca.components_)) < 0.0000000000001

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ..., 
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)