#!/usr/local/bin/python3.6
# encoding: utf-8
'''
Perceptron.perceplearn -- a perceptron learner that trains vanilla and averaged perceptron models
to classify hotel reviews as either true or fake, and either positive or negative.
Word tokens are treated as features, and other features may be derived from the text.
The learner stores the weights and bias in model files that the classifier reads to perform classification.
The argument is a single file containing the training data; the program will learn perceptron models, and write
the model parameters to two files:
1. vanillamodel.txt for the vanilla perceptron, and
2. averagedmodel.txt for the averaged perceptron.
It defines classes and methods.
@author: Cheng-Lin Li a.k.a. Clark Li@University of Southern California 2018. All rights reserved.
@copyright: 2018 organization_name. All rights reserved.
@license: Licensed under the GNU GPL v3.0. https://www.gnu.org/licenses/gpl.html
@contact: chenglil@usc.edu or clark.cl.li@gmail
@version: 1.0
@create: April 18, 2018
@updated: April 20, 2018
'''
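# Illustrative invocation (the input path is an example, not a requirement):
#   python3 perceplearn3.py ./train-labeled.txt
# reads the labeled training file and writes ./vanillamodel.txt and ./averagedmodel.txt.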
from __future__ import print_function
from __future__ import division
__all__ = []
__version__ = 1.0
__date__ = '2018-04-18'
__updated__ = '2018-04-20'
import sys, os
import collections
import math, json, re
from datetime import datetime
import numpy as np
from random import seed, randrange
#Reference F1 scores: 0.88 for the vanilla perceptron and 0.89 for the averaged perceptron.
DEBUG = 0 # 1 = print debug information, 2 = print detailed step information
PRINT_TIME = 0 # 0 = disable, 1 = print time stamps, 2 = print detailed time stamps
TOKEN_DELIMITER = ' ' # The delimiter that splits tokens (word/tag).
COLUMNS = 4 # Number of columns in each training record.
LOW_FREQ_OBSERVATION_THRESHOLD = 2 # Words that appear at least this many times are retained.
HIGH_FREQ_OBSERVATION_THRESHOLD = 1000 # Words that appear at most this many times are retained.
FOLDS = 10 # folds = None or folds >= 1; folds = 1 randomly samples data without a validation set.
TEST_RATIO = 0.1 # Holdout ratio; TEST_RATIO = 0 samples data sequentially without a validation set.
ITERATION = 30 # Maximum number of training iterations.
CONVERAGE = 0.0001 # Minimum average-F1 improvement counted as progress (convergence threshold).
PATIENT = 5 # Early-stopping patience: iterations without improvement before training stops.
SEED = 9999999
ASCII_ONLY = True
REMOVE_STOPWORDS = True
REMOVE_PUNCTUATION = True
# NLTK stop words
'''
STOP_WORDS = ['ourselves', 'he', 've', 'and', 'm', 'shan', 'having', 'an', 'other', 'wasn', 'me', 'had', 'why', 'up', 'same', 'these',\
'be', 'did', 'some', 'few', 'she', 'between', 'for', 'as', 'weren', 'most', 'from', 'no', 'in', 'there', 'but', 'before',\
'about', 'what', 'then', 'her', 'any', 'more', 'of', 'once', 'now', 'or', 'y', 'their', 'don', 'who', 'which', 'at', 'to',\
'isn', 'each', 'own', 'because', 'myself', 'll', 't', 're', 'wouldn', 'were', 'doesn', 'until', 'such', 'both', 'only',\
'we', 'with', 'ma', 'against', 'couldn', 'they', 'doing', 'needn', 'your', 'too', 'them', 'aren', 'yours', 'didn', 'that',\
'is', 'mustn', 'should', 'being', 'i', 'on', 'if', 'mightn', 'when', 'down', 'haven', 'where', 'it', 'than', 'how',\
'itself', 'our', 'so', 'himself', 'shouldn', 'above', 'you', 'ain', 'my', 'can', 'after', 'while', 'the', 'him', 'hasn',\
'a', 'been', 's', 'will', 'ours', 'into', 'yourself', 'here', 'further', 'by', 'yourselves', 'his', 'whom', 'do', 'over',\
'under', 'very', 'was', 'hadn', 'again', 'theirs', 'not', 'nor', 'those', 'this', 'below', 'does', 'all', 'has', 'during',\
'am', 'hers', 'd', 'off', 'have', 'through', 'out', 'herself', 'just', 'its', 'o', 'themselves', 'won', 'are']
# Stanford NLP stop words
STOP_WORDS = ["'ll", "'s", "'m", 'a', 'about', 'above', 'after', 'again',\
'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being',\
'below', 'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do' , 'does',\
"doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't",\
'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself',\
'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',\
"let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',\
'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's",\
'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves',\
'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through',\
'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",\
'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with',\
"won't", 'wourld', 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',\
'yourselves', '###', 'return', 'arent', 'cant', 'couldnt', 'didnt', 'doesnt', 'dont', 'hadnt', 'hasnt', 'havent', 'hes',\
'heres', 'hows', 'im', 'isnt', 'its', 'lets', 'mustnt', 'shant', 'shes', 'shouldnt', 'thats', 'theres', 'theyll',\
'theyre', 'theyve', 'wasnt', 'were','werent', 'whats', 'whens', 'wheres', 'whos', 'whys', 'wont', 'wouldnt', 'youd',\
'youll', 'youre', 'youve']
'''
# Stanford NLP + NLTK stop words, remove but, add us
STOP_WORDS = ['ve', 'm', 'shan', 'wasn', 'weren', 'y','don', 'isn', 'll', 't', 're', 'wouldn', 'doesn', 'ma', 'couldn', 'needn', 'aren',\
'didn','mustn', 'mightn', 'haven', 'shouldn', 'ain', 'hasn', 's', 'hadn', 'd', 'o', 'won', 'but',\
"'ll", "'s", "'m", 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't",\
'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'by', 'can', "can't", 'cannot',\
'could', "couldn't", 'did', "didn't", 'do' , 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for',\
'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here',\
"here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is',\
"isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off',\
'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd",\
"she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them',\
'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through',\
'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",\
'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with',\
"won't", 'wourld', 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',\
'yourselves', '###', 'return', 'arent', 'cant', 'couldnt', 'didnt', 'doesnt', 'dont', 'hadnt', 'hasnt', 'havent', 'hes',\
# 'heres', 'hows', 'im', 'isnt', 'its', 'lets', 'mustnt', 'shant', 'shes', 'shouldnt', 'thats', 'theres', 'theyll',\
'theyre', 'theyve', 'wasnt', 'were','werent', 'whats', 'whens', 'wheres', 'whos', 'whys', 'wont', 'wouldnt', 'youd',\
'youll', 'youre', 'youve', 'us']
PUNCTUATION = ['!!','?!','??','!?','`','``',"''", ',', '.', ':', ';', '"', "'", '?', '<', '>', '{', '}', '[', ']', '+', '-', '(',\
')', '&', '%', '$', '@', '!', '^', '#', '*', '..', '...']
VANILLA_MODEL_FILE_NAME = './vanillamodel.txt'
AVERAGED_MODEL_FILE_NAME = './averagedmodel.txt'
def get_input(file_name):
document = []
try:
with open(file_name, 'r', encoding='utf-8') as _fp:
for _each_line in _fp:
_each_line =_each_line.strip()
document.append(_each_line)
return document
except IOError as _err:
        print('File error: ' + str(_err))
        exit()
def print_list(l):
for i in l:
print(i)
def get_feature_matrix(documents):
'''
Find total vocabulary we have from those reviews and create document vector for each review.
    word_dict = {'word0': 0, 'word1': 1, ..., 'wordn': n} where 0 ... n are the indices for word0 ... wordn.
document_matrix = [
[0, 0, 1, 5, ...1, 0] # First document / review with word distribution. There are 1 word2, 5 word3, ... 1 wordn-1
[1, 0, 0, 1, ...0, 0] # Second document / review with word distribution. There are 1 word0, 1 word3, ...
[2, 10, 1, 2, ...3, 0]
...
[0, 11, 0, 0, ...9, 1]
]
'''
classes_dict_list = [{}, {}] # [{'Fake': -1, 'True': 1}, {'Neg': -1, 'Pos': 1}]
label = []
data = []
word_dict = collections.OrderedDict() # dictionary for each word
document_matrix = [] # Store vector of documents
word_doc_count = {} #{'word1': number_of_documents_exist_word1, 'word2': ...} for IDF calculations
init_binary_classification = -1
review, sentences = '', ''
tokenize = tokenizer(STOP_WORDS, PUNCTUATION)
for _each_line in documents:
review = _each_line.rstrip('\n').split(TOKEN_DELIMITER, COLUMNS-1)
# Get review from (columns-1)th column of text content file.
sentences = tokenize.get_wordlist(review[COLUMNS-1], ascii_only=ASCII_ONLY, remove_stopwords=REMOVE_STOPWORDS, remove_punctuation = REMOVE_PUNCTUATION)
if DEBUG > 1:
print (sentences)
label.append([review[1], review[2]])
data.append(sentences)
# Collect how many classes we have to identify and give it '1' or '-1' as label value.
if (review[1] in classes_dict_list[0]) and (review[2] in classes_dict_list[1]):
pass
elif (review[1] not in classes_dict_list[0]) and (review[2] not in classes_dict_list[1]):
classes_dict_list[0][review[1]] = init_binary_classification
classes_dict_list[1][review[2]] = init_binary_classification
init_binary_classification *= -1
elif review[1] not in classes_dict_list[0]:
classes_dict_list[0][review[1]] = init_binary_classification
elif review[2] not in classes_dict_list[1]:
classes_dict_list[1][review[2]] = init_binary_classification
### TF-IDF feature
# tfidf = tf_idf(LOW_FREQ_OBSERVATION_THRESHOLD, HIGH_FREQ_OBSERVATION_THRESHOLD)
# word_dict, document_matrix, word_doc_count = tfidf.get_dictionary_n_document_matrix(data)
# # Convert the label from text (Fake, True, Neg, Pos) to value (+1, -1)
# label = np.array([ [classes_dict_list[0][cf[0]], classes_dict_list[1][cf[1]]] for cf in label ])
#
wc = word_counts(LOW_FREQ_OBSERVATION_THRESHOLD, HIGH_FREQ_OBSERVATION_THRESHOLD)
word_dict, document_matrix = wc.get_dictionary_n_document_matrix(data)
# Convert the label from text (Fake, True, Neg, Pos) to value (+1, -1)
label = np.array([ [classes_dict_list[0][cf[0]], classes_dict_list[1][cf[1]]] for cf in label ])
return word_dict, document_matrix, classes_dict_list, label, word_doc_count
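# Illustrative call (hypothetical review lines; each line carries COLUMNS = 4 fields:
# an id, a True/Fake label, a Pos/Neg label, and the review text):
#   word_dict, matrix, classes, label, _ = get_feature_matrix([
#       'r1 True Pos the room was great and the staff was friendly',
#       'r2 Fake Neg the room was terrible'])
# The result is a 2-row document_matrix over whatever vocabulary survives the
# frequency thresholds, with the text labels mapped to +1/-1 in label.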
class tf_idf(object):
def __init__(self, LOW_FREQ_OBSERVATION_THRESHOLD = None, HIGH_FREQ_OBSERVATION_THRESHOLD = None):
self.word_dict = collections.OrderedDict() # dictionary for each word {'word1': no of documents have word1, 'word2':...}
self.document_matrix = [] # Store vector of documents # store tf then calculate tf-idf
self.word_doc_count = {} #{'word1': number_of_documents_exist_word1, 'word2': ...} for IDF calculations
self.low_freq_threshold = LOW_FREQ_OBSERVATION_THRESHOLD
self.high_freq_threshold = HIGH_FREQ_OBSERVATION_THRESHOLD
def get_dictionary_n_document_matrix(self, documents):
word_dict = self.word_dict
doc_matrix = self.document_matrix
high_threshold = self.high_freq_threshold
low_threshold = self.low_freq_threshold
N = len(documents) # total number of documents
word_count = {} #{'word1': count1, 'word2': count2,...}
word_doc_count = {} #{'word1': number_of_documents_exist_word1, 'word2': ...} for IDF calculations
# Build word counts dictionary
for doc in documents:
doc_vector = []
word_set = set()
for word in doc:
word_count[word] = word_count.get(word, 0) + 1
word_set.add(word)
for word in word_set:
word_doc_count[word] = word_doc_count.get(word, 0) + 1
if DEBUG > 0 : print('Word_count=%s'%(word_count))
for doc in documents:
doc_vector = [0] * len(word_dict)
for word in doc:
# detect low or high frequency words
if word_count[word] >= low_threshold and word_count[word] <= high_threshold:
if word in word_dict: # if word in dictionary
idx = word_dict[word] # Get index
else: # Create word index into dictionary
idx = len(word_dict)
word_dict[word] = idx
if len(doc_vector) > idx:
doc_vector[idx] += 1 # add word counts
else: # A new word in the document
doc_vector.insert(idx, 1) # add one word in the index position of the word.
else: # skip low or high frequency words
pass
# Calculate TF-IDF
nw = sum(doc_vector)
for word in doc:
if word_count[word] >= low_threshold and word_count[word] <= high_threshold:
_i = word_dict[word]
doc_vector[_i] = doc_vector[_i]/nw * math.log(N/word_doc_count[word])
doc_matrix.append(doc_vector)
        # Initialize a matrix with rows = number of reviews and columns = vocabulary size.
_m = np.zeros([len(doc_matrix),len(word_dict)])
for i,doc in enumerate(doc_matrix):
_m[i][0:len(doc)] = doc
self.word_dict = word_dict
self.document_matrix = np.matrix(_m)
self.word_doc_count = word_doc_count
if DEBUG > 0 : print (self.document_matrix)
return self.word_dict, self.document_matrix, self.word_doc_count
class word_counts(object):
def __init__(self, LOW_FREQ_OBSERVATION_THRESHOLD = None, HIGH_FREQ_OBSERVATION_THRESHOLD = None):
self.word_dict = collections.OrderedDict() # dictionary for each word
self.document_matrix = [] # Store vector of documents
self.low_freq_threshold = LOW_FREQ_OBSERVATION_THRESHOLD
self.high_freq_threshold = HIGH_FREQ_OBSERVATION_THRESHOLD
def get_dictionary_n_document_matrix(self, documents):
word_dict = self.word_dict
doc_matrix = self.document_matrix
high_threshold = self.high_freq_threshold
low_threshold = self.low_freq_threshold
word_count = {} #{'word1': count1, 'word2': count2,...}
# Build word counts dictionary
for doc in documents:
doc_vector = []
for word in doc:
word_count[word] = word_count.get(word, 0) + 1
if DEBUG > 0 : print('Word_count=%s'%(word_count))
for doc in documents:
doc_vector = [0] * len(word_dict)
for word in doc:
# detect low or high frequency words
if word_count[word] >= low_threshold and word_count[word] <= high_threshold:
if word in word_dict: # if word in dictionary
idx = word_dict[word] # Get index
else: # Create word index into dictionary
idx = len(word_dict)
word_dict[word] = idx
if len(doc_vector) > idx:
doc_vector[idx] += 1 # add word counts
else: # A new word in the document
doc_vector.insert(idx, 1) # add one word in the index position of the word.
else: # skip low or high frequency words
pass
doc_matrix.append(doc_vector)
        # Initialize a matrix with rows = number of reviews and columns = vocabulary size.
_m = np.zeros([len(doc_matrix),len(word_dict)])
for i,doc in enumerate(doc_matrix):
_m[i][0:len(doc)] = doc
self.word_dict = word_dict
self.document_matrix = np.matrix(_m)
if DEBUG > 0 : print (self.document_matrix)
return self.word_dict, self.document_matrix
class Perceptron(object):
    '''
    Algorithm: PerceptronTrain(D, MaxIter)
    1: w_d ← 0, for all d = 1 ... D # initialize weights
    2: b ← 0 # initialize bias
    3: for iter = 1 ... MaxIter do
    4:   for all (x, y) ∈ D do
    5:     a ← ∑_{d=1..D} w_d x_d + b # compute activation for this example
    6:     if y a ≤ 0 then
    7:       w_d ← w_d + y x_d, for all d = 1 ... D # update weights
    8:       b ← b + y # update bias
    9:     end if
    10:  end for
    11: end for
    12: return w_0, w_1, ..., w_D, b
    Algorithm: PerceptronTest(w_0, w_1, ..., w_D, b, x̂)
    1: a ← ∑_{d=1..D} w_d x̂_d + b # compute activation for the test example
    2: return sign(a)
    Algorithm: AveragedPerceptronTrain(D, MaxIter)
    1: w ← <0, 0, ..., 0>, b ← 0 # initialize weights and bias
    2: u ← <0, 0, ..., 0>, β ← 0 # initialize cached weights and bias
    3: c ← 1 # initialize example counter to one
    4: for iter = 1 ... MaxIter do
    5:   for all (x, y) ∈ D do
    6:     if y(w · x + b) ≤ 0 then
    7:       w ← w + y x # update weights
    8:       b ← b + y # update bias
    9:       u ← u + y c x # update cached weights
    10:     β ← β + y c # update cached bias
    11:    end if
    12:    c ← c + 1 # increment counter regardless of update
    13:  end for
    14: end for
    15: return w - (1/c) u, b - (1/c) β # return averaged weights and bias
    '''
def __init__ (self, algorithm=None, iter=None):
self.iteration = iter
self.set_algorithm(algorithm)
self.weights = None
self.bias = None
def set_algorithm(self, algorithm):
if algorithm == 'vanilla':
self.execute = self.vanilla_perceptron
elif algorithm == 'averaged':
self.execute = self.averaged_perceptron
else:
pass
def load_model(self, weights, bias):
self.weights = np.array(weights)
self.bias = bias
def get_fold_index(self, len_data, folds):
dataset_split = []
sample_index = [i for i in range(len_data)]
fold_size = int(len_data / folds)
for i in range(folds):
fold = []
while len(fold) < fold_size:
index = randrange(len(sample_index))
# index = 0 #If follow the original data sequence
fold.append(sample_index.pop(index))
dataset_split.append(fold)
return dataset_split
def get_kfolds_training_validate_sets(self, iter, folds_sample):
'''
        For folds = 3, folds_sample = [[7, 1, 3], [5, 8, 0], [2, 4, 6]]
'''
folds = len(folds_sample)
training_index = []
validate_index = []
if folds == 1:
training_index = folds_sample[folds-1]
if DEBUG > 1: print(training_index)
else:
for i in range(folds):
if i != iter%folds:
training_index.extend(folds_sample[i])
else:
validate_index.extend(folds_sample[i])
if DEBUG > 1: print('iter=%d, training=%s'%(iter, training_index))
if DEBUG > 1: print('iter=%d, validate=%s'%(iter, validate_index))
return training_index, validate_index
def vanilla_perceptron(self, training, training_labels, validate=None, validate_labels=None, folds=None, patient=0):
        '''
        The vanilla perceptron training algorithm.
        Algorithm: PerceptronTrain(D, MaxIter)
        1: w_d ← 0, for all d = 1 ... D # initialize weights
        2: b ← 0 # initialize bias
        3: for iter = 1 ... MaxIter do
        4:   for all (x, y) ∈ D do
        5:     a ← ∑_{d=1..D} w_d x_d + b # compute activation for this example
        6:     if y a ≤ 0 then
        7:       w_d ← w_d + y x_d, for all d = 1 ... D # update weights
        8:       b ← b + y # update bias
        9:     end if
        10:  end for
        11: end for
        12: return w_0, w_1, ..., w_D, b
        Algorithm: PerceptronTest(w_0, w_1, ..., w_D, b, x̂)
        1: a ← ∑_{d=1..D} w_d x̂_d + b # compute activation for the test example
        2: return sign(a)
        '''
X = training
Y = training_labels
max_f1 = 0
_no_improve = 0
max_weights = None
max_bias = None
weights = np.zeros(shape=(1, X.shape[1])) # Get the number of vocabularies
bias = 0
training_index, validate_index = 0, 0
if folds is not None and folds > 1: #K-folds approach
folds_sample = self.get_fold_index(len(X), folds)
self.iteration = folds
# print (folds_sample)
for _it in range(self.iteration):
if DEBUG > 0: print('iteration=%d'%(_it))
if folds is not None: #K-folds approach
if folds == 1: # random select data from training set
folds_sample = self.get_fold_index(len(X), folds)
training_index, validate_index = self.get_kfolds_training_validate_sets( _it, folds_sample)
else: # general training / validate set approach
training_index, validate_index = [i for i in range(len(training))], [i for i in range(len(validate))]
for i in training_index:
x = X[i]
y = Y[i]
_a = np.dot(weights, x.transpose()) + bias
if y*_a <= 0:
weights = weights + np.dot(y, x)
bias = bias + y
if DEBUG > 1 : print('iteration:%d, y=%f, _a=%f, x=%s, b=%f, w=%s'%(_it, y, _a, str(x), bias, str(weights)))
self.weights = weights
self.bias = bias
if (folds is not None and folds > 1):
vX = [X[i] for i in validate_index]
vY = [Y[i] for i in validate_index]
elif validate is not None and len(validate) > 0 and validate_labels is not None and len(validate_labels) > 0:
vX = validate
vY = validate_labels
else:
vX = X
vY = Y
max_f1, _avg_f1, max_weights, max_bias, _no_improve = self.cross_validate(vX, vY, max_f1, max_weights, max_bias, _no_improve)
if _no_improve >= patient and (folds is None or _it > folds): # Make sure k-folds are passed
if DEBUG > 0: print('Escape: _final_avg_f1=%f, max_avg_f1=%f\n'%(_avg_f1, max_f1))
break
self.weights = max_weights
self.bias = max_bias
# return [self.weights.tolist(), self.bias.tolist()]
return self.weights, self.bias
def averaged_perceptron(self, training, training_labels, validate=None, validate_labels=None, folds=1, patient=0):
        '''
        The averaged perceptron training algorithm.
        Algorithm: AveragedPerceptronTrain(D, MaxIter)
        1: w ← <0, 0, ..., 0>, b ← 0 # initialize weights and bias
        2: u ← <0, 0, ..., 0>, β ← 0 # initialize cached weights and bias
        3: c ← 1 # initialize example counter to one
        4: for iter = 1 ... MaxIter do
        5:   for all (x, y) ∈ D do
        6:     if y(w · x + b) ≤ 0 then
        7:       w ← w + y x # update weights
        8:       b ← b + y # update bias
        9:       u ← u + y c x # update cached weights
        10:     β ← β + y c # update cached bias
        11:    end if
        12:    c ← c + 1 # increment counter regardless of update
        13:  end for
        14: end for
        15: return w - ((1/c) * u), b - ((1/c) * β) # return averaged weights and bias
        '''
X = training
Y = training_labels
max_f1 = 0
_no_improve = 0
max_weights = None
max_bias = None
weights = np.zeros(shape=(1, X.shape[1])) # Get the number of vocabularies
bias = 0
training_index, validate_index = 0, 0
u = np.zeros(shape=(1, X.shape[1])) # Get the number of vocabularies
b = 0
c = 1
if folds is not None and folds >1: #K-folds approach
folds_sample = self.get_fold_index(len(X), folds)
self.iteration = folds
for _it in range(self.iteration):
if DEBUG > 0: print('iteration=%d'%(_it))
if folds is not None: #K-folds approach
if folds == 1: # random select data from training set
folds_sample = self.get_fold_index(len(X), folds)
training_index, validate_index = self.get_kfolds_training_validate_sets( _it, folds_sample)
else: # general training / validate set approach
training_index, validate_index = [i for i in range(len(training))], [i for i in range(len(validate))]
for i in training_index:
x = X[i]
y = Y[i]
_a = np.dot(weights, x.transpose()) + bias
if y*_a <= 0:
weights = weights + np.dot(y, x)
bias = bias + y
u = u + np.dot(y*c, x)
b = b + y*c
c += 1
                if DEBUG > 1 : print('iteration:%d, y=%f, _a=%f, x=%s, b=%f, w=%s'%(_it, y, _a, str(x), bias, str(weights)))
self.weights = weights-(1/c)*u
self.bias = bias-(1/c)*b
if (folds is not None and folds > 1):
vX = [X[i] for i in validate_index]
vY = [Y[i] for i in validate_index]
elif validate is not None and len(validate) > 0 and validate_labels is not None and len(validate_labels) > 0:
vX = validate
vY = validate_labels
else:
vX = X
vY = Y
max_f1, _avg_f1, max_weights, max_bias, _no_improve = self.cross_validate(vX, vY, max_f1, max_weights, max_bias, _no_improve)
if _no_improve >= patient and (folds is None or _it > folds): # Make sure k-folds are passed
if DEBUG > 0: print('Escape: _final_avg_f1=%f, max_avg_f1=%f\n'%(_avg_f1, max_f1))
break
self.weights = max_weights
self.bias = max_bias
# return [self.weights.tolist(), self.bias.tolist()]
return self.weights, self.bias
def cross_validate(self, vX, vY, max_f1, max_weights, max_bias, no_improve):
pY = self.predict(vX)
_avg_f1, _ = classification_report(vY, pY, print_results=False)
if (_avg_f1 - max_f1) >= CONVERAGE:
max_f1 = _avg_f1
max_weights = self.weights
max_bias = self.bias
no_improve = 0
if DEBUG > 0: print(' _avg_f1=%f'%(_avg_f1))
elif (_avg_f1 - max_f1) < CONVERAGE:
no_improve += 1
if DEBUG > 0: print('May skip _avg_f1=%f, no_improve=%d'%(_avg_f1, no_improve))
# elif (_avg_f1 - max_f1) == 0 : # No improve in avg f1, but the weights and bias may more reliable
# max_weights = self.weights
# max_bias = self.bias
# no_improve += 1
# if DEBUG > 0: print('May skip _avg_f1=%f, no_improve=%d'%( _avg_f1, no_improve))
else:
pass
return max_f1, _avg_f1, max_weights, max_bias, no_improve
def predict(self, data, class_dict=None):
'''
data = multiple test cases
class_dict to convert digital results to text classifications
Algorithm: PerceptronTest(w0, w1, ..., wD, b, ˆx)
1: a ← ∑D d=1 wd xˆ_d + b # compute activation for the test example
2: return sign(a)
'''
weights = self.weights
bias = self.bias
Y = []
X = data
for x in X:
_a = np.dot(weights, x.transpose()) + bias
            if _a <= 0:
                y = -1
            else:
                y = 1
            if class_dict is not None:
                y = class_dict[y]
Y.append(y)
return Y
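# A minimal, self-contained sketch of the AveragedPerceptronTrain pseudocode from
# the class docstring above, written against plain numpy arrays. It is illustrative
# only and is not called anywhere in this program; the Perceptron class is the
# implementation the learner actually uses.
def averaged_perceptron_sketch(X, Y, max_iter=ITERATION):
    '''X: (n_examples, n_features) float array; Y: (n_examples,) array of +1/-1.'''
    w = np.zeros(X.shape[1])   # weights
    b = 0.0                    # bias
    u = np.zeros(X.shape[1])   # cached weights
    beta = 0.0                 # cached bias
    c = 1                      # example counter
    for _ in range(max_iter):
        for x, y in zip(X, Y):
            if y * (np.dot(w, x) + b) <= 0:  # misclassified: update
                w += y * x
                b += y
                u += y * c * x
                beta += y * c
            c += 1             # increment regardless of update
    return w - u / c, b - beta / c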
class tokenizer(object):
def __init__(self, stopword = STOP_WORDS, punctuation = PUNCTUATION):
self.stop_words = stopword
self.punctuation = punctuation
def get_wordlist(self, sentence, ascii_only=True, remove_stopwords=False, remove_punctuation=False ):
# Function to convert a document to a sequence of words,
# optionally removing stop words. Returns a list of words.
        # Remove non-letters; comment out this filter to try a different approach. ####
if ascii_only:
sentence = re.sub("[^a-zA-Z]"," ", sentence)
else:
pass
# Convert all characters to lower case and split them
words = sentence.lower().split()
# Optionally remove stop words (false by default)
if remove_stopwords and remove_punctuation:
wordlist = [w for w in words if (not w in self.stop_words and not w in self.punctuation)]
elif remove_stopwords:
wordlist = [w for w in words if (not w in self.stop_words)]
elif remove_punctuation:
wordlist = [w for w in words if (not w in self.punctuation)]
else:
wordlist = words
# Return a word list
return wordlist
# Define a function to split a review into parsed sentences
    def document_to_sentences(self, document, ascii_only=True, remove_stopwords=False, remove_punctuation=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words.
        #
        # Naively split the document into sentences on periods.
        raw_sentences = document.rstrip('\n').split('.')
#
# Loop over each sentence
sentences = []
for _sentence in raw_sentences:
# If a sentence is empty, skip it
if len(_sentence) > 0:
# Otherwise, call review_to_wordlist to get a list of words
                sentences.append(self.get_wordlist(_sentence, ascii_only, remove_stopwords, remove_punctuation ))
else:
pass
# Return the list of sentences which are lists of words,
return sentences
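# Illustrative use of the tokenizer (the exact output depends on the active
# STOP_WORDS and PUNCTUATION lists):
#   tokenizer().get_wordlist('The room was GREAT!!', remove_stopwords=True, remove_punctuation=True)
# lowercases the text, strips non-letters, and drops stop words, leaving
# something like ['room', 'great'].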
def classification_report(truth_list, predict_list, print_results=False):
'''
results = {class1:{TP:count, FP: count, FN:count}, ...}
'''
count_results = {}
score_results = {}
avg_f1 = 0
for i, predict in enumerate(predict_list):
if predict == truth_list[i]:
_tmp_dict = count_results.get(predict, {'TP': 0})
_tmp_dict['TP']=_tmp_dict.get('TP', 0) + 1
count_results[predict] = _tmp_dict
elif predict != truth_list[i]:
_tmp_dict = count_results.get(predict, {'FP': 0})
_tmp_dict['FP']=_tmp_dict.get('FP', 0) + 1
count_results[predict] = _tmp_dict
_tmp_dict = count_results.get(truth_list[i], {'FN': 0})
_tmp_dict['FN']=_tmp_dict.get('FN', 0) + 1
count_results[truth_list[i]] = _tmp_dict
for key, result in count_results.items():
precision = result.get('TP', 0)/(result.get('TP', 0)+result.get('FP', 1))
support = result.get('TP', 0)+result.get('FN', 0)
recall = result.get('TP', 0)/support
if (precision+recall) == 0:
F1 = 0
else:
F1 = 2*precision*recall/(precision+recall)
score_results[key] = {'precision': precision, 'recall': recall, 'f1-score': F1, 'support': support}
if print_results == True: print ('class:%s, precision=%f, recall=%f, f1-score=%f, support=%d'%( key, precision, recall, F1, support ))
total_f1 = 0
total_support = 0
for key, result in score_results.items():
total_f1 += result['f1-score']*result['support']
total_support += result['support']
if total_support != 0:
avg_f1 = total_f1/total_support
else:
avg_f1 = None
if print_results == True: print ('average f1=%f'%(avg_f1))
return avg_f1, score_results
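# Worked example (hypothetical labels): for truth [1, -1, 1, 1] and predictions
# [1, 1, 1, -1], class 1 gets TP=2, FP=1, FN=1 (precision = recall = f1 = 2/3,
# support = 3) and class -1 gets TP=0, FP=1, FN=1 (f1 = 0, support = 1), so the
# support-weighted average f1 is (2/3 * 3 + 0 * 1) / 4 = 0.5:
#   avg_f1, scores = classification_report([1, -1, 1, 1], [1, 1, 1, -1])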
def train_test_split(X, Y, test_rate=0.1):
training_size = int(len(X)*(1-test_rate))
return X[0:training_size, :], X[training_size:, :], Y[0:training_size], Y[training_size:]
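# Illustrative: with 100 rows and test_rate=0.1, the first 90 rows/labels form the
# training split and the last 10 form the holdout split (no shuffling is done here).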
def save_model(model_file_name, model):
#Save the model to model_file_name
with open(model_file_name, 'w', encoding='utf-8') as fp:
json.dump(model, fp , indent=1, ensure_ascii=False)
'''
Main program: learn the perceptron models from the training data and write the model parameters to files.
'''
def main(input_doc):
parameters_list = [] #[class1[weights, bias], class2[weights, bias]]
test_ratio = TEST_RATIO
if PRINT_TIME : print ('perceplearn.get_input=>Start=>%s'%(str(datetime.now())))
document = get_input(input_doc)
##################
if DEBUG > 1: print_list(document)
    if PRINT_TIME : print ('perceplearn.get_feature_matrix=>Start=>%s'%(str(datetime.now())))
word_dict, document_matrix, classes_list, label, word_doc_count = get_feature_matrix(document)
    # Train the vanilla perceptron model.
p = Perceptron('vanilla', ITERATION)
parameters_list = [] #[class1[weights, bias], class2[weights, bias]]
for i, class_label in enumerate(classes_list):
if DEBUG > 0: print ('vanilla: class_label=%s'%(class_label))
# Prepare data
training_index, validate_index, training_label, validate_label = train_test_split( document_matrix, np.array(label)[:, i], test_rate=test_ratio)
weights1, bias1 = p.execute(training_index, training_label, validate_index, validate_label, folds=None, patient=PATIENT)
weights2, bias2 = p.execute(document_matrix, np.array(label)[:, i], None, None, folds=1, patient=PATIENT)
weights3, bias3 = p.execute(document_matrix, np.array(label)[:, i], None, None, folds=FOLDS, patient=PATIENT)
weights = (weights1+weights2+weights3)/3
bias = (bias1+bias2+bias3)/3
parameters_list.append( [weights.tolist(), bias.tolist()] )
save_model(VANILLA_MODEL_FILE_NAME, [word_dict, parameters_list, classes_list, word_doc_count])
    # Train the averaged perceptron model.
p.set_algorithm('averaged')
parameters_list = []
for i, class_label in enumerate(classes_list):
if DEBUG > 0: print ('averaged: class_label=%s'%(class_label))
training_index, validate_index, training_label, validate_label = train_test_split( document_matrix, np.array(label)[:, i], test_rate=test_ratio)
# parameters_list.append(p.execute(training_index, training_label, validate_index, validate_label, folds=FOLDS, patient=PATIENT))
weights1, bias1 = p.execute(training_index, training_label, validate_index, validate_label, folds=None, patient=PATIENT)
weights2, bias2 = p.execute(document_matrix, np.array(label)[:, i], None, None, folds=1, patient=PATIENT)
weights3, bias3 = p.execute(document_matrix, np.array(label)[:, i], None, None, folds=FOLDS, patient=PATIENT)
weights = (weights1+weights2+weights3)/3
bias = (bias1+bias2+bias3)/3
parameters_list.append( [weights.tolist(), bias.tolist()] )
save_model(AVERAGED_MODEL_FILE_NAME, [word_dict, parameters_list, classes_list, word_doc_count])
    if PRINT_TIME : print ('perceplearn.main=>End=>%s'%(str(datetime.now())))
if __name__ == '__main__':
'''
Main program.
1. Read the training file from train-labeled.txt as default.
2. Using each vocabulary as features
3. Construct dictionary and word counting for each review.
3-1. Find total vocabulary we have from those reviews and create document vector for each review.
    word_dict = {'word0': 0, 'word1': 1, ..., 'wordn': n} where 0 ... n are the indices for word0 ... wordn.
3-2. Construct document matrix
document_matrix = [
[0, 0, 1, 5, ...1, 0] # First document / review with word distribution. There are 1 word2, 5 word3, ... 1 wordn-1
[1, 0, 0, 1, ...0, 0] # Second document / review with word distribution. There are 1 word0, 1 word3, ...
[2, 10, 1, 2, ...3, 0]
...
[0, 11, 0, 0, ...9, 1]
]
    4. Perform two different perceptron algorithms (vanilla and averaged) to calculate weights and bias.
    This program uses a bagging approach to get better performance.
    Three training approaches:
    1. FOLDS = None for training on the data without shuffling.
    2. FOLDS = 1 for randomly sampled data without a validation set.
    3. FOLDS = 10 for the k-fold algorithm.
Stop criteria:
1. ITERATION = 30
2. CONVERAGE = 0.0001
3. PATIENT = 5
5. Store the network weights and bias into vanillamodel.txt or averagedmodel.txt for percepclassify3.py to perform classification tasks.
'''
# Get input and output parameters
if len(sys.argv) != 2:
print('Usage: ' + sys.argv[0] + ' /path/to/inputfile ')
sys.exit(1)
seed(SEED)
# Assign the input and output variables
INPUT_FILE = sys.argv[1]
main (INPUT_FILE)