/
rank.go
147 lines (128 loc) · 3.68 KB
/
rank.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package rank
// Rank holds the raw sentences, the unique words found in them, the lookup
// from token to word ID, the phrase relations and the minimum/maximum word
// occurrence values.
type Rank struct {
	// Max is the occurrence of the most used word and Min is the occurrence
	// of the least used word. Min is documented as always being greater than
	// 0; note that NewRank starts both values at 0.
	Max, Min float32
	// Relation is the Relation object; it contains the phrases.
	Relation Relation
	// SentenceMap contains the raw sentences. The key is the sentence ID, the
	// value is the sentence itself.
	SentenceMap map[int]string
	// Words contains the Word objects. The key is the word ID, the value is
	// the Word that carries the token.
	Words map[int]*Word
	// WordValID contains the words. The key is the word/token, the value is
	// the word ID.
	WordValID map[string]int
}
// Word describes one unique token of the text. A token that occurs several
// times is still stored as a single Word, so every occurrence shares one ID.
type Word struct {
	// ID is the identifier of the word.
	ID int
	// SentenceIDs lists the IDs of all sentences that contain the word.
	SentenceIDs []int
	// ConnectionLeft and ConnectionRight count the words connected to this
	// word on its left and on its right side respectively. The key is the ID
	// of the related word, the value is the occurrence of that connection.
	ConnectionLeft, ConnectionRight map[int]int
	// Token is the word itself in its tokenized form, not the original text.
	Token string
	// Qty is the number of occurrences of the word.
	Qty int
	// Weight is the weight of the word, between 0.00 and 1.00.
	Weight float32
}
// NewRank returns a pointer to an empty Rank. Max and Min are zero, and the
// sentence map, the word map, the token lookup and the map inside Relation
// are allocated so they can be written to right away.
func NewRank() *Rank {
	// Relation is declared elsewhere in the package, so its literal is kept
	// positional: two zero values followed by an empty score map.
	relation := Relation{0, 0, make(map[int]map[int]Score)}

	return &Rank{
		Relation:    relation,
		SentenceMap: map[int]string{},
		Words:       map[int]*Word{},
		WordValID:   map[string]int{},
	}
}
// IsWordExist reports whether the given word is already registered in the
// rank, that is, whether it is a key of WordValID.
func (rank *Rank) IsWordExist(word string) bool {
	_, known := rank.WordValID[word]

	return known
}
// AddNewWord registers word as a new Word in the rank and returns the ID
// assigned to it.
//
// The ID is the number of words stored so far. The new Word starts with a
// quantity of 1 and with sentenceID as its only sentence. When prevWordIdx is
// not negative it is recorded as the first left connection; a negative value
// leaves the left connections empty.
//
// The method does not check whether word is already known.
func (rank *Rank) AddNewWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
	wordID = len(rank.Words)

	entry := &Word{
		ID:              wordID,
		SentenceIDs:     []int{sentenceID},
		ConnectionLeft:  map[int]int{},
		ConnectionRight: map[int]int{},
		Token:           word,
		Qty:             1,
	}
	if prevWordIdx >= 0 {
		entry.ConnectionLeft[prevWordIdx] = 1
	}

	// Store the word first, then make it reachable through its token.
	rank.Words[wordID] = entry
	rank.WordValID[word] = wordID

	return wordID
}
// UpdateWord records one more occurrence of a word that already exists in the
// rank and returns its ID.
//
// sentenceID is appended to the word's sentences unless it is already listed,
// the quantity grows by one and, when prevWordIdx is not negative, the left
// connection to prevWordIdx is counted once more.
//
// The word has to be registered already: the ID lookup of an unknown word
// yields 0, so the update would be applied to whatever is stored under ID 0.
func (rank *Rank) UpdateWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
	wordID = rank.WordValID[word]
	entry := rank.Words[wordID]

	// A sentence is listed only once, however often the word occurs in it.
	listed := false
	for i := range entry.SentenceIDs {
		if entry.SentenceIDs[i] == sentenceID {
			listed = true
			break
		}
	}
	if !listed {
		entry.SentenceIDs = append(entry.SentenceIDs, sentenceID)
	}

	entry.Qty++

	if prevWordIdx >= 0 {
		entry.ConnectionLeft[prevWordIdx]++
	}

	return wordID
}
// UpdateRightConnection counts rightWordID once more as a right-side
// connection of the word with the given wordID. It can be used once a word
// has been added and the word following it is known. A negative wordID is
// ignored.
func (rank *Rank) UpdateRightConnection(wordID int, rightWordID int) {
	if wordID < 0 {
		return
	}

	rank.Words[wordID].ConnectionRight[rightWordID]++
}
// GetWordData returns the word map of the rank, keyed by word ID. The map is
// handed out as it is, not copied, and its values are pointers to the stored
// Word objects.
func (rank *Rank) GetWordData() map[int]*Word {
	words := rank.Words

	return words
}