rank/rank.go

package rank

// Rank struct contains every original raw sentences, words, tokens, phrases,
// indexes, word hits, phrase hits and minimum-maximum values.
//
// Max is the occurrence of the most used word.
//
// Min is the occurrence of the less used word. It is always greater then 0.
//
// Relation is the Relation object, contains phrases.
//
// SentenceMap contains raw sentences. Index is the sentence ID, value is the
// sentence itself.
//
// Words contains Word objects. Index is the word ID, value is the word/token
// itself.
//
// WordValID contains words. Index is the word/token, value is the ID.
type Rank struct {
	Max         float32
	Min         float32
	Relation    Relation
	SentenceMap map[int]string
	Words       map[int]*Word
	WordValID   map[string]int
}

// Word struct contains all data about the words.
//
// If a word is multiple times in the text then the multiple words point to the
// same ID. So Word is unique.
//
// SentenceIDs contains all IDs of sentences what contain the word.
//
// ConnectionLeft contains all words what are connected to this word on the left
// side. The map index is the ID of the related word and its value is the
// occurrence.
//
// ConnectionRight contains all words what are connected to this word on the
// right side. The map index is the ID of the related word and its value is the
// occurrence.
//
// Token is the word itself, but not the original, it is tokenized.
//
// Qty is the number of occurrence of the word.
//
// Weight is the weight of the word between 0.00 and 1.00.
type Word struct {
	ID              int
	SentenceIDs     []int
	ConnectionLeft  map[int]int
	ConnectionRight map[int]int
	Token           string
	Qty             int
	Weight          float32
}

// NewRank constructor retrieves a Rank pointer.
func NewRank() *Rank {
	return &Rank{
		0,
		0,
		Relation{
			0,
			0,
			make(map[int]map[int]Score),
		},
		make(map[int]string),
		make(map[int]*Word),
		make(map[string]int),
	}
}

// IsWordExist method retrieves true when the given word is already in the rank.
func (rank *Rank) IsWordExist(word string) bool {
	_, find := rank.WordValID[word]

	return find
}

// AddNewWord method adds a new word to the rank object and it defines its ID.
func (rank *Rank) AddNewWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
	wordID = len(rank.Words)
	connectionLeft := make(map[int]int)

	if prevWordIdx >= 0 {
		connectionLeft[prevWordIdx] = 1
	}

	newWord := &Word{
		ID:              wordID,
		SentenceIDs:     []int{sentenceID},
		ConnectionLeft:  connectionLeft,
		ConnectionRight: make(map[int]int),
		Token:           word,
		Qty:             1,
		Weight:          0,
	}

	rank.Words[wordID] = newWord
	rank.WordValID[word] = wordID

	return
}

// UpdateWord method update a word what already exists in the rank object. It
// retrieves its ID.
func (rank *Rank) UpdateWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
	wordID = rank.WordValID[word]

	found := false

	for _, oldSentenceID := range rank.Words[wordID].SentenceIDs {
		if sentenceID == oldSentenceID {
			found = true
			break
		}
	}

	if !found {
		rank.Words[wordID].SentenceIDs = append(
			rank.Words[wordID].SentenceIDs,
			sentenceID,
		)
	}

	rank.Words[wordID].Qty++

	if prevWordIdx >= 0 {
		rank.Words[wordID].ConnectionLeft[prevWordIdx]++
	}

	return
}

// UpdateRightConnection method adds the right connection to the word. It always
// can be used after a word has added and the next word is known.
func (rank *Rank) UpdateRightConnection(wordID int, rightWordID int) {
	if wordID >= 0 {
		rank.Words[wordID].ConnectionRight[rightWordID]++
	}
}

// GetWordData method retrieves all words as a pointer.
func (rank *Rank) GetWordData() map[int]*Word {
	return rank.Words
}