Skip to content

Commit

Permalink
Merge pull request #55 from Financial-Times/feature/elasticsearch7-up…
Browse files Browse the repository at this point in the history
…date

Upgrade to ES v7
  • Loading branch information
atanasdinov committed Sep 7, 2022
2 parents 382056d + 15700b7 commit a0ae8fd
Show file tree
Hide file tree
Showing 22 changed files with 899 additions and 4,703 deletions.
2 changes: 1 addition & 1 deletion _ft/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ FROM peteclarkft/ersatz:0.0.4
COPY ./ersatz-fixtures.yml /_ft/ersatz-fixtures.yml

WORKDIR /
CMD [ "/ersatz" ]
CMD [ "/ersatz", "--fixtures", "/_ft/ersatz-fixtures.yml", "--port", "9000" ]

4 changes: 2 additions & 2 deletions _ft/ersatz-fixtures.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ fixtures:
/:
get:
status: 200
/_cluster/health/:
/_cluster/health:
get:
body:
cluster_name: "070529446553:upp-sapi-v1-k8s-dev"
Expand All @@ -19,4 +19,4 @@ fixtures:
number_of_pending_tasks: 0
headers:
content-type: application/json
status: 200
status: 200
14 changes: 7 additions & 7 deletions cmd/content-rw-elasticsearch/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ import (
"os"
"time"

"github.com/Financial-Times/content-rw-elasticsearch/v2/pkg/concept"
"github.com/Financial-Times/content-rw-elasticsearch/v2/pkg/config"
"github.com/Financial-Times/content-rw-elasticsearch/v2/pkg/es"
"github.com/Financial-Times/content-rw-elasticsearch/v2/pkg/health"
pkghttp "github.com/Financial-Times/content-rw-elasticsearch/v2/pkg/http"
"github.com/Financial-Times/content-rw-elasticsearch/v2/pkg/mapper"
"github.com/Financial-Times/content-rw-elasticsearch/v2/pkg/message"
"github.com/Financial-Times/content-rw-elasticsearch/v4/pkg/concept"
"github.com/Financial-Times/content-rw-elasticsearch/v4/pkg/config"
"github.com/Financial-Times/content-rw-elasticsearch/v4/pkg/es"
"github.com/Financial-Times/content-rw-elasticsearch/v4/pkg/health"
pkghttp "github.com/Financial-Times/content-rw-elasticsearch/v4/pkg/http"
"github.com/Financial-Times/content-rw-elasticsearch/v4/pkg/mapper"
"github.com/Financial-Times/content-rw-elasticsearch/v4/pkg/message"
"github.com/Financial-Times/go-logger/v2"
"github.com/Financial-Times/kafka-client-go/v3"
cli "github.com/jawher/mow.cli"
Expand Down
82 changes: 82 additions & 0 deletions configs/migrate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash

echo "Don't run this script, is not totally compatible anymore, but is useful as documentation for the relevant changes"
exit 5
# Everything below the `exit 5` is intentionally unreachable; it is kept as
# documentation of the schema changes that were applied.
index_name="combinedpostpublicationevents"
# Resolve the script directory to an absolute path so that "$file_path" stays
# valid regardless of the directory the script is invoked from.
script_dir=$(cd -- "$(dirname -- "$0")" && pwd) || exit 1
file_path="$script_dir/referenceSchema.json"
# Make sure the script is applied to the exact revision of the schema file it
# was written for; the text-based edits below are not portable to other
# revisions. The check is a string prefix match: abbreviated hashes are not
# integers (so `-ne` cannot compare them) and %h may print more than 7 digits.
schema_revision=$(git -C "$script_dir" log --follow -1 --pretty=format:"%h" -- referenceSchema.json)
if [[ "$schema_revision" != 9781498* ]]; then
  echo "This script was created to migrate $file_path from revision 9781498"
  exit 1
fi
# Discard local modifications so the migration starts from the committed file.
git -C "$script_dir" restore -- referenceSchema.json || exit 1

#######################################
# Set .settings.index.mapping.total_fields.limit in a JSON schema file.
# Arguments:
#   $1 - path to the JSON file; rewritten in place
#   $2 - the limit, as a JSON number (defaults to 12000)
# Returns:
#   0 on success; non-zero if jq or mv fails. When jq fails the file is left
#   untouched and the temporary "<file>_2" is removed.
#######################################
set_total_fields_limit() {
  local file_path=$1
  local limit=${2:-12000}
  local tmp_path="${file_path}_2"

  # --argjson binds $limit as a JSON number (a plain --arg would bind a string).
  if ! jq --argjson limit "$limit" \
    '.settings.index += {mapping:{total_fields:{limit:$limit}}}' \
    < "$file_path" > "$tmp_path"; then
    rm -f -- "$tmp_path"
    return 1
  fi
  mv -- "$tmp_path" "$file_path"
}

#######################################
# Strip every space and newline character from a file, in place.
# This compacts the JSON so the literal sed patterns used elsewhere in this
# script match. NOTE: spaces inside JSON string values are removed as well.
# Requires GNU sed (-i and -z).
# Arguments:
#   $1 - path to the file to compact
# Returns:
#   exit status of sed
#######################################
remove_spaces() {
  local path=$1
  # -z makes sed treat the input as NUL-separated, so newlines become ordinary
  # characters that a substitution can delete.
  sed -i -z -e 's/ //g' -e 's/\n//g' -- "$path"
}

# `_all` field
# Deprecated in 6.0.0.
# `_all` may no longer be enabled for indices created in 6.0+, use a custom field and the mapping copy_to parameter
# The _all field is a special catch-all field which concatenates the values of all of the other fields into one big
# string, using space as a delimiter, which is then analyzed and indexed, but not stored. This means that it can be
# searched, but not retrieved.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/copy-to.html#copy-to
#######################################
# Replace the removed `_all` / `include_in_all` mechanism with an explicit
# `meta_all_fields` text field that other fields copy into via `copy_to`.
# Steps:
#   1. add "meta_all_fields" to the "properties" of every object that has
#      "properties" and an "_all" with enabled == true;
#   2. delete the "_all" key from every object;
#   3. compact the file and rewrite `,"include_in_all":true` to
#      `,"copy_to":"meta_all_fields"`, dropping `,"include_in_all":false`.
# The sed patterns include the leading comma, so an include_in_all that is the
# first key of its object is not rewritten.
# Arguments:
#   $1 - path to the JSON schema file; rewritten in place
# Returns:
#   0 on success; non-zero if any step fails. The schema file is only replaced
#   after both jq passes succeed, and no temporary file is left behind.
#######################################
replace_include_in_all() {
  local file_path=$1
  local stage1_path="${file_path}_2"
  local stage2_path="${file_path}_3"

  if jq '(..|objects|select(has("_all") and has("properties") and (._all.enabled == true)).properties) += {"meta_all_fields":{"type":"text","index":true}}' \
      < "$file_path" > "$stage1_path" \
    && jq 'walk(if type == "object" and has("_all") then del(._all) else . end)' \
      < "$stage1_path" > "$stage2_path"; then
    rm -f -- "$stage1_path"
    mv -- "$stage2_path" "$file_path" || return
  else
    rm -f -- "$stage1_path" "$stage2_path"
    return 1
  fi

  # The sed patterns below only match whitespace-free JSON.
  remove_spaces "$file_path" || return
  sed -i \
    -e 's/,"include_in_all":true/,"copy_to":"meta_all_fields"/g' \
    -e 's/,"include_in_all":false//g' \
    -- "$file_path"
}

# Elasticsearch 5.0 removed the string type. The background for this change is that we think the string type is confusing:
# Elasticsearch has two very different ways to search strings. You can either search whole values, which we often refer to as keyword search,
# or individual tokens, which we usually refer to as full-text search. The former strings should be mapped as a not_analyzed string while the latter
# should be mapped as an analyzed string.
# But the fact that the same field type is used for these two very different use cases causes problems, since some options only make sense for
# one of the use cases. For instance, position_increment_gap makes little sense for a not_analyzed string, and it is not obvious whether ignore_above
# applies to the whole value or to individual tokens in the case of an analyzed string (in case you wonder: it does apply to the whole value; limits
# on individual tokens can be applied with the limit token filter).
# To avoid these issues, the string field has been split into two new types: text, which should be used for full-text search, and keyword, which should
# be used for keyword search.
# https://www.elastic.co/blog/strings-are-dead-long-live-strings
#######################################
# Replace the removed "string" field type with "text" / "keyword".
#   "type":"string","index":"analyzed"     -> "type":"text","index":true
#   "type":"string","index":"not_analyzed" -> "type":"keyword","index":true
#   any remaining "type":"string"          -> "type":"text"
# The file is compacted first because the patterns only match whitespace-free
# JSON with "index" directly following "type".
# Globals:
#   file_path (read) - used only when no argument is given
# Arguments:
#   $1 - path to the JSON schema file; rewritten in place
#        (optional, defaults to the global $file_path)
# Returns:
#   0 on success; non-zero if compaction or sed fails
#######################################
change_type_string() {
  local path=${1:-$file_path}
  remove_spaces "$path" || return
  # Order matters: the two specific rewrites must run before the catch-all.
  sed -i \
    -e 's/"type":"string","index":"analyzed"/"type":"text","index":true/g' \
    -e 's/"type":"string","index":"not_analyzed"/"type":"keyword","index":true/g' \
    -e 's/"type":"string"/"type":"text"/g' \
    -- "$path"
}

#######################################
# Wrap the current value of .mappings in a "properties" object, i.e.
# {"mappings": X} becomes {"mappings": {"properties": X}}.
# Arguments:
#   $1 - path to the JSON schema file; rewritten in place
# Returns:
#   0 on success; non-zero if jq or mv fails. When jq fails the file is left
#   untouched and the temporary "<file>_2" is removed.
#######################################
move_mappings_fields_to_a_nested_properties_object() {
  local file_path=$1
  local tmp_path="${file_path}_2"

  if ! jq '.mappings |= { properties: .}' < "$file_path" > "$tmp_path"; then
    rm -f -- "$tmp_path"
    return 1
  fi
  mv -- "$tmp_path" "$file_path"
}

#######################################
# Pretty-print a JSON file in place by passing it through `jq .`.
# Globals:
#   file_path (read) - used only when no argument is given
# Arguments:
#   $1 - path to the JSON file; rewritten in place
#        (optional, defaults to the global $file_path)
# Returns:
#   0 on success; non-zero if jq or mv fails. When jq fails the file is left
#   untouched and the temporary "<file>_2" is removed.
#######################################
reformat_json() {
  local path=${1:-$file_path}
  local tmp_path="${path}_2"

  if ! jq '.' < "$path" > "$tmp_path"; then
    rm -f -- "$tmp_path"
    return 1
  fi
  mv -- "$tmp_path" "$path"
}

# Apply the migration steps in order. Every step rewrites "$file_path" in
# place, so stop at the first failure instead of uploading a half-migrated
# schema to Elasticsearch.
set_total_fields_limit "$file_path" 12000 || exit 1
move_mappings_fields_to_a_nested_properties_object "$file_path" || exit 1
replace_include_in_all "$file_path" || exit 1
change_type_string "$file_path" || exit 1
reformat_json "$file_path" || exit 1
# exit 0
# Create the index from the migrated schema on the local cluster, then print
# the mapping that Elasticsearch stored for it.
curl -X PUT "http://localhost:9200/$index_name?pretty" -H 'Content-Type: application/json' --data @"$file_path"
curl "http://localhost:9200/$index_name/_mapping?pretty" | jq .

0 comments on commit a0ae8fd

Please sign in to comment.