http://www.bigdataschool.ru

<img src="logo.png" align="right">

# Настройка и установка Trino

<hr style="border:2px solid #294A70">

<h3 style="color: #294A70">Практические материалы</h3>

<div style="color:red; text-align: right"> Версия 1.02 от 10 января 2025 </div>

In [8]:
# Cluster node addresses; the rest of the notebook uses hosts[0] as the
# coordinator and hosts[1:] as the workers.
hosts = ["10.150.2.5", "10.150.2.6", "10.150.2.7"]

Trino развёрнут в этот каталог (на всех узлах):

In [9]:
# Trino installation directory on every node; all config paths below are built from it.
trPath = "/opt/trino-server-468"

In [10]:
def str2file(ss, host, target):
    """ копирует строковую переменную в файл на нужно сервере """

    with open("/tmp/strfile.txt","w") as fp:
        fp.write(ss)
    !scp /tmp/strfile.txt ubuntu@{host}:{target}


## Тестовый сервер 

Для небольших тестов нужна возможность запустить только один узел (мастер) и тестировать на нём.

Это всего лишь конфиги, поэтому вполне можем себе это позволить.

### Готовлю конфиги

In [None]:
!ssh ubuntu@{hosts[0]} mkdir {trPath}/etc

In [None]:
!ssh ubuntu@{hosts[0]} mkdir {trPath}/data

In [None]:
!ssh ubuntu@{hosts[0]} mkdir {trPath}/etc/catalog

In [None]:
# node.properties content for the single-node test server.
nodeStr = (
    "node.environment=test\n"
    "node.id=test-server\n"
    f"node.data-dir={trPath}/data\n"
)

In [None]:
# Upload the test node.properties to the first host.
str2file(nodeStr,hosts[0],f"{trPath}/etc/node.properties")

In [None]:
# Content of etc/jvm.config: one JVM option per line, newline-terminated.
javaStr = "\n".join((
    "-server",
    "-Xmx2G",
    "-XX:InitialRAMPercentage=80",
    "-XX:MaxRAMPercentage=80",
    "-XX:G1HeapRegionSize=32M",
    "-XX:+ExplicitGCInvokesConcurrent",
    "-XX:+ExitOnOutOfMemoryError",
    "-XX:+HeapDumpOnOutOfMemoryError",
    "-XX:-OmitStackTraceInFastThrow",
    "-XX:ReservedCodeCacheSize=512M",
    "-XX:PerMethodRecompilationCutoff=10000",
    "-XX:PerBytecodeRecompilationCutoff=10000",
    "-Djdk.attach.allowAttachSelf=true",
    "-Djdk.nio.maxCachedBufferSize=2000000",
    "-Dfile.encoding=UTF-8",
    "# Allow loading dynamic agent used by JOL",
    "-XX:+EnableDynamicAgentLoading",
)) + "\n"

In [None]:
# Upload jvm.config to the first host.
str2file(javaStr,hosts[0],f"{trPath}/etc/jvm.config")

In [None]:
# config.properties for the single-node test server: the first host is the
# coordinator and is also allowed to schedule work on itself.
confStr = (
    "coordinator=true\n"
    "node-scheduler.include-coordinator=true\n"
    "http-server.http.port=8080\n"
    f"discovery.uri=http://{hosts[0]}:8080\n"
    "http-server.process-forwarded=true\n"
)

In [None]:
# Upload the test config.properties to the first host.
str2file(confStr,hosts[0],f"{trPath}/etc/config.properties")

### Стартую и останавливаю

In [None]:
# Start Trino on the first host via its launcher script.
!ssh ubuntu@{hosts[0]} {trPath}/bin/launcher start

In [None]:
# Ask the launcher on the first host for the server status.
!ssh ubuntu@{hosts[0]} {trPath}/bin/launcher status

In [None]:
# Restart Trino on the first host (e.g. after changing configs or catalogs).
!ssh ubuntu@{hosts[0]} {trPath}/bin/launcher restart

In [None]:
# Stop Trino on the first host.
!ssh ubuntu@{hosts[0]} {trPath}/bin/launcher stop

## Кластер

Полноценный кластер. Отличия от тестового сервера:

* конфиги чуть другие и везде
    * config.properties: роли
    * node.properties: идентификация узлов
* стартовать надо тоже везде

In [None]:
for host in hosts[1:]:
    !ssh ubuntu@{host} mkdir {trPath}/etc
    !ssh ubuntu@{host} mkdir {trPath}/data
    !ssh ubuntu@{host} mkdir {trPath}/etc/catalog

In [None]:
# Write node.properties on every node: the first host gets node.id "master",
# the others "worker-<index>".
for i, host in enumerate(hosts):
    hName = "master" if i == 0 else f"worker-{i}"
    nodeStr = (
        "node.environment=prod\n"
        f"node.id={hName}\n"
        f"node.data-dir={trPath}/data\n"
    )
    str2file(nodeStr, host, f"{trPath}/etc/node.properties")

In [None]:
# Write config.properties on every node. Only the first host is a coordinator
# (and it does not schedule work on itself); all nodes share the same HTTP
# settings and point discovery at the coordinator.
for i, host in enumerate(hosts):
    confStr = (
        "coordinator=true\n"
        "node-scheduler.include-coordinator=false\n"
    ) if i == 0 else "coordinator=false\n"
    confStr += (
        "http-server.http.port=8080\n"
        "http-server.process-forwarded=true\n"
        f"discovery.uri=http://{hosts[0]}:8080\n"
    )
    str2file(confStr, host, f"{trPath}/etc/config.properties")

In [None]:
# Upload the same jvm.config to every worker (javaStr is defined in the test-server section).
for host in hosts[1:]:
    str2file(javaStr,host,f"{trPath}/etc/jvm.config")

In [None]:
# Upload the TPC-DS catalog to every worker.
# The catalog text is defined here because this cell runs before the
# "Connectors & Catalogs" section, where tpsdsStr is assigned the same value;
# without it a top-to-bottom run fails with NameError.
tpsdsStr = "connector.name=tpcds"
for host in hosts[1:]:
    str2file(tpsdsStr,host,f"{trPath}/etc/catalog/tpcds.properties")

### Стартую и останавливаю

In [None]:
# Start Trino on every node, coordinator first (hosts order).
for host in hosts:
    !ssh ubuntu@{host} {trPath}/bin/launcher start

In [11]:
# Print the launcher status of every node.
for host in hosts:
    !ssh ubuntu@{host} {trPath}/bin/launcher status

INFO: Running as 15411
INFO: Running as 1485
INFO: Running as 15556


In [None]:
# Stop the workers first, then the coordinator.
for host in hosts[1:]:
    !ssh ubuntu@{host} {trPath}/bin/launcher stop
!ssh ubuntu@{hosts[0]} {trPath}/bin/launcher stop

## Connectors & Catalogs

### TPC-DS

In [None]:
# Catalog file content for the TPC-DS connector.
tpsdsStr = "connector.name=tpcds"

In [None]:
# Upload the TPC-DS catalog to the coordinator.
str2file(tpsdsStr,hosts[0],f"{trPath}/etc/catalog/tpcds.properties")

In [None]:
# Upload the TPC-DS catalog to every worker.
for host in hosts[1:]:
    str2file(tpsdsStr,host,f"{trPath}/etc/catalog/tpcds.properties")

### Memory

In [None]:
# Catalog file content for the memory connector.
memStr = "connector.name=memory"

In [None]:
# Upload the memory catalog to the coordinator.
str2file(memStr,hosts[0],f"{trPath}/etc/catalog/memory.properties")

In [None]:
# Upload the memory catalog to every worker.
for host in hosts[1:]:
    str2file(memStr,host,f"{trPath}/etc/catalog/memory.properties")

### Postgres

In [None]:
# Catalog file content for the PostgreSQL connector (the password is a placeholder).
psqlStr = (
    "\n"
    "connector.name=postgresql\n"
    "connection-url=jdbc:postgresql://10.150.2.30:5432/spark\n"
    "connection-user=sparkuser\n"
    "connection-password=XXXX\n"
)

In [None]:
# Upload the PostgreSQL catalog to the coordinator.
str2file(psqlStr,hosts[0],f"{trPath}/etc/catalog/postgres.properties")

In [None]:
# Upload the PostgreSQL catalog to every worker.
for host in hosts[1:]:
    str2file(psqlStr,host,f"{trPath}/etc/catalog/postgres.properties")

### Hive metastore service

Инструкцию брал уже свою - http://localhost:8888/lab/workspaces/greenplum/tree/mk_win/Documents/25.DataStand/trino/install.md

Делаю все по ней прям там (на координаторе)

Только перед стартом метастора нужно выполнить:

    export JAVA_HOME=/usr/lib/jvm/jdk-11.0.25-oracle-x64
    export HADOOP_HOME=/opt/hadoop-3.3.4
    /opt/apache-hive-metastore-3.0.0-bin/bin/start-metastore

Есть проблема: Hadoop не умеет работать с Java 23, поэтому для запуска метастора пришлось выбрать Java 11.

### Hive catalog

In [None]:
# Catalog file content for the Hive connector (S3 keys are placeholders).
# The metastore runs only on the coordinator (hosts[0]) while this file is
# copied to every node, so the URI names that host explicitly instead of
# "localhost" - the same way the Iceberg catalog addresses the metastore.
hiveStr = f"""
connector.name=hive
hive.metastore.uri=thrift://{hosts[0]}:9083
fs.native-s3.enabled=true
s3.endpoint=https://storage.yandexcloud.net
s3.aws-access-key=XXXX
s3.aws-secret-key=XXXX
"""

In [None]:
# Upload the Hive catalog to the coordinator.
str2file(hiveStr,hosts[0],f"{trPath}/etc/catalog/hive.properties")

In [None]:
# Upload the Hive catalog to every worker.
for host in hosts[1:]:
    str2file(hiveStr,host,f"{trPath}/etc/catalog/hive.properties")

#### Start HMS

In [None]:
# Shell prefix exporting the Hadoop and Java locations used to start the metastore.
hmsEnv = (
    "export HADOOP_HOME=/opt/hadoop-3.3.4; "
    "export JAVA_HOME=/usr/lib/jvm/jdk-11.0.25-oracle-x64;"
)

In [None]:
# Full remote command: set the environment, then run the metastore in the
# background under nohup with all output redirected to /tmp/hms.log.
hmsCmd = (
    f"{hmsEnv} nohup /opt/apache-hive-metastore-3.0.0-bin/bin/start-metastore"
    " > /tmp/hms.log 2>&1 &"
)

In [None]:
# Run the metastore start command on the coordinator; it is quoted so the
# whole line (exports, redirection, &) is executed by the remote shell.
!ssh ubuntu@{hosts[0]} "{hmsCmd}"

In [15]:
# Show the end of the metastore log on the coordinator.
!ssh ubuntu@{hosts[0]} tail /tmp/hms.log

SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
ERROR StatusLogger No log4j2 configuration file found. Using default configuration: logging only errors to the console. Set system property 'org.apache.logging.log4j.simplelog.StatusLogger.level' to TRACE to show Log4j2 internal initialization logging.
2025-02-13 11:54:04: Starting Metastore Server
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/apache-hive-metastore-3.0.0-bin/lib/log4j-slf4j-impl-2.8.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop-3.3.4/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
2025-02-13 11:54:13,561 main INFO Log4j 

In [None]:
# List java processes on the coordinator (the grep process itself also matches).
!ssh ubuntu@{hosts[0]} 'ps ax | grep java'

### Iceberg catalog

In [None]:
# Catalog file content for the Iceberg connector; it uses the metastore on the
# coordinator (hosts[0]). S3 keys are placeholders.
iceStr = (
    "\n"
    "connector.name=iceberg\n"
    f"hive.metastore.uri=thrift://{hosts[0]}:9083\n"
    "fs.native-s3.enabled=true\n"
    "s3.endpoint=https://storage.yandexcloud.net\n"
    "s3.aws-access-key=XXXX\n"
    "s3.aws-secret-key=XXXX\n"
)

In [None]:
# Upload the Iceberg catalog (named "ice" after the file) to the coordinator.
str2file(iceStr,hosts[0],f"{trPath}/etc/catalog/ice.properties")

In [None]:
# Upload the Iceberg catalog to every worker.
for host in hosts[1:]:
    str2file(iceStr,host,f"{trPath}/etc/catalog/ice.properties")

#### База данных hive

In [None]:
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [None]:
# Connection string for psycopg2.
# NOTE(review): _getDsn is neither defined nor imported in the visible part of
# this notebook - confirm where it comes from before running this cell.
aDsn = _getDsn()

In [None]:
# Create the "hive" database. Autocommit is switched on first because
# PostgreSQL does not allow CREATE DATABASE inside a transaction block.
# The connection stays open; it is closed in the next cell.
conn = psycopg2.connect(aDsn)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
cur.execute("create database hive")

In [None]:
# Close the connection opened for creating the database.
conn.close()

### Kafka

In [None]:
# Create the directory for Kafka table description files on every node.
for host in hosts:
    !ssh ubuntu@{host} mkdir -p {trPath}/etc/kafka

In [None]:
# Catalog file content for the Kafka connector.
kfStr = (
    "\n"
    "connector.name=kafka\n"
    "kafka.nodes=10.150.2.30:9092\n"
    "kafka.table-names=teacher_test,person\n"
    "kafka.hide-internal-columns=true\n"
)

In [None]:
# Upload the Kafka catalog to every node (coordinator and workers).
for host in hosts:
    str2file(kfStr,host,f"{trPath}/etc/catalog/kafka.properties")

In [None]:
# JSON table description for the Kafka topic "person": a BIGINT key plus
# the message fields person_id (BIGINT) and name (VARCHAR), all decoded as JSON.
persStr = """{
  "tableName": "person",
  "schemaName": "default",
  "topicName": "person",
  "key": {
    "dataFormat": "json",
    "fields": [
      {
        "name": "key",
        "type": "BIGINT",
        "mapping": "_key"
      }
    ]
  },
  "message": {
    "dataFormat": "json",
    "fields": [
      {
        "name": "person_id",
        "type": "BIGINT",
        "mapping": "person_id"
      },
      {
        "name": "name",
        "type": "VARCHAR",
        "mapping": "name"
      }
    ]
  }
}
"""

In [None]:
# Upload the "person" table description to the etc/kafka directory on every node.
for host in hosts:
    str2file(persStr,host,f"{trPath}/etc/kafka/person.json")

## Проверка работоспособности

In [12]:
from trsql_h import _psql, _sql, _setSchema, _getSchema

In [14]:
# Run "show catalogs" through the _psql helper imported from trsql_h.
_psql("show catalogs")

In [None]:
# Run the same statement through the _sql helper imported from trsql_h.
_sql("show catalogs")

In [13]:
# List the cluster nodes; the recorded output shows master, worker-1 and worker-2 as active.
_sql("SELECT * FROM system.runtime.nodes")

Unnamed: 0,node_id,http_uri,node_version,coordinator,state
0,master,http://10.150.2.5:8080,468,True,active
1,worker-1,http://10.150.2.6:8080,468,False,active
2,worker-2,http://10.150.2.7:8080,468,False,active


In [None]:
# Smoke-test the TPC-DS catalog by counting rows in sf1.store_sales.
_sql("SELECT count(*) FROM tpcds.sf1.store_sales")

<hr style="border:2px solid #294A70">

<div style="text-align: center"> © ООО «Учебный центр «Коммерсант», 2024 </div>

<div style="text-align: center"> info@bigdataschool.ru, +7(495) 41-41-121 </div>