# Deploying Spark

Without helm

In [None]:
# Recreate and start the `producer` compose service in detached mode, combining
# the two compose files and reading variables from ./env/.env.
!docker-compose -f .\docker\entrypoint.docker-compose.yml  -f .\docker\spark\spark.docker-compose.yml --env-file=./env/.env  up producer -d --force-recreate

# Start the named WSL distribution; <Distributive-Name> is a placeholder that
# must be replaced with a real distribution name before running.
# NOTE(review): each `!` line runs in its own subshell, so the kubectl lines
# below presumably do not execute inside this WSL session — confirm.
!wsl -d <Distributive-Name>
# Create the `spark` namespace, then apply the master manifest into it.
!kubectl create namespace spark
!kubectl apply -f ./docker/kubernetes/deprecated/no-auth-spark-master.yaml -n spark
# NOTE(review): this repeats the previous command with the same manifest;
# presumably a different (e.g. worker) manifest was intended — confirm.
!kubectl apply -f ./docker/kubernetes/deprecated/no-auth-spark-master.yaml -n spark

With helm

```bash
# Recreate and start the `producer` compose service in detached mode, combining
# the two compose files and reading variables from ./env/.env.
docker-compose -f .\docker\entrypoint.docker-compose.yml  -f .\docker\spark\spark.docker-compose.yml --env-file=./env/.env  up producer -d --force-recreate

# Start the named WSL distribution; replace the <Distributive-Name> placeholder
# with a real distribution name.
wsl -d <Distributive-Name>
```

If the Helm release has not been created yet, install it first:
```bash
# Register the Bitnami chart repository under the alias `bitnami`.
helm repo add bitnami https://charts.bitnami.com/bitnami
# Search the added repositories for charts matching the keyword `bitnami`.
helm search repo bitnami
# Install the Spark chart from the OCI registry as release `kayvan-release`.
helm install kayvan-release oci://registry-1.docker.io/bitnamicharts/spark
```


```bash
# Upgrade the existing `kayvan-release` release using the project's values file
# and pin the image tag to 3.5.6.
helm upgrade kayvan-release bitnami/spark \
  -f ./docker/kubernetes/helm/spark-config.yaml \
  --set image.tag=3.5.6 \
  --force
# Apply the additional master Service manifest. The path uses the same
# ./docker/kubernetes/ tree as the values file above (was misspelled "kubernetis").
kubectl apply -f ./docker/kubernetes/helm/spark-master-service.yaml
```

Some diagnostic commands:
```bash
# Services and pods in the `default` namespace.
kubectl get svc -n default
kubectl get pod -n default
# Cluster nodes with the extended column set.
kubectl get nodes -o wide
# Pods in the current namespace together with their labels.
kubectl get pods --show-labels
# HTTP probe of port 30080 on the Docker host
# (presumably the NodePort of the Spark master web UI — confirm).
curl host.docker.internal:30080
```

# Connection tests from inside the Spark pods

Correct:

In [None]:
import os

# Environment must be in place before PySpark launches its JVM gateway.
# HOME is redirected to /tmp (presumably the default home is not writable
# inside the container — confirm).
os.environ["HOME"] = "/tmp"
# Extra spark-submit arguments; the trailing "pyspark-shell" token is required.
os.environ["PYSPARK_SUBMIT_ARGS"] = """
--conf spark.executor.memory=1G
--conf spark.executor.cores=1
pyspark-shell
"""
os.environ["PYSPARK_PYTHON"] = "/opt/bitnami/python/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/opt/bitnami/python/bin/python3"
from pyspark.sql import SparkSession

# 1) Build the SparkSession against the standalone master.
#    The active URL goes through host.docker.internal:30077 (presumably a
#    NodePort mapped to the master's 7077 — confirm); the commented-out
#    alternative addresses the master pod by name on port 7077.
spark = (
    SparkSession.builder
    .appName("standalone-simple")
    # .master("spark://kayvan-release-spark-master-0:7077")
    .master("spark://host.docker.internal:30077")
    .config("spark.jars.ivy", "/tmp/.ivy2")     # set the Ivy directory explicitly
    .config("spark.executor.instances", "2")    # number of executors to launch
    .config("spark.executor.cores", "1")        # one CPU core per executor
    .config("spark.executor.memory", "1g")      # 1 GB of memory per executor
    .config("spark.driver.memory", "1g")        # cap the driver's memory as well

    .config("spark.submit.deployMode", "client")
    # .config("spark.kubernetes.namespace", "spark")
    # .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    # .config("spark.kubernetes.container.image", "bitnami/spark:latest")

    # .config("spark.driver.host", "host.docker.internal")
    # .config("spark.driver.host",         "spark-master")

    # Listen on all interfaces and use fixed driver / block-manager ports.
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.port", "45555")
    .config("spark.blockManager.port", "45556")

    # Path to python3 inside the container, for the driver and the executors.
    .config("spark.pyspark.python", "/opt/bitnami/python/bin/python3")
    .config("spark.executorEnv.PYSPARK_PYTHON", "/opt/bitnami/python/bin/python3")

    .getOrCreate()
)

# 2) Simple smoke test. The session is always stopped, even when the job fails,
#    so that a later getOrCreate() does not silently reuse a stale session.
try:
    df = spark.range(1_000)                     # DataFrame with ids 0..999
    result = df.selectExpr("sum(id) as total")  # sum the "id" column
    result.show()                               # expected total: 499500
finally:
    spark.stop()

Bad:

In [None]:
import os
from pyspark.sql import SparkSession

# Negative example, kept for reference: only spark.kubernetes.* options and the
# executor count are set; no master URL is configured in this attempt.
# NOTE(review): without a master, PySpark presumably falls back to local mode,
# so the Kubernetes options would have no effect — confirm.
#
# Settings that were tried and left disabled in this attempt:
#   driver_host = "localhost"
#   os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
#   os.environ["KUBECONFIG"] = os.path.join(ROOT_DIR, "k8s-creds", 'config')
#   os.environ["PYSPARK_SUBMIT_ARGS"] with
#       --conf spark.executor.memory=1G --conf spark.executor.cores=1 pyspark-shell
#   .master("spark://localhost:7077")
#   .config("spark.driver.host", "localhost")
session_options = {
    "spark.kubernetes.container.image": "bitnami/spark:latest",
    "spark.kubernetes.namespace": "spark",
    "spark.kubernetes.authenticate.driver.serviceAccountName": "spark",
    "spark.executor.instances": "2",
}

builder = SparkSession.builder.appName("k8s-test-app")
for option_name, option_value in session_options.items():
    builder = builder.config(option_name, option_value)
spark = builder.getOrCreate()

# Smoke test: sum the ids 0..999 and print the result.
df = spark.range(1000)
df.selectExpr("sum(id)").show()

spark.stop()


In [None]:
import os

# Environment must be in place before PySpark launches its JVM gateway.
os.environ["HOME"] = "/tmp"
# os.environ["KUBECONFIG"] = os.path.join('root', 'kube','config', 'config')
# os.environ["KUBECONFIG"] = os.path.join('k8s-creds', 'config')
# Extra spark-submit arguments; the trailing "pyspark-shell" token is required.
os.environ["PYSPARK_SUBMIT_ARGS"] = """
--conf spark.executor.memory=1G
--conf spark.executor.cores=1
pyspark-shell
"""

# Imported outside the try block so that a missing PySpark installation fails
# here, instead of surfacing later as a NameError on `SparkSession`.
from pyspark.sql import SparkSession

# Best-effort cleanup of a session left over from an earlier cell. Only
# `Exception` is caught so that Ctrl-C / kernel interrupt still works, and the
# failure is reported instead of being silently discarded.
try:
    SparkSession.builder.getOrCreate().stop()
except Exception as exc:
    print(f"Could not stop a previous Spark session: {exc!r}")

# Attempt kept as a negative example: standalone master addressed by the
# in-cluster name `spark-master`, combined with spark.kubernetes.* options.
spark = (
    SparkSession.builder
    .appName("jupyter-on-spark")
    .master("spark://spark-master:7077")
    # .master("spark://https://kubernetes.default.svc:443")
    .config("spark.submit.deployMode", "client")
    .config("spark.kubernetes.namespace", "spark")
    .config("spark.kubernetes.container.image", "bitnami/spark:latest")
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.jars.ivy", "/tmp/.ivy2")  # set the Ivy directory explicitly
    .getOrCreate()
)

# Smoke test; the session is stopped even if the job fails, so that later cells
# do not reuse a stale session.
try:
    df = spark.range(1000)
    df.selectExpr("sum(id)").show()
finally:
    spark.stop()

In [None]:

# Disabled word-count experiment, kept for reference only: every line in this
# cell is commented out, so nothing here executes.
# NOTE(review): `spark.read.text(...)` below is given a literal sentence;
# presumably it expects a file path, so a real input file would be needed
# before re-enabling this — confirm.

# import sys
# from operator import add
#
# from pyspark.sql import SparkSession
#
#
# spark = SparkSession\
#         .builder\
#         .appName("PythonWordCount")\
#      .master("spark://spark-master:7077")\
#         .getOrCreate()
#
# lines = spark.read.text("Привет Привет привет").rdd.map(lambda r: r[0])
# counts = lines.flatMap(lambda x: x.split(' ')) \
#                   .map(lambda x: (x, 1)) \
#                   .reduceByKey(add)
# output = counts.collect()
# for (word, count) in output:
#     print("%s: %i" % (word, count))
#
# spark.stop()

# Connection test from the Jupyter Docker image

Correct:

In [None]:
import os

# Project root inside the Jupyter container; the working directory is moved
# there so that relative paths resolve against it.
ROOT_DIR = '/workspace/NN'
os.chdir(ROOT_DIR)
from pyspark.sql import SparkSession

# driver_host = "localhost"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["KUBECONFIG"] = os.path.join(ROOT_DIR, "k8s-creds", 'config')
# Extra spark-submit arguments; the trailing "pyspark-shell" token is required.
os.environ["PYSPARK_SUBMIT_ARGS"] = """
--conf spark.executor.memory=1G
--conf spark.executor.cores=1
pyspark-shell
"""

# Working configuration: standalone master reached through
# host.docker.internal:30077 (presumably a NodePort mapped to the master's
# 7077 — confirm). The driver advertises itself as host.docker.internal and
# uses fixed ports.
spark = (
    SparkSession.builder
    .appName("k8s-test-app")
    # .master("k8s://https://kubernetes.docker.internal:6443")
    .master("spark://host.docker.internal:30077")
    .config("spark.kubernetes.container.image", "bitnami/spark:latest")
    .config("spark.kubernetes.namespace", "default")
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")

    .config("spark.executor.instances", "2")    # number of executors to launch
    .config("spark.executor.cores", "1")        # one CPU core per executor
    .config("spark.executor.memory", "1g")      # 1 GB of memory per executor
    .config("spark.driver.memory", "1g")        # cap the driver's memory as well

    # .config("spark.kubernetes.authenticate.submission.caCertFile",  os.path.join(ROOT_DIR, 'k8s-creds', "ca.crt"))
    # .config("spark.kubernetes.authenticate.submission.clientKeyFile",  os.path.join(ROOT_DIR, 'k8s-creds', "client.key"))
    # .config("spark.kubernetes.authenticate.submission.clientCertFile",  os.path.join(ROOT_DIR, 'k8s-creds', "client.crt"))
    # .config("spark.kubernetes.executor.podNamePrefix", "spark-exec")

    .config("spark.driver.host", "host.docker.internal")
    .config("spark.jars.ivy", "/tmp/.ivy2")  # set the Ivy directory explicitly

    # .config("spark.submit.deployMode", "cluster")

    # Listen on all interfaces and use fixed driver / block-manager ports.
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.port", "45555")
    .config("spark.blockManager.port", "45556")

    # .config("spark.driver.bindAddress", "0.0.0.0")
    # .config("spark.eventLog.enabled", "true")
    # .config("spark.ui.showConsoleProgress", "true")
    # .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/path/to/log4j.properties")
    .getOrCreate()
)


# Alternative that targets the Kubernetes API server directly (disabled):
# spark = (SparkSession.builder
#              .appName("k8s-test-app")
#     .master("k8s://https://kubernetes.docker.internal:6443")
#
#     # image and namespace
#     .config("spark.kubernetes.container.image", "bitnami/spark:latest")
#     .config("spark.kubernetes.namespace", "spark")
#     .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
# # .config("spark.kubernetes.client.watch.allowWatchBookmarks", "false")
#     .config("spark.submit.deployMode", "client")
#     # .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
#
#
#     # resources
#     .config("spark.executor.instances", "2")
#
#     # driver: advertised address (for executors) and bind address (where to listen)
#     # .config("spark.driver.host", "host.docker.internal")
#     # .config("spark.driver.bindAddress", "0.0.0.0")
#     #
#     # # fixed ports, if needed
#     # .config("spark.driver.port", "45555")
#     # .config("spark.blockManager.port", "45556")
#
#     .getOrCreate())

# Smoke test; the session is stopped even if the job fails, so that later cells
# do not silently reuse a stale session through getOrCreate().
try:
    df = spark.range(1000)
    df.selectExpr("sum(id)").show()
finally:
    spark.stop()


Bad:

In [None]:
import os

from pyspark.sql import SparkSession

# Defined here as well (same value as in the earlier cell) so that this cell
# also runs on a fresh kernel instead of relying on leftover state.
ROOT_DIR = '/workspace/NN'

driver_host = "localhost"  # not referenced anywhere in this cell
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["KUBECONFIG"] = os.path.join(ROOT_DIR, "k8s-creds", 'config')
# Extra spark-submit arguments; the trailing "pyspark-shell" token is required.
os.environ["PYSPARK_SUBMIT_ARGS"] = """
--conf spark.executor.memory=1G
--conf spark.executor.cores=1
pyspark-shell
"""

# Attempt kept as a negative example: Kubernetes API server as master with
# deploy mode "cluster".
# NOTE(review): a session built in-process from a notebook presumably cannot
# use cluster deploy mode — confirm that this is why the attempt fails.
spark = (
    SparkSession.builder
    .appName("k8s-test-app")
    .master("k8s://https://kubernetes.docker.internal:6443")
    .config("spark.kubernetes.container.image", "bitnami/spark:latest")
    .config("spark.kubernetes.namespace", "spark")
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.submit.deployMode", "cluster")
    .config("spark.executor.instances", "2")
    .getOrCreate()
)

# Smoke test; the session is stopped even if the job fails.
try:
    df = spark.range(1000)
    df.selectExpr("sum(id)").show()
finally:
    spark.stop()


In [None]:
from pyspark.sql import SparkSession

# External IP or DNS name of the node / LoadBalancer (replace with your own).
# Neither value is referenced by the session builder below.
spark_master_ip = 'host.docker.internal'
spark_master_port = "7077"

# Attempt kept as a negative example: the Kubernetes API server is used as the
# master, with the driver advertised as host.docker.internal.
spark = (
    SparkSession.builder
    .appName("test-spark-k8s")
    .master("k8s://https://kubernetes.docker.internal:6443")
    .config("spark.driver.host", "host.docker.internal")
    .config("spark.executor.instances", "2")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.kubernetes.namespace", "spark")
    .config("spark.kubernetes.container.image", "bitnami/spark:latest")
    .getOrCreate()
)

# Smoke test: sum the ids 0..999 and print the result.
df = spark.range(1000)
df.selectExpr("sum(id)").show()

spark.stop()


In [None]:
# HTTP probe of port 10005 on the Docker host.
# NOTE(review): the notebook does not say which service listens there — confirm.
!curl http://host.docker.internal:10005/