6 changes: 6 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,6 @@
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "daily"
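
This Dependabot config only covers the GitHub Actions used by the workflows; sbt is not a Dependabot ecosystem, which is why library updates are delegated to Scala Steward in the next file. If the Docker base image should be tracked as well, an extra entry could look like the following sketch (the `docker` ecosystem is real, but this entry is an illustrative assumption, not part of this PR):

```yaml
  # Hypothetical addition: track base-image updates for docker/Dockerfile
  - package-ecosystem: "docker"
    directory: "/docker"
    schedule:
      interval: "weekly"
```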
16 changes: 16 additions & 0 deletions .github/workflows/check-dependencies-updates.yml
@@ -0,0 +1,16 @@
on:
  schedule:
    - cron: '0 6 * * 1-5'

name: 🍄 Check dependency updates

permissions:
  contents: write
  pull-requests: write

jobs:
  scala-steward:
    runs-on: ubuntu-22.04
    name: Check Scala project dependency updates with Scala Steward
    steps:
      - uses: scala-steward-org/scala-steward-action@v2
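
As committed, the step relies entirely on the action's defaults. Scala Steward normally needs a token with permission to open pull requests; a minimal sketch, assuming the action's `github-token` input and a repository secret named `REPO_GITHUB_TOKEN` (neither is part of this PR):

```yaml
    steps:
      - uses: scala-steward-org/scala-steward-action@v2
        with:
          github-token: ${{ secrets.REPO_GITHUB_TOKEN }}
```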
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,25 @@
name: CI

on:
  push:
    branches:
      - main
  pull_request:

permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-java@v4
        with:
          distribution: 'zulu'
          java-version: '21'
          cache: 'sbt'
      - name: 👌 Run "pre-push" tasks (compile and style-check)
        run: sbt prep
      - name: ✅ Run tests
        run: sbt test
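
The two sbt steps mirror the pre-push hook added below in doc/hooks/pre-push, so CI and the local gate stay in sync; the local equivalent is simply:

```sh
sbt prep && sbt test
```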
16 changes: 16 additions & 0 deletions .github/workflows/update-github-dependency-graph.yml
@@ -0,0 +1,16 @@
name: Update GitHub Dependency Graph

on:
  push:
    branches:
      - main

permissions:
  contents: write

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: scalacenter/sbt-dependency-submission@v3
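
This action resolves the sbt build and submits its dependency graph to GitHub, so Dependabot alerts also cover library vulnerabilities, not just Actions. If the sbt build ever moves out of the repository root, the action can be pointed at it; a sketch assuming its `working-directory` input (the path is illustrative):

```yaml
      - uses: scalacenter/sbt-dependency-submission@v3
        with:
          working-directory: ./my-scala-project
```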
10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
.bsp/
target/
boot/
lib_managed/
src_managed/
project/plugins/project/

/docker/spark/data/
/docker/volume/
/docker/spark/apps/
11 changes: 11 additions & 0 deletions .scalafmt.conf
@@ -0,0 +1,11 @@
version = 3.8.2
runner.dialect = scala213
style = default
maxColumn = 120
continuationIndent.callSite = 2
align.preset = more
runner.optimizer.forceConfigStyleMinArgCount = 1
rewrite.rules = [SortImports]
importSelectors = singleLine
project.excludeFilters = ["target/"]
project.git = true # Only format files tracked by git
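
For illustration, the effect of `rewrite.rules = [SortImports]`: selectors inside an import are reordered into scalafmt's groups (symbols, then lower case, then upper case). A sketch, not output from this exact config:

```scala
// before
import foo.{Zilch, bar, Random, sand}
// after SortImports
import foo.{bar, sand, Random, Zilch}
```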
17 changes: 17 additions & 0 deletions build.sbt
@@ -0,0 +1,17 @@
Settings.settings

libraryDependencies := Dependencies.all

SbtAliases.aliases.flatMap { case (alias, command) =>
  addCommandAlias(alias, command)
}

assembly / mainClass := Some(
  "com.codely.lesson_04_how_to_deploy_spark.video_01__deploy_application.DeploySparkApp"
)

import sbtassembly.MergeStrategy
assembly / assemblyMergeStrategy := {
  case PathList("org", "apache", "spark", "unused", "UnusedStubClass.class") => MergeStrategy.first
  case x => (assembly / assemblyMergeStrategy).value(x)
}
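
`Settings.settings`, `Dependencies.all`, and `SbtAliases.aliases` live under project/ and are not part of this diff (nor is the project/plugins.sbt that must wire in sbt-assembly). A hedged sketch of what the referenced helpers might contain; every name and version below is an assumption inferred from how they are used above:

```scala
// project/Dependencies.scala (hypothetical)
import sbt._

object Dependencies {
  private val sparkVersion = "3.5.0" // assumed to match the Spark version pinned in docker/Dockerfile

  val all: Seq[ModuleID] = Seq(
    "org.apache.spark" %% "spark-sql" % sparkVersion % Provided, // Provided: the cluster supplies Spark
    "org.scalatest"    %% "scalatest" % "3.2.18"     % Test
  )
}

// project/SbtAliases.scala (hypothetical)
object SbtAliases {
  val aliases: Seq[(String, String)] = Seq(
    "prep" -> ";compile;scalafmtCheckAll" // "compile and style-check", per the CI step name
  )
}
```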
Empty file added conf/.gitkeep
Empty file.
8 changes: 8 additions & 0 deletions doc/hooks/install-hooks.sh
@@ -0,0 +1,8 @@
#!/bin/sh

cd "$(dirname "$0")/../.."

rm -rf .git/hooks

ln -s ../doc/hooks .git/hooks
sudo chmod -R 777 doc/hooks/*
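
The script resolves paths relative to its own location, so after cloning, the hooks are enabled by running it once from anywhere in the checkout:

```sh
./doc/hooks/install-hooks.sh
```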
50 changes: 50 additions & 0 deletions doc/hooks/pre-push
@@ -0,0 +1,50 @@
#!/bin/bash

# Checks that locally staged changes are formatted properly, ignoring non-staged changes.
# Install it with the `install-hooks.sh` script.
# Based on: https://gist.github.com/cvogt/2676ed6c6d1abafa3d6a

PATH=$PATH:/usr/local/bin:/usr/local/sbin

echo ""
echo "Running pre-push hook… (you can skip this with --no-verify, but don't)"

echo "* Moving to the project directory…"
_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
DIR=$( echo "$_DIR" | sed 's/\/.git\/hooks$//' )
cd "$DIR" || exit 1

echo "* Stashing non-staged changes so we avoid checking them…"
git diff --quiet
hadNoNonStagedChanges=$?

if ! [ $hadNoNonStagedChanges -eq 0 ]
then
  git stash --keep-index -u > /dev/null
fi

echo "* Checking pre-push conditions ('prep' SBT task)…"
sbt prep > /dev/null
canPush=$?

if [ $canPush -ne 0 ]
then
  echo "  [KO] Error :("
fi

echo "* Applying the stash with the non-staged changes…"
if ! [ $hadNoNonStagedChanges -eq 0 ]
then
  sleep 1 && git stash pop --index > /dev/null & # sleep because otherwise commit fails when this leads to a merge conflict
fi

# Final result
echo ""

if [ $canPush -eq 0 ]
then
  echo "[OK] Your code will be pushed young Padawan"
  exit 0
else
  echo "[KO] Cancelling push due to compilation or code style errors (run 'sbt prep' for more information)"
  exit 1
fi
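
Once installed, the hook can be exercised without actually pushing by invoking the symlinked script directly:

```sh
.git/hooks/pre-push
```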
8 changes: 8 additions & 0 deletions docker/.env
@@ -0,0 +1,8 @@
AWS_ACCESS_KEY_ID=test
AWS_SECRET_ACCESS_KEY=test
LOCALSTACK_PORT=4566
S3_BUCKET=my-bucket
S3_PREFIX=data
POSTGRES_USER=admin
POSTGRES_PASSWORD=secret
POSTGRES_DB=metastore
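
These values are presumably consumed by a Docker Compose setup that is not part of this diff. A hypothetical fragment showing how Compose would pick them up (Compose reads a sibling .env file automatically; the service names and images below are assumptions):

```yaml
services:
  localstack:
    image: localstack/localstack
    ports:
      - "${LOCALSTACK_PORT}:4566"
  postgres:
    image: postgres:16
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
```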
61 changes: 61 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Use a lightweight JDK base image
FROM openjdk:11.0.11-jre-slim-buster AS base-stage

# Install only the necessary dependencies
RUN apt-get update && \
    apt-get install -y curl wget ca-certificates software-properties-common ssh net-tools && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Set Spark and Hadoop versions as environment variables
ENV SPARK_VERSION=3.5.0 \
    HADOOP_VERSION=3 \
    SPARK_HOME=/opt/spark

# Download and install Apache Spark
RUN wget --no-verbose -O apache-spark.tgz "https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz" \
    && mkdir -p /opt/spark \
    && tar -xf apache-spark.tgz -C /opt/spark --strip-components=1 \
    && rm apache-spark.tgz

# Download the additional JARs needed (S3/AWS connectors, Delta Lake, Hive support)
RUN wget -P /opt/spark/jars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/3.3.1/hadoop-common-3.3.1.jar \
    && wget -P /opt/spark/jars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar \
    && wget -P /opt/spark/jars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.2/hadoop-aws-3.2.2.jar \
    && wget -P /opt/spark/jars https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/3.1.0/delta-spark_2.12-3.1.0.jar \
    && wget -P /opt/spark/jars https://repo1.maven.org/maven2/org/apache/spark/spark-hive_2.12/3.5.0/spark-hive_2.12-3.5.0.jar \
    && wget -P /opt/spark/jars https://repo1.maven.org/maven2/io/delta/delta-storage/3.1.0/delta-storage-3.1.0.jar


# Set up the next stage for the actual Spark master and worker setup
FROM base-stage AS spark-cluster-setup

# Define the working directory
WORKDIR /opt/spark

# Set environment variables for Spark master and worker configuration
ENV SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080 \
    SPARK_LOG_DIR=/opt/spark/logs \
    SPARK_MASTER_LOG=/opt/spark/logs/spark-master.out \
    SPARK_WORKER_LOG=/opt/spark/logs/spark-worker.out \
    SPARK_WORKER_WEBUI_PORT=8080 \
    SPARK_WORKER_PORT=7000 \
    SPARK_MASTER="spark://spark-master:7077" \
    SPARK_WORKLOAD="master"

# Expose the ports used by the Spark master and worker
EXPOSE 8080 7077 7000

# Set up log directories and link logs to stdout for easier container log management
RUN mkdir -p $SPARK_LOG_DIR && \
    touch $SPARK_MASTER_LOG && \
    touch $SPARK_WORKER_LOG && \
    ln -sf /dev/stdout $SPARK_MASTER_LOG && \
    ln -sf /dev/stdout $SPARK_WORKER_LOG

# Copy the start script to the container
COPY spark/run.sh /

# Set the command to start the Spark cluster
CMD ["/bin/bash", "/run.sh"]
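
The entrypoint script spark/run.sh is copied into the image but not included in this diff. A hypothetical sketch of what it might contain, dispatching on the SPARK_WORKLOAD variable that the ENV block defaults to "master":

```bash
#!/bin/bash
# Hypothetical spark/run.sh: start a master or a worker depending on SPARK_WORKLOAD.
if [ "$SPARK_WORKLOAD" = "master" ]; then
  exec "$SPARK_HOME/bin/spark-class" org.apache.spark.deploy.master.Master \
    --port "$SPARK_MASTER_PORT" --webui-port "$SPARK_MASTER_WEBUI_PORT"
elif [ "$SPARK_WORKLOAD" = "worker" ]; then
  # The worker registers against the master URL set in SPARK_MASTER.
  exec "$SPARK_HOME/bin/spark-class" org.apache.spark.deploy.worker.Worker \
    --port "$SPARK_WORKER_PORT" --webui-port "$SPARK_WORKER_WEBUI_PORT" "$SPARK_MASTER"
else
  echo "Unknown SPARK_WORKLOAD: $SPARK_WORKLOAD" >&2
  exit 1
fi
```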