ARG ARG_WORKSPACE_VERSION="latest"
# Build from the R flavor of the workspace with the same version
FROM mltooling/ml-workspace-r:$ARG_WORKSPACE_VERSION
ARG ARG_WORKSPACE_FLAVOR="spark"
ENV WORKSPACE_FLAVOR=$ARG_WORKSPACE_FLAVOR
# the argument needs to be initialized again because FROM resets build args
ARG ARG_WORKSPACE_VERSION="latest"
ENV WORKSPACE_VERSION=$ARG_WORKSPACE_VERSION
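# Build-time sketch (tag and version are assumptions), showing how the ARGs above
# are supplied; ARG values declared before FROM are reset after it, hence the
# re-declaration:
#   docker build \
#     --build-arg ARG_WORKSPACE_VERSION=0.9.1 \
#     --build-arg ARG_WORKSPACE_FLAVOR=spark \
#     -t mltooling/ml-workspace-spark:0.9.1 .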
# Inspirations:
# https://github.com/jupyter/docker-stacks/blob/master/all-spark-notebook/Dockerfile
# Install Scala
RUN \
apt-get update && \
apt-get install -y scala
# default-jdk could be added here if the base image did not already provide a JDK
# Install Spark
ENV \
SPARK_VERSION="2.4.4" \
HADOOP_VERSION="2.7"
RUN \
# https://github.com/jupyter/docker-stacks/blob/master/pyspark-notebook/Dockerfile#L18
# archive.apache.org keeps every release; the www-eu.apache.org/dist mirror drops old versions like 2.4.4
wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -O ./spark.tar.gz && \
tar xfz spark.tar.gz -C /usr/local --owner root --group root --no-same-owner && \
rm spark.tar.gz && \
# link spark folder to /usr/local/spark
ln -s /usr/local/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /usr/local/spark && \
# Cleanup
clean-layer.sh
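# Optional sanity check for the layer above (not part of the original build):
#   /usr/local/spark/bin/spark-submit --version   # should report Spark 2.4.4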
# Install Zeppelin
RUN \
/bin/bash $RESOURCES_PATH/tools/zeppelin.sh --install && \
# Cleanup
clean-layer.sh
# Install spark python libraries
RUN \
# pyspark
pip install --no-cache-dir \
findspark \
pyarrow \
# downgrades sklearn: spark-sklearn \
# Improved performance: https://github.com/mozilla/jupyter-spark
lxml \
# Spylon - Jupyter kernel for scala and spark
spylon-kernel && \
# Deprecated: jupyter-spark: https://github.com/mozilla/jupyter-spark
# jupyter serverextension enable --py jupyter_spark && \
# jupyter nbextension install --py jupyter_spark && \
# jupyter nbextension enable --py jupyter_spark && \
# python -m spylon_kernel install
# Install Jupyter kernels
# Install beakerX? https://github.com/twosigma/beakerx
# Cleanup
clean-layer.sh
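# Usage sketch (hypothetical notebook cell) for the libraries above; findspark
# resolves SPARK_HOME so that pyspark can be imported outside of spark-submit:
#   import findspark; findspark.init()
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.master("local[*]").getOrCreate()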
# Install Sparkmagic: https://github.com/jupyter-incubator/sparkmagic
RUN \
pip install --no-cache-dir sparkmagic && \
jupyter serverextension enable --py sparkmagic && \
# Cleanup
clean-layer.sh
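# Sparkmagic executes code on a remote Livy server rather than the local Spark
# install; a minimal ~/.sparkmagic/config.json sketch (the Livy URL is an assumption):
#   { "kernel_python_credentials": { "url": "http://livy-server:8998" } }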
# Install Apache Toree Kernel: https://github.com/apache/incubator-toree
RUN \
pip install --no-cache-dir toree && \
jupyter toree install --sys-prefix && \
# Cleanup
clean-layer.sh
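# After installation, the Toree Scala kernel should show up alongside the others:
#   jupyter kernelspec list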
# Install Spark Monitor: https://github.com/krishnan-r/sparkmonitor
RUN \
# git+https://github.com/krishnan-r/sparkmonitor#subdirectory=extension && \
pip install --no-cache-dir sparkmonitor && \
jupyter nbextension install sparkmonitor --py --user --symlink && \
jupyter nbextension enable sparkmonitor --py --user && \
jupyter serverextension enable --py --user sparkmonitor && \
ipython profile create && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py && \
# Cleanup
clean-layer.sh
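# Usage sketch, based on the sparkmonitor README (assumption: the kernel extension
# injects a SparkConf named `conf` into the notebook namespace):
#   from pyspark import SparkContext
#   sc = SparkContext.getOrCreate(conf=conf)   # monitor attaches to this context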
# R Spark support
# SPARK_HOME is only defined further below, so the path has to be spelled out here
ENV R_LIBS_USER /usr/local/spark/R/lib
RUN conda install --quiet --yes 'r-sparklyr' && \
# Cleanup
clean-layer.sh
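# Usage sketch in R via the standard sparklyr API, connecting to the local install:
#   library(sparklyr)
#   sc <- spark_connect(master = "local", spark_home = Sys.getenv("SPARK_HOME"))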
### CONFIGURATION ###
ENV \
# PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS / HADOOP_HOME / HADOOP_CLASSPATH / SPARK_DIST_CLASSPATH
# export HADOOP_HOME=~/hadoop-2.7.0
# export PATH=$HADOOP_HOME/bin:$PATH
# export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
# export HADOOP_CLASSPATH=$HADOOP_HOME/share/hadoop/tools/lib/*
# export SPARK_DIST_CLASSPATH=`hadoop classpath`
# export PYSPARK_DRIVER_PYTHON="jupyter"
# export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
# HADOOP_CONF_DIR=/usr/lib/hadoop
SPARK_HOME="/usr/local/spark" \
PYSPARK_PYTHON="python3" \
SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH \
PATH=$SPARK_HOME/bin:$PATH \
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
PYTHONHASHSEED=0
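# With the PATH and PYSPARK_PYTHON settings above, a local session can be started
# directly from a workspace terminal (sketch, not part of the image build):
#   pyspark --master local[*]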
RUN \
# spark events dir
mkdir -p /tmp/spark-events
# https://medium.com/@marcovillarreal_40011/creating-a-spark-standalone-cluster-with-docker-and-docker-compose-ba9d743a157f
# ENV SPARK_MASTER_PORT 7077
# ENV SPARK_MASTER_WEBUI_PORT 8080
# ENV SPARK_WORKER_WEBUI_PORT 8081
# ENV SPARK_MASTER_LOG /spark/logs
# ENV SPARK_WORKER_LOG /spark/logs
# CMD ["/bin/bash", "/start-master.sh"]
# export SPARK_MASTER_HOST=`hostname`
# SPARK_WORKER_CORES=1
# SPARK_WORKER_MEMORY=1G
# SPARK_DRIVER_MEMORY=128m
# SPARK_EXECUTOR_MEMORY=256m
# TODO start spark master?
# TODO configure spark ui to be proxied with base path:
# https://stackoverflow.com/questions/45971127/wrong-css-location-of-spark-application-ui
# https://github.com/jupyterhub/jupyter-server-proxy/issues/57
# https://github.com/yuvipanda/jupyter-sparkui-proxy/blob/master/jupyter_sparkui_proxy/__init__.py
# Todo: Add additional spark configuration:
# https://spark.apache.org/docs/latest/configuration.html
# https://zeppelin.apache.org/docs/latest/interpreter/spark.html
# Add default configuration
COPY resources/spark-defaults.conf ${SPARK_HOME}/conf/
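# The copied file itself is not shown here; a plausible sketch (assumption) that
# would match the /tmp/spark-events directory created above:
#   spark.eventLog.enabled  true
#   spark.eventLog.dir      file:///tmp/spark-events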
# Add supervisor config to start zeppelin on port 8072
COPY resources/zeppelin-service.conf /etc/supervisor/conf.d/
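# Hypothetical sketch of such a supervisor entry (the actual file lives in
# resources/, and the zeppelin install path is an assumption):
#   [program:zeppelin]
#   command=/opt/zeppelin/bin/zeppelin.sh
#   autostart=false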
# Add spark tutorials
COPY resources/tutorials $RESOURCES_PATH/tutorials/tutorials
# Overwrite & add Labels
ARG ARG_BUILD_DATE="unknown"
ARG ARG_VCS_REF="unknown"
LABEL \
"workspace.version"=$WORKSPACE_VERSION \
"workspace.flavor"=$WORKSPACE_FLAVOR \
"workspace.baseimage"=mltooling/ml-workspace:$WORKSPACE_VERSION \
"org.opencontainers.image.version"=$WORKSPACE_VERSION \
"org.opencontainers.image.revision"=$ARG_VCS_REF \
"org.opencontainers.image.created"=$ARG_BUILD_DATE \
"org.label-schema.version"=$WORKSPACE_VERSION \
"org.label-schema.vcs-ref"=$ARG_VCS_REF \
"org.label-schema.build-date"=$ARG_BUILD_DATE