# Proof of Concept - Read from MySQL Database

## Setup Database Server
```
docker run --detach --name=mysql-db --env="MYSQL_ROOT_PASSWORD=datascience" mysql
```

This will use the [official mysql docker image](https://hub.docker.com/_/mysql/) to create a database server container with root password 'datascience'.
Capture the IP address of the DB server container with `docker inspect mysql-db | grep -i 'ipaddress'`.
This will yield something like:
```
            "SecondaryIPAddresses": null,
            "IPAddress": "172.17.0.8",
                    "IPAddress": "172.17.0.8",
```

## Command Line Client
To create a database and add data, access the MySQL server from the command line by creating another mysql container linked to the server container.
This container will only exist until you exit the mysql command prompt.
Assume the data you want to load into the database is in `/home/cownby/data/`.
Run the mysql command-line client:
```
export MYSQL_PORT_3306_TCP_ADDR="172.17.0.8:6603"
export MYSQL_ROOT_PASSWORD="datascience"
export MYSQL_SERVER_CONTAINER="mysql-db"

docker run -it -v "/home/cownby/data/":"/data" --link "$MYSQL_SERVER_CONTAINER":mysql --rm mysql sh -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ROOT_PASSWORD"'

```
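You will be prompted for the root password again: the variables exported above live in the host shell, not in the client container, so `$MYSQL_ROOT_PASSWORD` expands to an empty string inside the single-quoted `sh -c` command (the `--link` option exposes the server's value as `MYSQL_ENV_MYSQL_ROOT_PASSWORD` instead).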

### SQL Commands

```SQL
/*  
Assume volume is mounted to `/data` and data is in file `supers.csv`.

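sample data (the header line names 6 columns, but each data row has 7 fields:
a leading 0 for the auto-increment key_id column, followed by the 6 named values):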
id,firstName,lastName,superpower,goodDeeds,maxSpeed
0,100,carolyn,ownby,smile,72,15.5
0,200,krizia,conrad,good food!,80,25.7
0,300,jordan,dick,roadster,30,80.25
0,400,dixon,dick,vision,65,56.8

*/

create database giskard;
use giskard;
create table supers (
	key_id      int NOT NULL AUTO_INCREMENT UNIQUE,
	import_id   int,
	first_name  varchar(20),
	last_name   varchar(20),
	super_power varchar(20),
	good_deeds  int,
	max_speed   float
);

load data local infile '/data/supers.csv'
      into table supers
      fields terminated by ','
      lines terminated by '\n'
      ignore 1 rows;
```

## References

* [Excel to MySQL](http://www.prcconsulting.net/2016/10/migrating-an-excel-spreadsheet-to-mysql-and-to-spark-2-0-1-part-1/)  
This is NOT done programmatically, but by simply exporting an Excel sheet as CSV and importing it into MySQL.
* [Spark & MySQL](http://www.prcconsulting.net/wp-content/uploads/2016/10/Connect_MySQL.py_-1.html)
* [Spark & database properties](http://spark.apache.org/docs/latest/sql-programming-guide.html#jdbc-to-other-databases)
* [tutorial](http://severalnines.com/blog/mysql-docker-containers-understanding-basics)
* [official mysql docker repository](https://hub.docker.com/_/mysql/)


##  Test Notes
Create a new container linked to the MySQL server:

```bash
sudo docker run -d -p 8826:8888  -e GRANT_SUDO=yes --user root --net=hadoop --name=spark-mysql --link mysql-db:spark-mysql  cotest2-image  start-notebook.sh 

docker run -it -v "/home/ubuntu/carolyn/data":"/data" --link mysql-db:mysql --rm mysql sh -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ROOT_PASSWORD"' 



 #local VM:
 docker run -d -p 8826:8888  -e GRANT_SUDO=yes --user root --net=hadoop --name=cotest cotest-image  start-notebook.sh

 
 time docker build -t cotest-image . 1>build.log 2>build.err  ;ll
 
 docker run -d -p 8826:8888  -e GRANT_SUDO=yes --user root --net=hadoop --name=cotest-mysql --link mysql-db:cotest-mysql  cotest-image  start-notebook.sh
 
 docker stop cotest-mysql;docker rm cotest-mysql;docker rmi cotest-image
 
 find / -name *.jar 2>&1 | grep -v "Permission denied" | grep -i mysql
 root@96e60e2c77f8:~/work# find / -name *.jar 2>&1 | grep -v "Permission denied" | grep -i mysql      
/usr/share/maven-repo/mysql/mysql-connector-java/5.1.39/mysql-connector-java-5.1.39.jar
/usr/share/maven-repo/mysql/mysql-connector-java/debian/mysql-connector-java-debian.jar
/usr/share/java/mysql.jar
/usr/share/java/mysql-connector-java-5.1.39.jar
/usr/share/java/mysql-connector-java.jar


```


In [1]:

```python
# Basic setup required in all notebooks
 
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import SparkContext, SparkConf

# Define an arbitrary application name & stand-alone Spark cluster
appName='archetest'
master='local[*]' #local spark-master

# Explicitly define python 2 since we have both versions 2 & 3 installed
os.environ['PYSPARK_PYTHON'] = '/opt/conda/envs/python2/bin/python'

# Create spark context with which we will reference the Spark API
spark = (SparkSession
         .builder
         .master(master)
         .config(conf=SparkConf())  
         .appName(appName)
         .getOrCreate()
        )

spark.version
```

```
u'2.0.2'
```

In [None]:

```python
#conf.setJars()
#.setjars(Array("/usr/local/spark-2.0.0-bin-hadoop2.7/jars"))

sys.version
#example
#3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]

#/usr/share/java/mysql-connector-java.jar
import sys,os,os.path
sys.path.append('/usr/share/java')
!echo $PATH
!echo $PYTHONPATH
```

In [None]:

```python
#  
!sudo apt-get update && apt-get install -y --no-install-recommends apt-utils
!sudo apt-get install -y mysql-client
!sudo apt-get install -y python-dev 
!sudo apt-get install -y libmysqlclient-dev
!sudo apt-get install -y libmysql-java
```

In [None]:

```python
!pip2 install MySQL-python
```

In [2]:

```python
#URL = 'jdbc:mysql://172.17.0.8:6603/giskard'
#URL = 'jdbc:mysql://172.17.0.8:6603/giskard?user=root&password=datascience'
URL = 'jdbc:mysql://172.17.0.8:6603/giskard'

#spark.conf.set(spark.sparkContext.environment.
#    spark.driver.extraClassPath, r'/usr/share/java/mysql-connector-java.jar')
#spark.executor.extraClassPath = r'/usr/share/java/mysql-connector-java.jar'

df = (spark.read.format('jdbc')
      .options(
        url=URL,
        user='root',
        password='datascience',
        dbtable='supers',driver='com.mysql.jdbc.Driver')
      .load()
     )

# driver='com.mysql.jdbc.Driver'
```

```
Py4JJavaError: An error occurred while calling o42.load.
: java.lang.ClassNotFoundException: com.mysql.jdbc.Driver
	at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
	at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:38)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:49)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:49)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.createConnectionFactory(JdbcUtils.scala:49)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:123)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:117)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:53)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:345)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:122)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
```


In [3]:
%env

{'APACHE_SPARK_VERSION': '2.0.2',
 'CLICOLOR': '1',
 'CONDA_DIR': '/opt/conda',
 'DEBIAN_FRONTEND': 'noninteractive',
 'GIT_PAGER': 'cat',
 'GRANT_SUDO': 'yes',
 'HOME': '/home/jovyan',
 'HOSTNAME': 'e004b18c30ec',
 'JPY_PARENT_PID': '9',
 'LANG': 'en_US.UTF-8',
 'LANGUAGE': 'en_US.UTF-8',
 'LC_ALL': 'en_US.UTF-8',
 'LOGNAME': 'jovyan',
 'MAIL': '/var/mail/jovyan',
 'MESOS_NATIVE_LIBRARY': '/usr/local/lib/libmesos.so',
 'NB_UID': '1000',
 'NB_USER': 'jovyan',
 'PAGER': 'cat',
 'PATH': '/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
 'PWD': '/home/jovyan/work',
 'PYSPARK_PYTHON': '/opt/conda/envs/python2/bin/python',
 'PYTHONPATH': '/usr/local/spark/python:/usr/local/spark/python/lib/py4j-0.10.4-src.zip:/usr/lib/python2.7/dist-packages',
 'R_LIBS_USER': '/usr/local/spark/R/lib',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'SPARK_HOME': '/usr/local/spark',
 'SPARK_OPTS': '--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog