In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames
from __future__ import division

# Import supplementary visualization code visuals.py
# import visuals as vs

# Pretty display for notebooks
%matplotlib inline

## Welcome to Data Engineering Workshop

## Outline

### Part 0 : Orientation and setup
### Part 1 : Introduction
### Part 2 : Requirements and Tools
    Task 0: Prerequisites
    Task 1 : Run Docker containers
    Task 2 : Package and run a custom app using Docker
    Task 3 : Modify a running app
    Kahoot BREAK

### Break
 
### Part 3 : Deploy your App
 
### Part 4 : Exercises
 


# Orientation and Setup

# Requirements

* Your computer:
    * Docker

* You
    * Basic Shell

# Tools

* Two main types:
    * Virtual Machines:
        * Virtualbox
        * VMware
        * AWS, Google Compute Engine, ...
    * Containers:
        * Docker
        * Singularity
* Main idea -- isolate the computing environment
    * Allow regenerating computing environments
    * Allow sharing your computing environments

# Virtual Machines Vs. Containers
![containers_vs_VM](./images/containers-versus-virtual-machines-docker-inc-rightscale.jpg)

## Virtual Machines
* emulate whole computer system (software + hardware)
* run on top of a physical machine using a hypervisor
* _hypervisor_ shares and manages hardware of the host and executes the guest operating system
* guest machines are completely isolated and have dedicated resourcesc

# Virtual Machines Vs. Containers
![containers_vs_VM](./images/containers-versus-virtual-machines-docker-inc-rightscale.jpg)

## Docker Containers
* share the host system's kernel with other containers
* each container gets its own isolated user space
* only bins and libs are created from scratch
* containers are very lightweight and fast to start up

# Introduction

## Why do we use containers?

## What are the various types of container based environments?

## How to use Docker?

## Container Technologies
* Isolate the computing environments
* Provide a mechanism to encapsulate environments in a self-contained unit that can run anywhere

## Why do we need containers? 

## Science Reproducibility

* Each Project in a lab depends on complex software environments
    * Operating system
    * drivers
    * software dependencies: Python/MATLAB/R+libraries
* We try to avoid
    * the computer I used was shut down a year ago, can't rerun the results from my publication...
    * the analysis were run by my student, have no idea where and how...
    * etc.
* Collaboration with your colleagues
    * Sharing your code or using a repository might not be enough
    * We try to avoid
        * Well, I forgot to mention that you have to use Clang, gcc never worked for me...
        * don't see any reason why it shouldn't work on Windows...(I actually have no idea about Windows, but won't say it...)
        * it works on my coputer...
        * etc.
* Freedom to experiment!

# Docker
* leading software container platform
* an open-source project
* it runs now on Mac OS x and Windows (you don't have to run VM!)

## Testing your Docker Installation:
`docker run hello-world`

## Interesting tutorials and blog posts:

* [A beginner friendly intro to VMs and Docker](https://medium.freecodecamp.org/a-beginner-friendly-introduction-to-containers-vms-and-docker-79a9e3e119b)
* [Intro to Docker from Neurohackweek](https://neurohackweek.github.io/docker-for-scientists/)
* [Understanding Images](https://code.tutsplus.com/tutorials/docker-from-the-ground-up-understanding-images--cms-28165)

# Break

# Deploy your App

# Exercises

# Docker: Using existing images

* Docker Hub -- repositories to share Docker images

* managing images:
``` 
$ docker pull ubuntu
$ docker images
# remove images
$ docker rmi <image_id>
# remove dangling images
$ docker rmi $(docker images | grep "^<none>" | awk '{print $3}') 
```

* running containers
```
$ docker run ubuntu
$ docker run ubuntu echo "hello from your container"
```

* `-it` option: running interactively
```
$ docker run -it ubuntu bash
```

# Docker: Using existing images

* managing containers
```
# list currently running containers
$ docker ps
# list created containers
$ docker ps -a
# remove containers
$ docker rm <container_id>
# remove all stops containers
$ docker rm $(docker ps -a -q)
```

* `--rm` option: automatically removing the container when it exits
```
$ docker run -it --rm ubuntu
```

* adding a data volume to a container (you can use multiple times to mount multiple data volumes)
```
# you should use absolute path to the LocalDirectory
$ docker run -it --rm -v LocalDirectory:/src ubuntu
# read only mode
$ docker run -it --rm -v LocalDirectory:/src:ro ubuntu
# you can mount multiple data volumes
# the directory `temp` doesn't have to exist and will be created
$ docker run -it --rm -v LocalDirectory:/src -v TempLocalDirectory:/temp ubuntu
```

# Docker: Installing software with Dockerfile

* Create a new directory 
```
$ mkdir mydockerbuild
$ cd mydockerbuild
```

* Dockerfile content:
```
FROM ubuntu:latest
RUN apt-get update -y && apt-get install git emacs
```

* Building a new container:
```
$ docker build -t my_new_container .
```

* Running your new container:
```
$ docker run -it --rm my_new_container
```

* Within container you can try:
```
$ git
$ emacs
```

## Census Income Data Set 

In [2]:
# Load the Census dataset
orig_data = pd.read_csv("./Dataset/adult.data",header=None)
orig_data.columns = ['age','workclass','fnlwgt','education_level','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']

# Success - Display the first record
display(orig_data.head(n=10))

Unnamed: 0,age,workclass,fnlwgt,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


### Data Exploration

In [3]:
# TODO: Total number of records
n_records = len(orig_data)

# TODO: Number of records where individual's income is more than $50,000
n_greater_50k = len(orig_data[orig_data.income==' >50K'])

# TODO: Number of records where individual's income is at most $50,000
n_at_most_50k = len(orig_data[orig_data.income==' <=50K'])

# TODO: Percentage of individuals whose income is more than $50,000
greater_percent = n_greater_50k/n_records*100.0

# TODO: Visualizations

# TODO: Correlations

# TODO: ANOVA

# TODO: Chi-Squared tests

# Print the results
print ("Total number of records: {}".format(n_records))
print ("Individuals making more than $50,000: {}".format(n_greater_50k))
print ("Individuals making at most $50,000: {}".format(n_at_most_50k))
print ("Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent))

Total number of records: 32561
Individuals making more than $50,000: 7841
Individuals making at most $50,000: 24720
Percentage of individuals making more than $50,000: 24.08%


In [29]:
orig_data.shape

(32561, 15)

# Change data types for categorical features

In [17]:
orig_data['workclass'].astype('category')
orig_data['education_level'].astype('category')
orig_data['marital-status'].astype('category')
orig_data['occupation'].astype('category')
orig_data['relationship'].astype('category')
orig_data['race'].astype('category')
orig_data['sex'].astype('category')
orig_data['native-country'].astype('category')

0         United-States
1         United-States
2         United-States
3         United-States
4                  Cuba
5         United-States
6               Jamaica
7         United-States
8         United-States
9         United-States
10        United-States
11                India
12        United-States
13        United-States
14                    ?
15               Mexico
16        United-States
17        United-States
18        United-States
19        United-States
20        United-States
21        United-States
22        United-States
23        United-States
24        United-States
25        United-States
26        United-States
27                South
28        United-States
29        United-States
              ...      
32531     United-States
32532     United-States
32533             Japan
32534     United-States
32535     United-States
32536     United-States
32537     United-States
32538     United-States
32539     United-States
32540     United-States
32541     United

In [20]:
orig_data.groupby()

age                32561
workclass          32561
fnlwgt             32561
education_level    32561
education-num      32561
marital-status     32561
occupation         32561
relationship       32561
race               32561
sex                32561
capital-gain       32561
capital-loss       32561
hours-per-week     32561
native-country     32561
income             32561
dtype: int64

In [None]:
df_train['PassengerId'].groupby(df_train['Survived']).count()

# Correlation Matrix of Values

In [14]:
orig_data.corr()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768
education-num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123
capital-gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409
capital-loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256
hours-per-week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0


# Covariance Matrix of Values

In [16]:
orig_data.cov()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
age,186.0614,-110350.7,1.281849,7824.819,317.560742,11.58013
fnlwgt,-110350.6853,11140800000.0,-11729.527298,336662.5,-436030.333167,-24460.426185
education-num,1.281849,-11729.53,6.61889,2330.008,82.856445,4.705338
capital-gain,7824.818537,336662.5,2330.007877,54542540.0,-94085.760688,7150.032029
capital-loss,317.560742,-436030.3,82.856445,-94085.76,162376.937814,269.953755
hours-per-week,11.58013,-24460.43,4.705338,7150.032,269.953755,152.458995


# Age

In [11]:
age_describe = orig_data.age
print('Overall:',age_describe.describe())

Overall: count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64


# workclass

In [13]:
workclass_describe = orig_data.workclass
print('Overall:',workclass_describe.describe())

Overall: count        32561
unique           9
top        Private
freq         22696
Name: workclass, dtype: object


In [None]:
# education_level



In [None]:
# education-num



In [None]:
# marital-status



In [None]:
# occupation



In [None]:
# relationship



In [None]:
# race



In [None]:
# sex



In [None]:
# capital-gain



In [None]:
# capital-loss



In [None]:
# hours-per-week



In [None]:
# native-countr

## modified census dataset 

The modified census dataset consists of approximately 32,000 data points, with each datapoint having 13 features. This dataset is a modified version of the dataset published in the paper "Scaling Up the Accuracy of Naive-Bayes Classifiers: a Decision-Tree Hybrid", by Ron Kohavi. You may find this paper online, with the original dataset hosted on UCI.

#### Features

**age**: Age

**workclass**: Working Class (Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked)

**fnlwgt**: continuous

**education_level**: Level of Education (Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool)

**education-num**: Number of educational years completed

**marital-status**: Marital status (Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse)

**occupation**: Work Occupation (Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces)

**relationship**: Relationship Status (Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried)

**race**: Race (White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black)

**sex**: Sex (Female, Male)

**capital-gain**: Monetary Capital Gains

**capital-loss**: Monetary Capital Losses

**hours-per-week**: Average Hours Per Week Worked

**native-country**: Native Country (United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands)

### Target Variable

**income**: Income Class (<=50K, >50K)

In [13]:
# Load the Census dataset
data = pd.read_csv("./Finding-Donors-for-CharityML/census.csv")

# Success - Display the first record
display(data.head(n=5))

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [12]:
data.shape

(45222, 14)

In [61]:
# TODO: Total number of records
n_records = len(data)

# TODO: Number of records where individual's income is more than $50,000
n_greater_50k = len(data[data.income==">50K"])

# TODO: Number of records where individual's income is at most $50,000
n_at_most_50k = len(data[data.income=="<=50K"])

# TODO: Percentage of individuals whose income is more than $50,000
greater_percent = n_greater_50k/n_records*100.0

# Print the results
print ("Total number of records: {}".format(n_records))
print ("Individuals making more than $50,000: {}".format(n_greater_50k))
print ("Individuals making at most $50,000: {}".format(n_at_most_50k))
print ("Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent))

Total number of records: 45222
Individuals making more than $50,000: 11208
Individuals making at most $50,000: 34014
Percentage of individuals making more than $50,000: 24.78%
