## Tutorial: Spark Basics

### University of California, Santa Barbara  
### PSTAT 135/235  
### Last Updated: Oct 22, 2018

---  

### OBJECTIVES
- Getting started in Spark

### PREREQUISITES
- Spark 2.0 install

---  

In [1]:
# Get the already-running SparkContext, or create one if none exists yet.
# NOTE(review): no master or thread count is set in this cell, so whether this
# runs locally on a single thread depends on the surrounding Spark
# configuration -- confirm before relying on it.
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [2]:
import os

In [3]:
# Display the SparkContext to confirm that it is available in this kernel.
sc

In [11]:
# Directory that holds the tutorial's input files.
# The default is the location this notebook was originally written against;
# set the SPARK_TUTORIAL_DATA_DIR environment variable to point somewhere else
# without editing the notebook.
data_path = os.environ.get('SPARK_TUTORIAL_DATA_DIR',
                           '/home/jovyan/UCSB_BigDataAnalytics/data/')

In [12]:
# Name of the input text file, relative to data_path.
data_filename = 'some_text.txt'

In [13]:
# Full path to the input file: the data directory joined with the filename.
data_path_full = os.path.join(data_path, data_filename)

In [14]:
# List the entries of the current working directory (os.listdir() with no
# argument lists '.').
# NOTE(review): this does not list data_path -- confirm whether the data
# directory was the intended target.
os.listdir()

['computer_vision.ipynb',
 'data_preprocessing.ipynb',
 'spark_basics.ipynb',
 'test_script_spark_basics.py',
 '.ipynb_checkpoints',
 'infant_birth_classifier.ipynb',
 'train_gbm_with_cross_validation.ipynb',
 'exploratory_data_analysis.ipynb']

In [15]:
# Create an RDD from the text file; each element of the RDD is one line of the file.
data = sc.textFile(data_path_full)

In [16]:
# Display the RDD object itself: this shows its source path and RDD id, not its contents.
data

/home/jovyan/UCSB_BigDataAnalytics/data/some_text.txt MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:0

In [17]:
# Count the elements of the RDD, i.e. the number of lines read from the file.
data.count()

3

In [18]:
# Return the first element of the RDD, i.e. the first line of the file.
data.first()

'1) spark is my favorite tool for building ML models'

In [19]:
# Confirm that textFile returned an RDD.
type(data)

pyspark.rdd.RDD

In [20]:
# Keep only the lines that contain the substring 'ML' (the match is case-sensitive).
ml = data.filter(lambda line: 'ML' in line)

In [21]:
# Bring every matching line back to the driver as a Python list.
# collect() does this in a single Spark job, whereas take(count()) ran one job
# to count the elements and a second one to fetch them. Collecting everything
# is fine here because the filtered RDD is tiny.
ml.collect()

['1) spark is my favorite tool for building ML models']

In [22]:
# Split every line on whitespace and flatten the results into one RDD of words.
words = data.flatMap(lambda line: line.split())

# Classic word count:
#   1. pair each word with a count of 1,
#   2. sum the counts per word,
#   3. swap each pair to (count, word) so the count becomes the key,
#   4. sort by that key in descending order.
wordcounts = (
    words
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
)

# Show the ten most frequent words as (count, word) tuples.
wordcounts.take(10)

[(3, 'spark'),
 (2, 'is'),
 (2, 'my'),
 (2, 'for'),
 (1, 'favorite'),
 (1, 'good'),
 (1, 'of'),
 (1, '3)'),
 (1, '1)'),
 (1, 'tool')]

# Load some ML packages

In [23]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.ml.regression import LinearRegression

In [27]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression

# Work with SparkSession

In [28]:
from pyspark.sql import SparkSession

# Build a SparkSession, or reuse the one that already exists, under the given
# application name.
# NOTE(review): "spark.some.config.option" / "some-value" looks like a
# placeholder setting rather than a real Spark option -- confirm it is needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [29]:
# Display the SparkSession to confirm that it is available in this kernel.
spark