## Tutorial: Spark Basics

### University of California, Santa Barbara  
### PSTAT 135/235  
### Last Updated: Oct 22, 2018

---  

### OBJECTIVES
- Getting started in Spark

### PREREQUISITES
- A working Spark 2.0 installation (with PySpark)

---  

In [62]:
# Get the active SparkContext, or create one if none exists yet.
# No master/conf is passed here, so the master URL and thread count come
# from the environment's Spark configuration rather than being fixed to
# a single local thread by this cell.
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [63]:
import os

In [64]:
# Display the SparkContext to confirm it was created.
sc

In [65]:
# Directory that holds the tutorial's input files.
# NOTE(review): absolute path, presumably specific to the environment this
# notebook was run in -- adjust it for your own setup.
data_path = '/home/jovyan/work/data/'

In [66]:
# Name of the text file to analyze, relative to data_path.
data_filename = 'some_text.txt'

In [67]:
# Full path to the input file, built from the directory and file name.
data_path_full = os.path.join(data_path, data_filename)

In [68]:
# List the data directory explicitly to confirm the input file is present.
# Passing data_path makes this check independent of the kernel's current
# working directory.
os.listdir(data_path)

['.ipynb_checkpoints', 'some_text.txt']

In [69]:
# Read the text file into an RDD; each element is one line of the file.
data = sc.textFile(data_path_full)

In [70]:
# Display the RDD's repr (source path and RDD kind), not its contents.
data

/home/jovyan/work/data/some_text.txt MapPartitionsRDD[27] at textFile at NativeMethodAccessorImpl.java:0

In [71]:
# Number of elements in the RDD, i.e. the number of lines in the file.
data.count()

3

In [72]:
# Peek at the first element (the first line of the file).
data.first()

'1) spark is my favorite tool for building ML models'

In [73]:
# Confirm that textFile returned an RDD.
type(data)

pyspark.rdd.RDD

In [74]:
# Keep only the lines that contain the substring 'ML' (case-sensitive).
ml = data.filter(lambda text: 'ML' in text)

In [75]:
# Return every matching line to the driver as a Python list.
# collect() does this in a single action; acceptable here because the
# filtered RDD is tiny.
ml.collect()

['1) spark is my favorite tool for building ML models']

In [76]:
# Word count: split each line on whitespace, tally every token, then rank
# the tokens by how often they occur.
words = data.flatMap(lambda line: line.split())
wordcounts = (
    words
    .map(lambda token: (token, 1))               # (token, 1) pairs
    .reduceByKey(lambda left, right: left + right)  # sum the 1s per token
    .map(lambda pair: (pair[1], pair[0]))        # flip to (count, token) so count is the key
    .sortByKey(ascending=False)                  # highest counts first
)
# Show the ten most frequent tokens as (count, token) tuples.
wordcounts.take(10)

[(3, 'spark'),
 (2, 'is'),
 (2, 'my'),
 (2, 'for'),
 (1, 'favorite'),
 (1, 'good'),
 (1, 'of'),
 (1, '3)'),
 (1, '1)'),
 (1, 'tool')]

# Load some ML packages

In [1]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.ml.regression import LinearRegression

In [2]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression

# Work with SparkSession

In [5]:
from pyspark.sql import SparkSession

# Build a SparkSession with the given application name, or reuse the
# current one if a session already exists.
# NOTE(review): "spark.some.config.option" looks like a placeholder
# setting -- confirm it is needed or drop it.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [6]:
# Display the SparkSession to confirm it was created.
spark