# Pipelines

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

In [2]:
# `spark` is used by every cell below but was never created in this notebook,
# so it only worked where a session is pre-injected (e.g. the pyspark shell).
# getOrCreate() reuses an existing session and builds one otherwise.
spark = SparkSession.builder.appName("tutorial").getOrCreate()

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
    ], ["id", "text", "label"])
training.show()

+---+----------------+-----+
| id|            text|label|
+---+----------------+-----+
|  0| a b c d e spark|  1.0|
|  1|             b d|  0.0|
|  2|     spark f g h|  1.0|
|  3|hadoop mapreduce|  0.0|
+---+----------------+-----+



In [3]:
# Unlabeled documents to score: each row is an (id, text) pair.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)
test.show()

+---+------------------+
| id|              text|
+---+------------------+
|  4|       spark i j k|
|  5|             l m n|
|  6|spark hadoop spark|
|  7|     apache hadoop|
+---+------------------+



In [4]:
# Three-stage text pipeline: split `text` into `words`, hash the words into a
# `features` vector, then fit a logistic regression on that vector.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(inputCol="words", outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(
    stages=[tokenizer, hashing_tf, lr],
)

In [5]:
# Fit the pipeline to the training documents. This runs the stages in order
# (tokenizer, hashing_tf, lr) and returns the fitted pipeline as `model`.
model = pipeline.fit(training)

In [6]:
# Score the test documents and print one summary line per document, with the
# two class probabilities rounded to one decimal place.
predictions = model.transform(test)
results = predictions.select('id', 'text', 'probability', 'prediction')
for row in results.collect():
    rid, text, prob, prediction = row
    rounded_prob = [round(prob[0], 1), round(prob[1], 1)]
    summary = "({}, {}) --> prob={}, prediction={}".format(
        rid, text, str(rounded_prob), prediction
    )
    print(summary)

(4, spark i j k) --> prob=[0.2, 0.8], prediction=1.0
(5, l m n) --> prob=[0.8, 0.2], prediction=0.0
(6, spark hadoop spark) --> prob=[0.1, 0.9], prediction=1.0
(7, apache hadoop) --> prob=[1.0, 0.0], prediction=0.0


# Feature Transformation

In [7]:
# Toy ad-campaign rows: two categorical columns, three count columns and the
# binary `install` column that is used as the label further down.
training = spark.createDataFrame(
    data=[
        ("campaign_1", "candycrush", 5, 4, 3, 1),
        ("campaign_2", "farmvilles", 2, 3, 2, 1),
        ("campaign_1", "candycrush", 0, 2, 4, 0),
        ("campaign_2", "candycrush", 0, 8, 8, 0),
        ("campaign_2", "farmvilles", 6, 3, 4, 1),
        ("campaign_1", "candycrush", 9, 2, 1, 1),
    ],
    schema=[
        "campaignId",
        "sourceGameId",
        "startCount",
        "viewCount",
        "clickCount",
        "install",
    ],
)
training.show()

+----------+------------+----------+---------+----------+-------+
|campaignId|sourceGameId|startCount|viewCount|clickCount|install|
+----------+------------+----------+---------+----------+-------+
|campaign_1|  candycrush|         5|        4|         3|      1|
|campaign_2|  farmvilles|         2|        3|         2|      1|
|campaign_1|  candycrush|         0|        2|         4|      0|
|campaign_2|  candycrush|         0|        8|         8|      0|
|campaign_2|  farmvilles|         6|        3|         4|      1|
|campaign_1|  candycrush|         9|        2|         1|      1|
+----------+------------+----------+---------+----------+-------+



In [8]:
# Rows to score with the fitted pipeline. The last row uses a campaignId and a
# sourceGameId that do not appear in the training rows.
test = spark.createDataFrame(
    data=[
        ("campaign_1", "candycrush", 2, 4, 3, 1),
        ("campaign_2", "farmvilles", 2, 3, 2, 1),
        ("campaign_3", "mangotimes", 2, 3, 2, 1),
    ],
    schema=[
        "campaignId",
        "sourceGameId",
        "startCount",
        "viewCount",
        "clickCount",
        "install",
    ],
)
test.show()

+----------+------------+----------+---------+----------+-------+
|campaignId|sourceGameId|startCount|viewCount|clickCount|install|
+----------+------------+----------+---------+----------+-------+
|campaign_1|  candycrush|         2|        4|         3|      1|
|campaign_2|  farmvilles|         2|        3|         2|      1|
|campaign_3|  mangotimes|         2|        3|         2|      1|
+----------+------------+----------+---------+----------+-------+



In [9]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, RobustScaler, VectorAssembler

# Column groups consumed by the feature stages.
CATEGORICAL_FEATS = ["campaignId", "sourceGameId"]
NUMERICAL_FEATS = ["startCount", "viewCount", "clickCount"]

# Categorical strings -> numeric indices -> one-hot vectors. Both stages use
# handleInvalid="keep" so that values unseen during fitting do not raise.
str_indexer = StringIndexer(
    inputCols=CATEGORICAL_FEATS,
    outputCols=["campaignIdIndex", "sourceGameIdIndex"],
    handleInvalid="keep",
)
encoder = OneHotEncoder(
    inputCols=str_indexer.getOutputCols(),
    outputCols=["campaignIdVec", "sourceGameIdVec"],
    handleInvalid="keep",
)

In [10]:
# Pack the numeric columns into one vector column, then scale that vector.
assembler1 = VectorAssembler(
    inputCols=NUMERICAL_FEATS,
    outputCol="num_features",
)
scaler = RobustScaler(
    inputCol="num_features",
    outputCol="scaled",
)

In [11]:
# Join the one-hot vectors and the scaled numeric vector into the final
# `features` column, and train the classifier on it with `install` as label.
assembler2 = VectorAssembler(
    inputCols=["campaignIdVec", "sourceGameIdVec", "scaled"],
    outputCol="features",
)
lr = LogisticRegression(
    featuresCol="features",
    labelCol="install",
    maxIter=10,
    regParam=0.001,
)

In [12]:
# Chain every feature stage and the classifier, then fit on the raw rows.
pipeline = Pipeline(stages=[str_indexer, encoder, assembler1, scaler, assembler2, lr])
pipeline_model = pipeline.fit(training)
# Keep the transformed frame under its own name. Overwriting `training` made
# this cell non-idempotent: on a second run the input would already contain
# the stages' output columns and fitting would fail.
training_transformed = pipeline_model.transform(training)
training_transformed.select("features", "install").show()

+--------------------+-------+
|            features|install|
+--------------------+-------+
|[1.0,0.0,0.0,1.0,...|      1|
|[0.0,1.0,0.0,0.0,...|      1|
|(9,[0,3,7,8],[1.0...|      0|
|(9,[1,3,7,8],[1.0...|      0|
|[0.0,1.0,0.0,0.0,...|      1|
|[1.0,0.0,0.0,1.0,...|      1|
+--------------------+-------+



In [13]:
# Score the test rows with the fitted pipeline and show the assembled feature
# vector next to the predicted label. The third test row holds category values
# absent from the training rows.
predictions = pipeline_model.transform(test)
predictions.select("features", "prediction").show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[1.0,0.0,0.0,1.0,...|       1.0|
|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,0.0,1.0,0.0,...|       1.0|
+--------------------+----------+



# Model Selection Via Hyperparameter Tuning

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [15]:
# Labeled training documents as (id, text, label) rows; every text that
# contains "spark" carries label 1.0, all others 0.0.
training = spark.createDataFrame(
    data=[
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
        (8, "e spark program", 1.0),
        (9, "a e c l", 0.0),
        (10, "spark compile", 1.0),
        (11, "hadoop software", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [16]:
# Configure an ML pipeline made of three stages: tokenizer, hashing_tf and lr.
# hashing_tf reads whatever column the tokenizer writes.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(
    stages=[tokenizer, hashing_tf, lr],
)

In [17]:
# The whole pipeline is wrapped in a cross validator further down, which lets
# us tune parameters of ALL pipeline stages jointly.
# A CrossValidator requires: (i) an estimator, (ii) a grid of ParamMaps and
# (iii) an evaluator.
# ParamGridBuilder builds the grid to search over: 3 values for
# hashing_tf.numFeatures times 2 values for lr.regParam gives 3 x 2 = 6
# parameter combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(hashing_tf.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.001])
    .build()
)

## Cross Validation

In [18]:
# 2-fold cross validation over every parameter combination in `param_grid`,
# scored with BinaryClassificationEvaluator. The fold assignment is random, so
# the seed is fixed to make the selected model reproducible across runs.
cross_val = CrossValidator(estimator=pipeline,
                           estimatorParamMaps=param_grid,
                           evaluator=BinaryClassificationEvaluator(),
                           numFolds=2,
                           seed=42)

In [19]:
# Run the cross-validated grid search on the labeled documents; `cv_model`
# is the fitted result used for prediction below.
cv_model = cross_val.fit(training)

In [20]:
# Unlabeled (id, text) documents to score with the tuned model.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "mapreduce spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [21]:
# Make predictions on the test documents with the cross-validated model
# (cv_model applies the best parameter combination it found), then print
# each selected row: id, text, class probabilities and predicted label.
prediction = cv_model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

Row(id=4, text='spark i j k', probability=DenseVector([0.3413, 0.6587]), prediction=1.0)
Row(id=5, text='l m n', probability=DenseVector([0.9438, 0.0562]), prediction=0.0)
Row(id=6, text='mapreduce spark', probability=DenseVector([0.3451, 0.6549]), prediction=1.0)
Row(id=7, text='apache hadoop', probability=DenseVector([0.9561, 0.0439]), prediction=0.0)


## Train-Validation Split

In [22]:
from pyspark.ml.tuning import TrainValidationSplit

In [23]:
# Unlabeled (id, text) documents to score with the train-validation model.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "mapreduce spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [24]:
# Single train/validation split instead of k folds: trainRatio=0.8 trains on
# 80% of the rows and validates on the rest. The split is random, so the seed
# is fixed to make the selected model reproducible across runs.
train_val = TrainValidationSplit(estimator=pipeline,
                                 estimatorParamMaps=param_grid,
                                 evaluator=BinaryClassificationEvaluator(),
                                 trainRatio=0.8,
                                 seed=42)

In [25]:
# Fit the train-validation search on the labeled documents. Note that this
# rebinds `model`, which earlier held the plain fitted pipeline.
model = train_val.fit(training)

In [26]:
# Predict on the test data with `model`, i.e. with the parameter combination
# that performed best, and show features next to the predicted label.
(
    model.transform(test)
    .select("features", "prediction")
    .show()
)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|(10,[0,1,6],[1.0,...|       1.0|
|(10,[0,5],[2.0,1.0])|       0.0|
|(10,[0,6],[1.0,1.0])|       1.0|
|(10,[3,5],[1.0,1.0])|       0.0|
+--------------------+----------+



# Sampling

In [27]:
# Small labeled frame of (id, text, label) rows to sample from.
training = spark.createDataFrame(
    data=[
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)
training.show()

+---+----------------+-----+
| id|            text|label|
+---+----------------+-----+
|  0| a b c d e spark|  1.0|
|  1|             b d|  0.0|
|  2|     spark f g h|  1.0|
|  3|hadoop mapreduce|  0.0|
+---+----------------+-----+



In [34]:
# Sample with replacement. `fraction` is an expected rate rather than an exact
# row count, so the number of rows returned can vary. The seed is fixed so the
# displayed sample can be reproduced when the cell is re-run.
new_df = training.sample(withReplacement=True, fraction=0.8, seed=42)
new_df.show()

+---+----------------+-----+
| id|            text|label|
+---+----------------+-----+
|  1|             b d|  0.0|
|  3|hadoop mapreduce|  0.0|
+---+----------------+-----+



# Resilient Distributed Dataset (RDD) Basics

In [23]:
from pyspark import SparkContext, SparkConf

In [9]:
# The cells below use `sc`, which was never defined because the creation code
# was commented out. Constructing SparkContext directly fails when a context
# is already running, so use getOrCreate(): it returns the active context if
# there is one and otherwise starts a local two-core context from `conf`.
conf = SparkConf().setAppName('tutorial').setMaster('local[2]')
sc = SparkContext.getOrCreate(conf=conf)

In [15]:
# Turn a small local Python list into an RDD via the SparkContext.
data = [1, 2, 3, 4, 5, 6]
dist_data = sc.parallelize(data)

In [18]:
# Bring every element back to the driver as a Python list; acceptable here
# only because the RDD holds six numbers.
dist_data.collect()

[1, 2, 3, 4, 5, 6]

In [27]:
# Total number of characters in the file: read it as an RDD of lines, map
# each line to its length, then add the lengths together.
lines = sc.textFile("requirements.txt")
line_length = lines.map(len)
total_length = line_length.reduce(lambda running, size: running + size)
print(total_length)

4129


In [28]:
def word_count(s, sep="="):
    """Return the number of ``sep``-delimited fields in ``s``.

    Args:
        s: The line of text to split.
        sep: Field separator passed to ``str.split``. Defaults to ``"="``,
            which was previously hard-coded. Passing ``None`` splits on runs
            of whitespace instead.

    Returns:
        The number of fields produced by the split. With an explicit
        separator an empty string still counts as one (empty) field.
    """
    return len(s.split(sep))

In [30]:
# Map every line to its field count using word_count. This only defines the
# transformation; uncomment the collect() below to materialise the counts.
word_num = lines.map(word_count)
# word_num.collect()

In [33]:
# Count how often each distinct line occurs: pair every line with 1, then
# add up the 1s per identical line.
pairs = lines.map(lambda line: (line, 1))
counts = pairs.reduceByKey(lambda left, right: left + right)
counts.collect()

[('_libgcc_mutex=0.1=main', 1),
 ('anaconda=custom=py35_0', 1),
 ('astroid=2.0.4=py35_0', 1),
 ('babel=2.2.0=py35_0', 1),
 ('beautifulsoup4=4.4.1=py35_0', 1),
 ('chest=0.2.3=py35_0', 1),
 ('cloudpickle=0.1.1=py35_0', 1),
 ('clyent=1.2.1=py35_0', 1),
 ('conda=4.5.11=py35_0', 1),
 ('conda-build=1.20.0=py35_0', 1),
 ('cryptography=1.4=py35_0', 1),
 ('curl=7.45.0=0', 1),
 ('cycler=0.10.0=py35_0', 1),
 ('cytoolz=0.7.5=py35_0', 1),
 ('decorator=4.0.9=py35_0', 1),
 ('expat=2.2.9=he6710b0_2', 1),
 ('fastcache=1.0.2=py35_0', 1),
 ('flask=0.10.1=py35_1', 1),
 ('flask-cors=2.1.2=py35_0', 1),
 ('fontconfig=2.13.0=h9420a91_0', 1),
 ('glib=2.63.1=h5a9c865_0', 1),
 ('greenlet=0.4.9=py35_0', 1),
 ('gst-plugins-base=1.14.0=hbbd80ab_1', 1),
 ('gstreamer=1.14.0=hb453b48_1', 1),
 ('hdf5=1.8.15.1=2', 1),
 ('icu=58.2=he6710b0_3', 1),
 ('idna=2.0=py35_0', 1),
 ('intel-openmp=2019.4=243', 1),
 ('ipykernel=4.3.1=py35_0', 1),
 ('ipython=4.1.2=py35_1', 1),
 ('ipywidgets=4.1.1=py35_0', 1),
 ('jbig=2.1=0', 1),
 ('