In [0]:
# Lab 00a: Before we begin, confirm all files are loaded
# Lists the contents of the DBFS folder "dbfs:/FileStore/tables/" so you can check the lab files are present.
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))

# Mod 11: Spark MLLIB/ML

### Lab 01 - Collaborative Filter - Movie Recommendation via MLLIB (RDDs)

In [0]:
%scala
// Lab 01a: Let's first do CF Lab via RDD (MLLib)

import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel



// Load the raw ratings file; each line is tab-separated: user, movie, rating (1 = min, 5 = max), timestamp
val raw = sc.textFile("/FileStore/tables/u.data")

// Parse every line into Rating(user, movie, rating); the timestamp field is discarded.
// A line that does not split into exactly four fields raises a MatchError.
val ratings = raw.map { line =>
  val Array(user, movie, rate, _) = line.split("\t")
  Rating(user.toInt, movie.toInt, rate.toDouble)
}

// Peek at the first two parsed ratings
ratings.take(2)

In [0]:
%scala
// Lab 01b: Create Model: Assign rank (# latent factors), iterations, lambda (regularization; controls overfitting)

val rank = 10
val numIterations = 5
val lambda = 0.01

// ALS initialises its factor matrices randomly. Fixing the seed makes the factors, and the
// recommendations produced in the later cells, reproducible from one run to the next.
val model = new ALS()
  .setRank(rank)
  .setIterations(numIterations)
  .setLambda(lambda)
  .setSeed(42L)
  .run(ratings)

// userFeatures is an RDD of (userId, factor array); productFeatures is the same, keyed by product id
model.userFeatures.take(1)

// Number of users that received a factor vector (not the number of factors -- that is `rank`)
model.userFeatures.count

// Number of products (movies) that received a factor vector
model.productFeatures.count

In [0]:
%scala
// Lab 01c: Build a MovieID -> MovieName lookup table on the driver

val movies = sc.textFile("/FileStore/tables/u.item")

// Each line is pipe-delimited: field 0 is used as the movie id, field 1 as the title.
// collectAsMap() returns the (id, title) pairs to the driver as a Map.
val titles = movies
  .map { line =>
    val fields = line.split("\\|")
    (fields(0).toInt, fields(1))
  }
  .collectAsMap()

// What is Movie title for Movie ID 180  (Hint: A great war movie)
println(titles(180))

In [0]:
%scala

// What is Movie title for Movie ID 180  (Hint: A great war movie)
// NOTE(review): this repeats the lookup already printed at the end of the Lab 01c cell.
println(titles(180))

In [0]:
%scala
// Lab 01d: Make Movie Recommendations for a specific User

// Generate Top 10 MovieID recommendation for User 2
// recommendProducts(user, num): first argument is the user id (2), second is how many products to return (10)
val topKRecs = model.recommendProducts(2,10)

In [0]:
%scala
// Lab 01e: Make Movie Name Recommendations for a specific User

// Replace each recommended product id with its title, then print one (title, predicted rating) pair per line
(for (rec <- topKRecs) yield (titles(rec.product), rec.rating)).foreach(println)

### Lab 02: Collaborative Filter - Movie Recommendation via ML (Spark SQL)

In [0]:
# Lab 02a: Load the ratings file, drop the 'timestamp' column and display the result

movieDF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/FileStore/tables/movielens_ratings.txt")
)
# The timestamp column is removed before the data is used for training
movieDF2 = movieDF.drop("timestamp")
display(movieDF2)

In [0]:
# Lab 02b:  Split into TRAIN and TEST DataFrames

# Fix the seed so the 80/20 split -- and therefore the model fitted on it -- is the same on every run.
(training, test) = movieDF2.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Lab 02c: Load Movie Name and remove excess Columns

nameDF = spark.read.csv("/FileStore/tables/movieid.txt", sep = "|", header=True, inferSchema=True)

# Columns the lab does not need: "dt", the numbered columns c0..c19 and "url".
# The names are generated and dropped in one call instead of chaining one .drop() per
# column (the original chain also listed "c5" twice).
unwantedCols = ["dt"] + [f"c{i}" for i in range(20)] + ["url"]
nameDF2 = nameDF.drop(*unwantedCols)
display(nameDF2)

In [0]:
# Lab 02d: Build the recommendation model using ALS on the TRAIN data
#          Note we set 'cold start strategy' to 'drop' to ensure we don't get NaN evaluation metrics

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# seed is fixed so that ALS's random factor initialisation is repeatable between runs;
# coldStartStrategy="drop" removes rows whose prediction would be NaN (unseen user or item).
als = ALS(maxIter=5, regParam=0.01, userCol="user", itemCol="id", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [0]:
# Lab 02e: Generate top 10 Movie ID recommendations for a specified set of users (User 0,1,2)

# distinct().limit(3) on its own returns an arbitrary three users that may change between runs.
# Ordering by the user column first makes the subset deterministic: the three lowest user ids.
userCol = als.getUserCol()
users = movieDF2.select(userCol).distinct().orderBy(userCol).limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
display(userSubsetRecs)

In [0]:
# Lab 02f: To convert Movie ID to Movie Name, we first EXPLODE out the 'recommendations' column
#          Now we have the value of the 'id' column

from pyspark.sql.functions import explode, col

# One output row per (user, recommended movie): explode the 'recommendations' array,
# then pull the 'id' and 'rating' fields out of each exploded struct.
nrecommendations = (
    userSubsetRecs
    .select("user", explode("recommendations").alias("rec_exp"))
    .select("user", "rec_exp.id", "rec_exp.rating")
)

display(nrecommendations)

In [0]:
# Lab 02g: Now JOIN nrecommendations to Movie Name dataframe (nameDF2) and see if Movie recommendations make sense
# Rows are matched on the shared 'id' column; ascending=False sorts by user and then rating, both descending.

display(nrecommendations.join(nameDF2, on='id').sort('user','rating', ascending=False))

### Lab 03: Correlation - Which Correlates most to Wins? Hits, Runs, Home Runs, ERA?

In [0]:
# Lab 03a: Load Baseball data and Display
# header=True takes the column names from the first row; inferSchema=True lets Spark detect the column types.

bbDF = spark.read.csv("/FileStore/tables/teams.csv", header=True, inferSchema=True)
display(bbDF)

In [0]:
# Lab 03b (1 of 4): Which Correlates more to W(ins)?  (H)its, (R)uns, HR (Home Runs) or ERA?
# This cell: correlation between columns "w" (wins) and "h" (hits).

bbDF.stat.corr("w", "h")

In [0]:
# Lab 03b (2 of 4): Which Correlates more to W(ins)?  (H)its, (R)uns, HR (Home Runs) or ERA?
# This cell: correlation between columns "w" (wins) and "r" (runs).

bbDF.stat.corr("w", "r")

In [0]:
# Lab 03b (3 of 4): Which Correlates more to W(ins)?  (H)its, (R)uns, HR (Home Runs) or ERA?
# This cell: correlation between columns "w" (wins) and "hr" (home runs).

bbDF.stat.corr("w", "hr")

In [0]:
# Lab 03b (4 of 4): Which Correlates more to W(ins)?  (H)its, (R)uns, HR (Home Runs) or ERA?
# This cell: correlation between columns "w" (wins) and "era".

bbDF.stat.corr("w", "era")

### Lab 04: KMeans - Divide Heart patients into 2 Clusters
#### Clustering will not tell us which patients have heart disease (that would require a supervised model such as logistic regression),
#### but you can logically deduce that one set of patients is sick and the other is not, since the indicators of health are the input data

In [0]:
# Lab 04a: Load Data
# Column 'thal' indicates if patient has heart problem:
  # 3 = Healthy
  # 6 = Repaired
  # 7 = Needs Surgery

import pandas as pd

# Read the heart CSV (first row is the header, column types are inferred) and print the first rows
df1 = spark.read.csv("dbfs:/FileStore/tables/heart1.csv", header=True, inferSchema=True)
df1.show()

In [0]:
# Lab 04b: Assign X-variables to Vector

from pyspark.ml.feature import VectorAssembler

# Input columns that are packed into the feature vector
features = (
    'age', 'sex', 'chest_pain', 'bp', 'chol', 'sugar',
    'ecg', 'maxhr', 'angina', 'depress', 'slope', 'vessels',
)

# VectorAssembler concatenates the listed columns into a single vector column named "features"
assembler = VectorAssembler(outputCol="features", inputCols=features)
df3 = assembler.transform(df1)
df3.select("features").show(truncate=False)

In [0]:
# Lab 04c: Run KMeans to produce 2 Clusters, then show the cluster assigned to each row
from pyspark.ml.clustering import KMeans

# Configure K-means for 2 clusters; the fixed seed keeps the result repeatable
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df3)

# Assign every row of df3 to one of the fitted clusters
predictions = model.transform(df3)

predictions.show()

In [0]:
# Lab 04d: Evaluate cluster quality with the Silhouette score
from pyspark.ml.evaluation import ClusteringEvaluator

# A ClusteringEvaluator with default settings is used to score the clustered rows
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Reading the silhouette score (it ranges from -1 to 1):
#   close to 1 -> clusters are well separated
#   close to 0 -> clusters are barely separated
#   negative   -> the data could not be separated into these clusters
# The original run of this lab reported about .57, which isn't bad

# Print each cluster centre; `ctr` keeps a list copy of the centres
print("Cluster Centers: ")
centers = model.clusterCenters()
ctr = list(centers)
for center in ctr:
    print(center)

### Lab 05:
### Logistic Regression - Goal: Predict Purchase based on Gender, Age, Salary

In [0]:
%scala
// Lab 05a:  Load DataFrame
// https://medium.com/@Sushil_Kumar/machine-learning-pipelines-with-spark-ml-94cd9b4c973d

// Read the advertising CSV: first row supplies the column names, column types are inferred
val advertDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("dbfs:/FileStore/tables/advert.csv")
display(advertDF)

In [0]:
%scala
// Lab 05b: Split DataFrame into 2 parts (weights 0.8 / 0.2, fixed seed so the split is repeatable)

val splits = advertDF.randomSplit(Array(0.8, 0.2), seed = 1234L)
// First element is the training set, second the test set
val Array(train, test) = splits

In [0]:
%scala
// Lab 05c: Perform 'OneHotEncoding' to Convert Categorical 'Gender' column (String) into Numeric

import org.apache.spark.ml.feature.OneHotEncoder
import org.apache.spark.ml.feature.StringIndexer

// Step 1: StringIndexer reads the string column "Gender" and writes a numeric index to "GenderIndex"
val genderIndexer = new StringIndexer().setInputCol("Gender").setOutputCol("GenderIndex")
// Step 2: OneHotEncoder reads "GenderIndex" and writes the encoded column "GenderOHE"
val genderOneHotEncoder = new OneHotEncoder().setInputCol("GenderIndex").setOutputCol("GenderOHE")

In [0]:
%scala
// Lab 05d: Define X-var Features (Gender, Age, Salary); they must be formatted into a Vector.
//           Y-var = 'Purchased' column
//           Normalize data via Scaler (optional)

import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

// Columns that go into the feature vector, and the name of the column to be predicted
val features = Array("GenderOHE", "Age", "EstimatedSalary")
val dependentVariable = "Purchased"

// VectorAssembler packs the feature columns into one vector column "features";
// StandardScaler reads "features" and writes its output to "scaledFeatures"
val vectorAssembler = new VectorAssembler().setInputCols(features).setOutputCol("features")
val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures")

In [0]:
%scala
// Lab 05e: Tie Scaled Features (X-var) to 'Purchase' (Y-var) via Logistic Regression
// This is our LabelPoint 

import org.apache.spark.ml.classification.LogisticRegression

// Logistic regression estimator: features come from "scaledFeatures",
// the label is the column named by `dependentVariable`
val logisticRegression = new LogisticRegression()
  .setFeaturesCol("scaledFeatures")
  .setLabelCol(dependentVariable)

In [0]:
%scala
// Lab 05f: Assemble the Pipeline
import org.apache.spark.ml.{Pipeline, PipelineModel}

// Stage order matters: each stage consumes a column produced by an earlier one
// (index Gender -> one-hot encode -> assemble vector -> scale -> logistic regression)
val stages = Array(genderIndexer, genderOneHotEncoder, vectorAssembler, scaler, logisticRegression)
val pipeline = new Pipeline().setStages(stages)

// Fitting reads 'train' repeatedly, so cache it first
train.cache()

// Fit the Pipeline (Create Model using 'train')
val model = pipeline.fit(train)

In [0]:
%scala
// Lab 05g: Using 'model', make Prediction on 'test'
// transform() runs every fitted pipeline stage on 'test' and returns it with the stages' output columns added

val results = model.transform(test)
display(results)

In [0]:
%scala
// Lab 05h: Calculate Model Accuracy

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// The metric requested below is "areaUnderROC" (area under the ROC curve), which is not the
// same thing as accuracy, so the value is named and reported as AUC.
val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol(dependentVariable)
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")

val areaUnderROC = evaluator.evaluate(results)

// Kept only for backward compatibility with code that still reads `accuracy`; it holds the AUC.
val accuracy = areaUnderROC

println(s"Area under ROC of Model : ${areaUnderROC}")

### Lab 06
### Logistic Regression again - Goal: Predict Income: < $50k or > $50k

This tutorial is designed to get you started with Apache Spark MLlib. It investigates a binary classification problem - can you predict if an individual's income is greater than $50,000 based on demographic data? The dataset is from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Adult) and is provided with Databricks Runtime. This notebook demonstrates some of the capabilities available in MLlib, including tools for data preprocessing, machine learning pipelines, and several different machine learning algorithms.

This notebook includes the following steps:

0. Load the dataset
0. Feature preprocessing
0. Define the model
0. Build the pipeline
0. Evaluate the model
0. Hyperparameter tuning
0. Make predictions and evaluate model performance

## Requirements
Databricks Runtime 7.0 or above or Databricks Runtime 7.0 ML or above. If you are running Databricks Runtime 6.x or Databricks Runtime 6.x ML, see ([AWS](https://docs.databricks.com/getting-started/spark/machine-learning.html)|[Azure](https://docs.microsoft.com/azure/databricks/getting-started/spark/machine-learning/)) for the correct notebook.

## Step 1:  Load the Data into Schema and create TRAIN/TEST. Then examine data

###  Lab 06a: View file

In [0]:
%fs head --maxBytes=1024 databricks-datasets/adult/adult.data

### Lab 06b: Create a Schema to assign column names and datatypes

In [0]:
# Lab 06b:

# Schema as a DDL string: one `name` TYPE pair per column.
# Numeric columns are declared DOUBLE; the remaining columns (including `income`) are STRING.
schema = """`age` DOUBLE,
`workclass` STRING,
`fnlwgt` DOUBLE,
`education` STRING,
`education_num` DOUBLE,
`marital_status` STRING,
`occupation` STRING,
`relationship` STRING,
`race` STRING,
`sex` STRING,
`capital_gain` DOUBLE,
`capital_loss` DOUBLE,
`hours_per_week` DOUBLE,
`native_country` STRING,
`income` STRING"""

# Read the file with the explicit schema defined above
dataset = spark.read.csv("/databricks-datasets/adult/adult.data", schema=schema)

### Lab 06c: Randomly split data into training and test sets, and set seed for reproducibility.
### It's best to split the data before doing any preprocessing. This allows the test dataset to more closely simulate new data when we evaluate the model.

In [0]:
# Lab 06c: 80/20 split with a fixed seed, then report the size of each part
trainDF, testDF = dataset.randomSplit([0.8, 0.2], seed=42)
trainDF.cache()  # the training data is accessed multiple times, so cache it
print(trainDF.count())
print(testDF.count())

### Lab 06d: Let's review the DataFrame

In [0]:
# Lab 06d: Print the first rows of the training DataFrame
trainDF.show()

### Lab 06e: What's the distribution of the number of `hours_per_week`?

In [0]:
# Lab 06e: Summary statistics for the `hours_per_week` column of the training data
display(trainDF.select("hours_per_week").summary())

### Lab 06f: How about `education` status?

In [0]:
# Lab 06f: Number of training rows per `education` value, most frequent first
display(trainDF
        .groupBy("education")
        .count()
        .sort("count", ascending=False))

## Background: Transformers, Estimators, and Pipelines

Three important concepts in MLlib machine learning that are illustrated in this notebook are **Transformers**, **Estimators**, and **Pipelines**. 

- **Transformer**: Takes a DataFrame as input, and returns a new DataFrame. Transformers do not learn any parameters from the data and simply apply rule-based transformations to either prepare data for model training or generate predictions using a trained MLlib model. You call a transformer with a `.transform()` method.

- **Estimator**: Learns (or "fits") parameters from your DataFrame via a `.fit()` method and returns a Model, which is a transformer.

- **Pipeline**: Combines multiple steps into a single workflow that can be easily run. Creating a machine learning model typically involves setting up many different steps and iterating over them. Pipelines help you automate this process.

For more information:
[ML Pipelines](https://spark.apache.org/docs/latest/ml-pipeline.html#ml-pipelines)

## Step 2. Feature Engineering (Preprocessing) 

The goal of this notebook is to build a model that predicts the `income` level from the features included in the dataset (education level, marital status, occupation, and so on). The first step is to manipulate, or preprocess, the features so they are in the format MLlib requires.

### Lab 06g: Convert categorical variables to numeric

Some machine learning algorithms, such as linear and logistic regression, require numeric features. The Adult dataset includes categorical features such as education, occupation, and marital status. 

The following code block illustrates how to use `StringIndexer` and `OneHotEncoder` to convert categorical variables into a set of numeric variables that only take on values 0 and 1. 

- `StringIndexer` converts a column of string values to a column of label indexes. For example, it might convert the values "red", "blue", and "green" to 0, 1, and 2. 
- `OneHotEncoder` maps a column of category indices to a column of binary vectors, with at most one "1" in each row that indicates the category index for that row.

One-hot encoding in Spark is a two-step process. You first use the StringIndexer, followed by the OneHotEncoder. The following code block defines the StringIndexer and OneHotEncoder but does not apply it to any data yet.

For more information:   
[StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer)   
[OneHotEncoder](https://spark.apache.org/docs/latest/ml-features.html#onehotencoder)

In [0]:
# Lab 06g:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

categoricalCols = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex"]

# Neither object touches any data yet; both are applied later (via fit / transform).
# StringIndexer: each categorical column "<col>" gets a companion "<col>Index" column.
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=[f"{name}Index" for name in categoricalCols],
)
# OneHotEncoder: each "<col>Index" column gets a companion "<col>OHE" column.
encoder = OneHotEncoder(
    inputCols=stringIndexer.getOutputCols(),
    outputCols=[f"{name}OHE" for name in categoricalCols],
)

# The label column ("income") is also a string value - it has two possible values, "<=50K" and ">50K".
# A separate StringIndexer converts it to the numeric column "label".
labelToIndex = StringIndexer(inputCol="income", outputCol="label")

### Lab 06h: In this notebook, we'll build a pipeline combining all of our feature engineering and modeling steps. But let's take a minute to look more closely at how estimators and transformers work by applying the `stringIndexer` estimator that we created in the previous code block.

You can call the `.fit()` method to return a `StringIndexerModel`, which you can then use to transform the dataset. 

The `.transform()` method of `StringIndexerModel` returns a new DataFrame with the new columns appended. Scroll right to see the new columns

For more information: [StringIndexerModel](https://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/feature/StringIndexerModel.html)

In [0]:
# Lab 06h: fit() returns the fitted indexer model; transform() then adds the "<col>Index" columns to trainDF
stringIndexerModel = stringIndexer.fit(trainDF)
display(stringIndexerModel.transform(trainDF))

### Lab 06i: Combine all feature columns into a single feature vector

Most MLlib algorithms require a single features column as input. Each row in this column contains a vector of data points corresponding to the set of features used for prediction. 

MLlib provides the `VectorAssembler` transformer to create a single vector column from a list of columns.

The following code block illustrates how to use VectorAssembler.

For more information: [VectorAssembler](https://spark.apache.org/docs/latest/ml-features.html#vectorassembler)

In [0]:
# Lab 06i:
from pyspark.ml.feature import VectorAssembler

# Feature vector = the one-hot encoded categorical columns followed by the numeric columns.
numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
assemblerInputs = [f"{name}OHE" for name in categoricalCols]
assemblerInputs += numericCols
vecAssembler = VectorAssembler(outputCol="features", inputCols=assemblerInputs)

## Step 3.  Lab 06j: Define the Model

This notebook uses a [logistic regression](https://spark.apache.org/docs/latest/ml-classification-regression.html#logistic-regression) model.

In [0]:
# Lab 06j:
from pyspark.ml.classification import LogisticRegression

# Reads the "features" vector and the numeric "label" column; regParam=1.0 sets the regularization parameter
lr = LogisticRegression(featuresCol="features", labelCol="label", regParam=1.0)

## Step 4. Lab 06k: Build the Pipeline and Score Model via TEST

A `Pipeline` is an ordered list of transformers and estimators. You can define a pipeline to automate and ensure repeatability of the transformations to be applied to a dataset. In this step, we define the pipeline and then apply it to the test dataset.

Similar to what we saw with `StringIndexer`, a `Pipeline` is an estimator. The `pipeline.fit()` method returns a `PipelineModel`, which is a transformer.

For more information:   
[Pipeline](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.Pipeline)  
[PipelineModel](https://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/PipelineModel.html)

In [0]:
# Lab 06k:
from pyspark.ml import Pipeline

# Define the pipeline based on the stages created in previous steps.
# Order: index categoricals -> one-hot encode -> index the label -> assemble features -> logistic regression
pipeline = Pipeline(stages=[stringIndexer, encoder, labelToIndex, vecAssembler, lr])

# Fit every stage of the pipeline on the TRAIN dataset
pipelineModel = pipeline.fit(trainDF)

# Apply the fitted pipeline to the TEST dataset.
predDF = pipelineModel.transform(testDF)

### Lab 06l: Display the predictions from the Model. The `features` column is a [sparse vector](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.linalg.SparseVector), which is often the case after one-hot encoding, because there are so many 0 values.

In [0]:
# Lab 06l: Display Prediction (feature vector, true label, predicted label and class probabilities)

display(predDF.select("features", "label", "prediction", "probability"))

## Step 5. Lab 06m: Evaluate the Model using ROC

The `display` command has a built-in ROC curve option.

In [0]:
# Lab 06m: Plot the ROC curve.
# stages[-1] is the last pipeline stage (the fitted logistic regression); the model's output columns are
# dropped from predDF before it is passed to display() together with the "ROC" option.
display(pipelineModel.stages[-1], predDF.drop("prediction", "rawPrediction", "probability"), "ROC")

To evaluate the model, we use the `BinaryClassificationEvaluator` to evaluate the area under the ROC curve and the `MulticlassClassificationEvaluator` to evaluate the accuracy.

For more information:  
[BinaryClassificationEvaluator](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.BinaryClassificationEvaluator)  
[MulticlassClassificationEvaluator](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator)

In [0]:
# Lab 06n:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Area under the ROC curve of the TEST predictions
bcEvaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print(f"Area under ROC curve: {bcEvaluator.evaluate(predDF)}")

# Accuracy of the TEST predictions
mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(predDF)}")

## Step 6. Hyperparameter tuning

MLlib provides methods to facilitate hyperparameter tuning and cross validation. 
- For hyperparameter tuning, `ParamGridBuilder` lets you define a grid search over a set of model hyperparameters.
- For cross validation, `CrossValidator` lets you specify an estimator (the pipeline to apply to the input dataset), an evaluator, a grid space of hyperparameters, and the number of folds to use for cross validation.
  
For more information:   
[Model selection using cross-validation](https://spark.apache.org/docs/latest/ml-tuning.html)  
[ParamGridBuilder](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.tuning)  
[CrossValidator](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator)

Use `ParamGridBuilder` and `CrossValidator` to tune the model. This example uses three values for `regParam` and three for `elasticNetParam`, for a total of 3 x 3 = 9 hyperparameter combinations for `CrossValidator` to examine.

In [0]:
# Lab 06o
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Grid of 3 regParam values x 3 elasticNetParam values = 9 combinations for the logistic regression `lr`
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())

Whenever you call `CrossValidator` in MLlib, Databricks automatically tracks all of the runs using [MLflow](https://mlflow.org/). You can use the MLflow UI ([AWS](https://docs.databricks.com/applications/mlflow/index.html)|[Azure](https://docs.microsoft.com/azure/databricks/applications/mlflow/)) to compare how each model performed.

In this example we use the pipeline we created as the estimator.

In [0]:
# WARNING: This cell takes about 5-10 minutes to execute
# Lab 06p: Create a 3-fold CrossValidator
# The estimator is the whole `pipeline`, scored with `bcEvaluator` over every entry of `paramGrid`;
# parallelism=4 is passed so that several candidate models can be evaluated at the same time.
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=bcEvaluator, numFolds=3, parallelism = 4)

# Run cross validations. This step takes a few minutes and returns the best model found from the cross validation.
cvModel = cv.fit(trainDF)

## Step 7. 
### Lab 06q: Make predictions and evaluate model performance
### Use the best Model identified by the Cross-Validation to make predictions on the test dataset, and then evaluate the model's performance using the area under the ROC curve.

In [0]:
# Lab 06q: Use the model identified by the cross-validation to make predictions on the TEST dataset
cvPredDF = cvModel.transform(testDF)

# Evaluate the model's performance based on area under the ROC curve and accuracy,
# using the same two evaluators that scored the untuned pipeline
print(f"Area under ROC curve: {bcEvaluator.evaluate(cvPredDF)}")
print(f"Accuracy: {mcEvaluator.evaluate(cvPredDF)}")

Using SQL commands, you can also display predictions grouped by age and occupation. This requires creating a temporary view of the predictions dataset.

In [0]:
# Lab 06r: Register the cross-validated predictions as the temp view "finalPredictions" so SQL cells can query them

cvPredDF.createOrReplaceTempView("finalPredictions")

In [0]:
%sql
-- Lab 06s: Number of rows per (occupation, predicted class) in the finalPredictions view

SELECT occupation, prediction, count(*) AS count
FROM finalPredictions
GROUP BY occupation, prediction
ORDER BY occupation

In [0]:
%sql
-- Lab 06t: Number of rows per (age, predicted class) in the finalPredictions view

SELECT age, prediction, count(*) AS count
FROM finalPredictions
GROUP BY age, prediction
ORDER BY age

### Lab 07: 
### Goal: Predict Bike rental counts (per hour) using Gradient Boost Trees Regression

In [0]:
# Lab 07a: Load Data
# header=True takes the column names from the first row; inferSchema=True lets Spark detect the column types.

bikeDF = spark.read.csv("/FileStore/tables/hour.csv", header=True, inferSchema=True)
bikeDF.show()

In [0]:
# Lab 07b: Remove Columns not needed for Prediction

# drop() accepts several column names in a single call
bikeDF2 = bikeDF.drop("instant", "dteday", "casual", "registered")
bikeDF2.show(5)

In [0]:
# Lab 07c: Change all Data types to Double

from pyspark.sql.functions import col
# Schema before the cast
bikeDF2.printSchema()
# Cast every column to double, keeping each column's original name
bikeDF3 = bikeDF2.select([col(c).cast("double").alias(c) for c in bikeDF2.columns])
# Schema after the cast
bikeDF3.printSchema()

In [0]:
# Lab 07d: Visualize rental count ("cnt") against hour ("hr")
# Line Chart: In Plot options, Key = hr, Values = cnt
display(bikeDF3.select ("hr", "cnt"))

In [0]:
# Lab 07e: Create TRAIN and TEST DataFrames

# A fixed seed makes the 70/30 split -- and every result computed from it -- repeatable.
train, test = bikeDF3.randomSplit([0.7, 0.3], seed=42)
"We have %d TRAIN and %d TEST rows." % (train.count(), test.count())

In [0]:
# Lab 07f: Define Feature Pipeline

from pyspark.ml.feature import VectorAssembler, VectorIndexer
# Every column except the label 'cnt' is used as a feature
featuresCols = bikeDF3.columns
featuresCols.remove('cnt')
# Concatenate all feature columns into a single feature vector in a new column "rawFeatures"

vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures")
# Identify categorical features in "rawFeatures" (maxCategories=4) and index them; output column is "features"
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=4)

In [0]:
# Lab 07g: Define Y-var column for GBTRegressor

from pyspark.ml.regression import GBTRegressor

# Gradient-boosted tree regressor whose label column is "cnt"; all other settings are left at their defaults
gbt = GBTRegressor(labelCol="cnt")

In [0]:
# Lab 07h: Add Cross Validation to Pipeline
# This will help determine best parameters to enter for better Model Accuracy

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
#  - maxIter: iterations, i.e., number of trees in each GBT ensemble
# In this example, we keep these values small.  In practice, to get the highest accuracy, you would likely
# want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
paramGrid = ParamGridBuilder().addGrid(gbt.maxDepth, [2, 5]).addGrid(gbt.maxIter, [5, 50]).build()

# Evaluation metric: RMSE between the label column and the prediction column that `gbt` is configured with.
evaluator = RegressionEvaluator(metricName="rmse", labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol())

# Declare the CrossValidator, which runs model tuning for us over the 2 x 2 = 4 grid entries.
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)

In [0]:
# Lab 07i: Tie Feature Processing/Model training stages together into single Pipeline

from pyspark.ml import Pipeline
# Assemble features -> index categorical features -> cross-validated GBT (the CrossValidator is the final stage)
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])

In [0]:
# Lab 07j: Create Model (takes 6-7 minutes to run)

# Cache the training data before it is handed to the pipeline
train.cache()

# This takes a while to run: fitting the pipeline includes the cross-validation stage

pipelineModel = pipeline.fit(train)

In [0]:
# Lab 07k: Score Model on TEST
# Shows the actual count ("cnt"), the model's "prediction", and then every feature column

predictions = pipelineModel.transform(test)
predictions.select("cnt", "prediction", *featuresCols).show()

In [0]:
# Lab 07l: Visualize the Prediction

# Line Chart: In Plot options, Key = hr, Values = cnt and prediction
# The original cell selected only "hr" and "cnt", i.e. it re-plotted the actual counts and never
# showed the model output. Plotting "cnt" and "prediction" together lets you compare them:
# the closer the two lines are, the better the model fits the TEST data.
display(predictions.select("hr", "cnt", "prediction"))

### Lab 08: Predict Titanic Survive, Not Survive using Decision Tree
#### https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/5722190290795989/3865595167034368/8175309257345795/latest.html

### Lab 08a: Load Libraries and Load/View DataFrame

In [0]:
# Lab 08a:

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer

In [0]:
# Lab 08a:

# Tab-separated file; the header and inferSchema options are passed as the string 'True'
titanic_df = spark.read.csv("/FileStore/tables/titanic_master.txt", sep = '\t', header = 'True',inferSchema='True')
display(titanic_df)

### Lab 08b: View Data Statistics

In [0]:
# Lab 08b: Out of 891 passengers in dataset, only about 342 survived
# Row count for each value of "Survived"

groupBy_output = titanic_df.groupBy("Survived").count()
display(groupBy_output)

In [0]:
# Lab 08b: Although there are more males than females on the ship, about twice as many females as males survived.
# Row count for each (Gender, Survived) combination
titanic_df.groupBy("Gender","Survived").count().show()

In [0]:
# Lab 08b: 1st Class had good Survival rate, 3rd Class not so good
# Row count for each (Pclass, Survived) combination, ordered by class and then outcome
titanic_df.groupBy("Pclass","Survived").count().orderBy("Pclass", "Survived").show()

### Lab 08c: Clean the Data

In [0]:
# Lab 08c: This function finds NULLs

def null_value_count(df):
  """Return the columns of ``df`` that contain NULLs, with their NULL counts.

  Args:
    df: Spark DataFrame to inspect.

  Returns:
    A list of ``(column_name, null_row_count)`` tuples in ``df.columns`` order,
    containing only the columns that have at least one NULL.

  Note:
    One ``count()`` job is run per column. The original additionally ran a full
    ``df.count()`` whose result was never used; that wasted scan has been removed.
  """
  null_columns_counts = []
  for k in df.columns:
    # Rows where column k is NULL
    nullRows = df.where(col(k).isNull()).count()
    if nullRows > 0:
      null_columns_counts.append((k, nullRows))
  return null_columns_counts

In [0]:
# Lab 08c: Call null_value_count and show its (column, null count) pairs as a two-column DataFrame

null_columns_count_list = null_value_count(titanic_df)
spark.createDataFrame(null_columns_count_list, ['Column_With_Null_Value', 'Null_Values_Count']).show()

In [0]:
# Lab 08c: Find mean Age
# collect() brings the result of the aggregation back to the driver as a list of Rows

mean_age = titanic_df.select(mean('Age')).collect()
display(mean_age)

In [0]:
# Lab 08c: Filling every missing Age with the overall mean (29) would be crude: there were people of
# many different ages, and a 4-year-old child should not be assigned the mean age.
# The Name column contains a salutation such as Mr or Mrs, so we extract it here and later
# assign each group's mean age to the missing values of that group.
# The regex ([A-Za-z]+)\. captures a run of letters that is immediately followed by a . (dot).

# Raw string literal: in a normal string "\." is an invalid escape sequence (Python warns about it),
# whereas r"..." hands the backslash to the regex engine untouched.
titanic_df = titanic_df.withColumn("Initial",regexp_extract(col("Name"),r"([A-Za-z]+)\.",1))
titanic_df.select("Initial").distinct().show()

In [0]:
# Lab 08c: Some Initials are rare, or are variants such as Mlle or Mme that stand for Miss.
# Fold them into the main groups (Miss, Mr, Mrs, Other).

# rare Initial -> group it is folded into
initial_replacements = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
# subset restricts the replacement to the 'Initial' column. Without it, replace() rewrites
# matching values in every string column of the DataFrame, not just the extracted Initial.
titanic_df = titanic_df.replace(initial_replacements, subset=['Initial'])

titanic_df.select("Initial").distinct().show()


In [0]:
# Lab 08c: Let's check the average Age by Initial
# The averages shown here are the source of the per-group fill values used to impute missing ages

titanic_df.groupby('Initial').avg('Age').collect()

In [0]:
# Lab 08c: Fill missing Age values based on the average age of the passenger's Initial group

# (Initial, age assigned when Age is NULL) -- applied one group at a time, in this order
age_by_initial = [("Miss", 22), ("Other", 46), ("Master", 5), ("Mr", 33), ("Mrs", 36)]

for initial, fill_age in age_by_initial:
    # Only rows of this group whose Age is still missing are changed; every other row keeps its Age
    needs_fill = (titanic_df["Initial"] == initial) & (titanic_df["Age"].isNull())
    titanic_df = titanic_df.withColumn("Age", when(needs_fill, fill_age).otherwise(titanic_df["Age"]))

In [0]:
# Lab 08c: The majority of passengers boarded at "S", so missing Embarked values are filled with "S"

# Show the count per embarkation value first, then fill the NULLs
titanic_df.groupBy("Embarked").count().show()
titanic_df = titanic_df.na.fill({"Embarked" : 'S'})

In [0]:
# Lab 08c: Drop the Cabin column because it has lots of NULL values

titanic_df = titanic_df.drop("Cabin")

In [0]:
# Lab 08c: Create two new features, "Family_Size" and "Alone".
# Family_Size is the sum of SibSp (siblings/spouses) and Parch (parents/children), so we can check
# whether the survival rate has anything to do with the family size of the passengers.

titanic_df = titanic_df.withColumn("Family_Size", col('SibSp') + col('Parch'))
# Alone is 1 when Family_Size is 0, otherwise 0
titanic_df = titanic_df.withColumn("Alone", when(col("Family_Size") == 0, 1).otherwise(lit(0)))

### Lab 08d: Convert Categorical to Numeric and then Pipeline and Transform. Then View

In [0]:
# Lab 08d: Let's convert the Gender, Embarked & Initial columns from string to number using StringIndexer

# One fitted StringIndexer per column; each writes its result to "<column>_index"
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(titanic_df) for column in ["Gender","Embarked","Initial"]]
# Run the three indexers as one pipeline and keep the transformed DataFrame
pipeline = Pipeline(stages=indexers)
titanic_df = pipeline.fit(titanic_df).transform(titanic_df)

In [0]:
# Lab 08d: 

display(titanic_df)

### Lab 08e: Drop Columns not Required

In [0]:
# Lab 08e: Drop columns which are not required
# The string columns Gender, Embarked and Initial are dropped here; their "_index" versions remain.

titanic_df = titanic_df.drop("PassengerId","Name","Ticket","Cabin","Embarked","Gender","Initial")
titanic_df.show()

### Lab 08f: Put all X-variables into a Vector and Transform. Then split into TRAIN/TEST

In [0]:
# Lab 08f: Put all Features (X-vars) into a Vector

# Select the inputs by name rather than by position: columns[1:] is only correct while the label
# 'Survived' happens to be the first column, and would otherwise put the label into the features.
feature_cols = [c for c in titanic_df.columns if c != "Survived"]
feature = VectorAssembler(inputCols=feature_cols,outputCol="features")
feature_vector= feature.transform(titanic_df)

In [0]:
# Lab 08f: Now that the data is all set, split it into Training (80%) and Test (20%).
# seed = 11 keeps the split the same on every run.

(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2],seed = 11)

### Lab 08g: Run Decision Tree Classifier to create Model then make Prediction

In [0]:
# Lab 08g: Decision Tree classifier
from pyspark.ml.classification import DecisionTreeClassifier

# Cache the training set before fitting
trainingData.cache()

# Fit a decision tree that predicts "Survived" from the assembled "features" vector
dt = DecisionTreeClassifier(featuresCol="features", labelCol="Survived")
dt_model = dt.fit(trainingData)

# Score the test set, then show prediction, actual label and features side by side
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "Survived", "features").show(n=200, truncate=False)

### Lab 08h: Evaluate Accuracy of Decision Tree

In [0]:
# Lab 08h: Accuracy = correct predictions / all predictions (about 81% in the original run)

dtDF = dt_prediction.select("prediction", "Survived")
dtDF.createOrReplaceTempView("gbt_view")

# A single pass over the view: count the rows where the prediction matches the label
# and divide by the total row count. The counts are not cast to dec(5,2), because that
# type cannot hold a count of 1000 or more.
spark.sql(
    "SELECT sum(CASE WHEN cast(prediction as integer) = Survived THEN 1 ELSE 0 END) / count(*) AS accuracy "
    "FROM gbt_view"
).show()

# End of Mod-11 (MLlib-ML) 
## Ignore Past here

### Bonus Lab
### Goal: Linear Regression: Predict Home Prices

In [0]:
# Lab 08a: Load the real-estate CSV (the first line is a header; column types are inferred)

homeDF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/FileStore/tables/realestate.csv")
)
homeDF.show()

In [0]:
# Lab 08b: Select the Y-variable (price) and X-variables (baths, beds, sqft),
# keeping only rows where every X-variable is greater than zero

homeDF = (
    homeDF.select("price", "baths", "beds", "sqft")
    .filter("baths > 0")
    .filter("beds > 0")
    .filter("sqft > 0")
)
display(homeDF)

In [0]:
# Lab 08c: Create LabeledPoint
import pyspark.mllib
import pyspark.mllib.regression
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import *

# Column 0 is the label (price); the remaining columns (baths, beds, sqft) are the features.
# The slice is passed to LabeledPoint as-is. Wrapping it in another list would hand over
# a nested one-element sequence instead of a flat three-element feature vector.
lpRDD = homeDF.rdd.map(lambda c: LabeledPoint(c[0], c[1:]))

lpRDD.take(3)

In [0]:
# Lab 08d: 'summary' shows that 'mean' and 'stddev' differ widely between columns, so scale the X-vars
from pyspark.mllib.feature import StandardScaler

homeDF.summary().show()

# Keep only the X-vars (baths, beds, sqft); column 0 (price) is left out
xVarRDD = homeDF.rdd.map(lambda row: row[1:])

# Fit the scaler on the X-vars, then apply it.
# The arguments spell out the defaults: scale by standard deviation, no mean-centering.
standarizer = StandardScaler(withMean=False, withStd=True)
model = standarizer.fit(xVarRDD)
normalXvarRDD = model.transform(xVarRDD)

normalXvarRDD.take(3)

In [0]:
# Lab 08e: Create new Normalized LabeledPoint. Pair each Y value with its scaled X-vars using 'zip()'

YvarRDD = homeDF.rdd.map(lambda c: c[0])

# Both RDDs are mapped from homeDF.rdd, so they line up element by element for zip()
newKVRDD = YvarRDD.zip(normalXvarRDD)
newKVRDD.take(3)

# Each pair is (price, scaled feature vector). The vector is passed straight to LabeledPoint;
# wrapping it in a list would nest the features one level too deep.
newlpRDD = newKVRDD.map(lambda c: LabeledPoint(c[0], c[1]))
newlpRDD.take(3)

In [0]:
# Lab 08f: Create TRAIN/TEST, Model, Score Model

import pyspark.mllib
import pyspark.mllib.regression
from pyspark.mllib.regression import LinearRegressionWithSGD

# Create TRAIN/TEST: 80/20 split with a fixed seed
trainRDD, testRDD = newlpRDD.randomSplit([0.8, 0.2], seed=1234)

# Train with 1000 iterations and a step size of 0.2
linearModel = LinearRegressionWithSGD.train(trainRDD, iterations=1000, step=0.2)

# Look at first 10 rows of TEST
# (this is not the last expression in the cell, so the notebook will not display it)
testRDD.take(10)

# Enter the 9th row's house X-vars to see what 'Price' prediction the model comes up with.
# Hopefully it'll be close to the ACTUAL 'Price' of $181,872
linearModel.predict([1.4929, 3.5205, 1.7353])