# Notebook for Content-Based Filtering with GBT and Random Forest for 1M rows

In [None]:
# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession

# data science imports
import numpy as np
import pandas as pd

# Read in data through Spark, since the data is stored in Hadoop, and format the columns
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import *
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import *

# `sc` is used throughout the notebook but never defined in it. Obtain it from
# SparkSession so the notebook also runs on a kernel that does not pre-create it;
# getOrCreate() returns the already-running session/context when one exists.
sc = SparkSession.builder.getOrCreate().sparkContext
sqlContext = SQLContext(sc)

# Spark model imports
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

from pyspark.ml.regression import RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.regression import GBTRegressor, GBTRegressionModel

In [None]:
# Root location of the input datasets and of the saved model. Defaults to the
# original HDFS directory; set the CBF_DATA_PATH environment variable to point
# the notebook at a different location without editing the code.
data_path = os.environ.get('CBF_DATA_PATH', 'hdfs:///user/andrew/')
# Other cells build paths with plain string concatenation (data_path + name),
# so the root must end with a slash.
if not data_path.endswith('/'):
    data_path += '/'

In [None]:
# The "sdf" suffix marks a Spark DataFrame
movies_sdf = sqlContext.read.parquet(data_path + 'movie_metadata_OHE_subset')
# User metadata is loaded and its nulls are filled with 0 in one step
users_full_sdf = sqlContext.read.parquet(data_path + 'users_metadata').na.fill(0)

# Ratings lines are '::'-separated; the first three fields are read as integers
# into (user_id, item_id, label) and the remainder (the timestamp) is ignored
Rating = Row("user_id", "item_id", "label")
raw_rating_lines = sc.textFile(data_path + 'ratings.dat')
ratings = raw_rating_lines.map(
    lambda line: Rating(*[int(field) for field in line.split("::")[0:3]]))
ratings_sdf = sqlContext.createDataFrame(ratings)

In [None]:
# Attach each movie's profile to its ratings (matched on item_id) ...
ratings_metadata_sdf = ratings_sdf.join(movies_sdf, on=['item_id'])

# ... then attach the rating user's profile (matched on user_id) and remove
# the identifier/descriptive columns so only the label and inputs remain
non_feature_cols = ['user_id', 'item_id', 'title', 'imdb_id']
ratings_with_users_sdf = ratings_metadata_sdf.join(users_full_sdf, on=['user_id'])
ratings_muf_sdf = ratings_with_users_sdf.drop(*non_feature_cols)

In [None]:
# Reshape the joined ratings into a two-column (label, features) DataFrame.
# The first column of each row is taken as the label and every remaining
# column is packed into one dense feature vector.
# NOTE(review): assumes 'label' is column 0 after the joins/drops — confirm.
def _label_and_features(row):
    return (row[0], Vectors.dense(row[1:]))

ratings_muf_rdd = ratings_muf_sdf.rdd.map(_label_and_features)
ratings_muf_sdf_2 = sqlContext.createDataFrame(ratings_muf_rdd, ['label', 'features'])

In [None]:
# 75% / 25% train/test split with a fixed seed
split_weights = [0.75, 0.25]
rmuf_splits = ratings_muf_sdf_2.randomSplit(split_weights, seed=42)
rmuf_sdf_train, rmuf_sdf_test = rmuf_splits

## Metadata and Full User Data
### Decision Tree Classifier

In [None]:
%%time
# Train a single decision tree with a depth of 30
dtc  = DecisionTreeClassifier(maxDepth=30, labelCol="label", seed=42, maxMemoryInMB = 8192)
dtc_model = dtc.fit(rmuf_sdf_train)
dtc_model.featureImportances

In [None]:
%%time
# Make predictions on the test set
dtc_model_preds = dtc_model.transform(rmuf_sdf_test)
dtc_model_preds.show(2)

In [None]:
%%time
# Evaluate the test set predictions
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print 'f1 score:', evaluator.evaluate(dtc_model_preds)
print 'Weighted Precision score:', evaluator.evaluate(dtc_model_preds, {evaluator.metricName: "weightedPrecision"})
print 'Weighted Recall score:', evaluator.evaluate(dtc_model_preds, {evaluator.metricName: "weightedRecall"})
print 'Accuracy:', evaluator.evaluate(dtc_model_preds, {evaluator.metricName: "accuracy"})

### Random Forest Classifier

In [None]:
%%time
# Train a Random Forest Classifier with 100 trees with depth of 30
rfc  = RandomForestClassifier(numTrees=100, maxDepth=30, 
                              labelCol="label", seed=42, 
                              maxMemoryInMB= 1024, 
                              featureSubsetStrategy = 'auto',
                              minInstancesPerNode = 20,
                              # minInfoGain = ?,
                              subsamplingRate = 0.6,
                              maxBins = 5)
rfc_model = rfc.fit(rmuf_sdf_train)

In [None]:
%%time
# Make predictions on the test set
rfc_model_preds = rfc_model.transform(rmuf_sdf_test)
rfc_model_preds.show(2)

In [None]:
%%time
# Evaluate the test set predictions
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print 'f1 score:', evaluator.evaluate(rfc_model_preds)
print 'Weighted Precision score:', evaluator.evaluate(rfc_model_preds, {evaluator.metricName: "weightedPrecision"})
print 'Weighted Recall score:', evaluator.evaluate(rfc_model_preds, {evaluator.metricName: "weightedRecall"})
print 'Accuracy:', evaluator.evaluate(rfc_model_preds, {evaluator.metricName: "accuracy"})

### Random Forest Regressor

In [None]:
%%time
# Train a Random Forest Regressor with a 100 trees with depth of 30
rfr  = RandomForestRegressor(numTrees=100, maxDepth=30, 
                              labelCol="label", seed=42, 
                              maxMemoryInMB= 1024, 
                              featureSubsetStrategy = 'auto',
                              minInstancesPerNode = 20,
                              # minInfoGain = ?,
                              subsamplingRate = 0.6)
rfr_model = rfr.fit(rmuf_sdf_train)

In [None]:
%%time
# Make predictions on the test set
rfr_model_preds = rfr_model.transform(rmuf_sdf_test)
rfr_model_preds.show(2)

In [None]:
%%time
# Evaluate the test set predictions
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(rfr_model_preds)
print 'R-squared:', evaluator.evaluate(rfr_model_preds, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(rfr_model_preds, {evaluator.metricName: "mae"})

### Gradient Boosted Tree Regressor
##### Depth 30

In [None]:
%%time
# Train a GBT Regressor with a depth of 30
gbtr  = GBTRegressor(maxDepth=30, labelCol="label", seed=42, subsamplingRate=0.7, stepSize = 0.1, maxMemoryInMB= 2048)
gbtr_model = gbtr.fit(rmuf_sdf_train)

In [None]:
%%time
# Make predictions on the test set
gbtr_model_preds = gbtr_model.transform(rmuf_sdf_test)
gbtr_model_preds.show(2)

In [None]:
%%time
# Evaluate the test set predictions
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(gbtr_model_preds)
print 'R-squared:', evaluator.evaluate(gbtr_model_preds, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(gbtr_model_preds, {evaluator.metricName: "mae"})

##### Depth 10

In [None]:
%%time
# Train a GBT Regressor with a depth of 10
gbtr_2  = GBTRegressor(maxDepth=10, labelCol="label", seed=42, stepSize = 0.1, maxMemoryInMB= 2048)
gbtr_model_2 = gbtr_2.fit(rmuf_sdf_train)

In [None]:
%%time
# Make predictions on the test set
gbtr_model_preds_2 = gbtr_model_2.transform(rmuf_sdf_test)
gbtr_model_preds_2.show(2)

In [None]:
%%time
# Evaluate the test set predictions
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(gbtr_model_preds_2)
print 'R-squared:', evaluator.evaluate(gbtr_model_preds_2, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(gbtr_model_preds_2, {evaluator.metricName: "mae"})

In [None]:
# Persist the depth-10 GBT model, then load it back to check the round trip.
# write().overwrite() replaces a model left behind by an earlier run; a plain
# save() raises when the target path already exists, which made this cell
# fail every time the notebook was re-run.
gbt_model_path = data_path + 'GBTRegD10Model'
gbtr_model_2.write().overwrite().save(gbt_model_path)
sameModel = GBTRegressionModel.load(gbt_model_path)

In [None]:
%%time
# Make predictions on the test set
sameModel_preds_2 = sameModel.transform(rmuf_sdf_test)
sameModel_preds_2.show(2)

In [None]:
%%time
# Evaluate the loaded model's test set predictions
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(sameModel_preds_2)
print 'R-squared:', evaluator.evaluate(sameModel_preds_2, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(sameModel_preds_2, {evaluator.metricName: "mae"})

### Hyperparameter Grid Search
#### Gradient Boosted Trees

In [None]:
# %%time
# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# gbtr = GBTRegressor(seed = 42, maxMemoryInMB= 2048)
# grid = ParamGridBuilder() \
#         .addGrid(gbtr.maxDepth, [10, 20]) \
#         .addGrid(gbtr.subsamplingRate, [0.6, 0.7]) \
#         .addGrid(gbtr.stepSize, [0.01, 0.05, 0.1]) \
#         .build()
# evaluator = RegressionEvaluator(predictionCol="prediction", metricName="mae")
# cv = CrossValidator(estimator=gbtr, estimatorParamMaps=grid, evaluator=evaluator, seed = 42)
# cv_model = cv.fit(rmuf_sdf_train_2)
# evaluator.evaluate(cv_model.transform(rmuf_sdf_train_2))
#
# NOTE(review): this cell is disabled (everything is commented out). It would
# cross-validate a GBT regressor over a 2 x 2 x 3 grid of maxDepth,
# subsamplingRate and stepSize, scored by MAE. Before re-enabling:
#   - `rmuf_sdf_train_2` is not defined in the visible cells; the split cell
#     produces `rmuf_sdf_train` — confirm which DataFrame is intended.
#   - the last line scores the fitted model on the same data it was fitted on,
#     so that value is a training error, not a held-out estimate.

In [None]:
# evaluator.evaluate(cv_model.transform(rmuf_sdf_test_2))
# NOTE(review): disabled. `rmuf_sdf_test_2` is not defined in the visible cells —
# presumably `rmuf_sdf_test` from the split cell; confirm before re-enabling.

In [None]:
# Disabled: builds one dict per grid point (parameter name -> value), adds the
# matching entry of cv_model.avgMetrics under the evaluator's metric name, and
# collects the dicts into a pandas DataFrame. The second line is a list
# comprehension used only for its side effect (dict.update returns None).
# params = [{p.name: v for p, v in m.items()} for m in cv_model.getEstimatorParamMaps()]
# [ps.update({cv_model.getEvaluator().getMetricName(): metric}) for ps, metric in zip(params, cv_model.avgMetrics)]
# params_df = pd.DataFrame(params)

In [None]:
# print 'Best Parameters:'
# params_df.iloc[np.where(np.argmin(params_df.metric))]
# NOTE(review): disabled, and would not select the best row as written:
#   - np.argmin(...) yields a single integer position, and np.where on a scalar
#     returns the indices where it is non-zero, not the position itself;
#   - the params cell names the metric column after getMetricName(), so the
#     column is not called `metric`.
# Presumably the intent is the row with the smallest metric value, e.g.
# params_df.loc[params_df[<metric name>].idxmin()] — confirm before re-enabling.

In [None]:
# Shut down the SparkContext now that the notebook is finished. Cells that use
# `sc` or DataFrames built from it are not expected to work after this until a
# context is created again.
sc.stop()