In [1]:
# Intialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession

# data science imports
import numpy as np
import pandas as pd

# Read in data through spark since the data is sored in hadoop and format the columns
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import *
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import *
sqlContext = SQLContext(sc)

# Classification
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Regression
from pyspark.ml.regression import RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml.regression import GBTRegressor, GBTRegressionModel
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
data_path = 'hdfs:///user/andrew/'

In [3]:
# sdf indicates spark dataframe
movies = sqlContext.read.parquet(data_path + 'movie_20m_metadata_OHE_subset')
users_full = sqlContext.read.parquet(data_path + 'users_metadata_20m') 
users_full = users_full.na.fill(0)

ratings = sqlContext.read.parquet(data_path + 'ratings_20m')
ratings = ratings.drop('timestamp')
ratings = ratings.withColumn("userId", ratings["userId"].cast("int"))
ratings = ratings.withColumn("rating", ratings["rating"] * 2) #Multiply by 2 so that values are whole numbers -> values 1 to 10

In [4]:
ratings_metadata = ratings.join(movies, ratings.movieId == movies.item_id)
# ratings with metadata and users full
ratings_muf = ratings_metadata.join(users_full, ratings.userId == users_full.user_id) \
        .drop('userId', 'user_id', 'movieId', 'item_id', 'title', 'imdb_id')

In [5]:
# ratings with metadata and users full converted to a dataframe of label and features
ratings_muf_rdd = ratings_muf.rdd.map(lambda x: (x[0], Vectors.dense(x[1:])))
ratings_muf_2 = sqlContext.createDataFrame(ratings_muf_rdd, schema = ['label', 'features'])

In [6]:
rmuf_train, rmuf_test = ratings_muf_2.randomSplit([0.75, 0.25], seed = 42)

## Metadata and Full User Data

### Random Forest Classifier

In [10]:
%%time
rfc  = RandomForestClassifier(numTrees=200, maxDepth=10, 
                              labelCol="label", seed=42, 
                              maxMemoryInMB= 8192, 
                              featureSubsetStrategy = 'auto',
                              minInstancesPerNode = 20,
                              # minInfoGain = ?,
                              subsamplingRate = 0.6,
                              maxBins = 10)
rfc_model = rfc.fit(rmuf_train)

CPU times: user 118 ms, sys: 49.1 ms, total: 167 ms
Wall time: 10min 15s


In [11]:
%%time
rfc_model_preds = rfc_model.transform(rmuf_test)
rfc_model_preds.show(2)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  1.0|[0.0,0.0,0.0,0.0,...|[0.0,1.9943577525...|[0.0,0.0099717887...|       8.0|
|  1.0|[0.0,0.0,0.0,88.0...|[0.0,10.307658893...|[0.0,0.0515382944...|       8.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 2 rows

CPU times: user 3.42 ms, sys: 1.82 ms, total: 5.24 ms
Wall time: 33.1 s


In [12]:
%%time
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print 'f1 score:', evaluator.evaluate(rfc_model_preds)
print 'Weighted Precision score:', evaluator.evaluate(rfc_model_preds, {evaluator.metricName: "weightedPrecision"})
print 'Weighted Recall score:', evaluator.evaluate(rfc_model_preds, {evaluator.metricName: "weightedRecall"})
print 'Accuracy:', evaluator.evaluate(rfc_model_preds, {evaluator.metricName: "accuracy"})

f1 score: 0.264468191464
Weighted Precision score: 0.336493997634
Weighted Recall score: 0.345180319496
Accuracy: 0.345180319496
CPU times: user 123 ms, sys: 33.2 ms, total: 156 ms
Wall time: 10min 14s


### Random Forest Regressor

In [13]:
%%time
rfr  = RandomForestRegressor(numTrees=200, maxDepth=10, 
                              labelCol="label", seed=42, 
                              maxMemoryInMB= 8192, 
                              featureSubsetStrategy = 'auto',
                              minInstancesPerNode = 20,
                              # minInfoGain = ?,
                              subsamplingRate = 0.6)
rfr_model = rfr.fit(rmuf_train)

CPU times: user 201 ms, sys: 70.6 ms, total: 272 ms
Wall time: 30min 17s


In [14]:
rfr_model_preds = rfr_model.transform(rmuf_test)
rfr_model_preds.show(2)

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|  1.0|[0.0,0.0,0.0,0.0,...| 6.173916921134639|
|  1.0|[0.0,0.0,0.0,88.0...|5.1369872989845575|
+-----+--------------------+------------------+
only showing top 2 rows



In [15]:
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(rfr_model_preds)
print 'R-squared:', evaluator.evaluate(rfr_model_preds, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(rfr_model_preds, {evaluator.metricName: "mae"})

RMSE score: 1.68841502653
R-squared: 0.355981716853
Mean Absolute Error: 1.29749801383


### Gradient Boosted Tree Regressor
##### Depth 30 (Long run time)

In [None]:
%%time
gbtr  = GBTRegressor(maxDepth=30, labelCol="label", seed=42, subsamplingRate=0.7, stepSize = 0.1, maxMemoryInMB= 8192)
gbtr_model = gbtr.fit(rmuf_train)

In [None]:
%%time
gbtr_model_preds = gbtr_model.transform(rmuf_test)
gbtr_model_preds.show(2)

In [None]:
%%time
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(gbtr_model_preds)
print 'R-squared:', evaluator.evaluate(gbtr_model_preds, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(gbtr_model_preds, {evaluator.metricName: "mae"})

##### Depth 10

In [7]:
%%time
gbtr_2  = GBTRegressor(maxDepth=10, labelCol="label", seed=42, stepSize = 0.1, maxMemoryInMB= 2048)
gbtr_model_2 = gbtr_2.fit(rmuf_train)
# gbtr_model.featureImportances

CPU times: user 1.27 s, sys: 535 ms, total: 1.81 s
Wall time: 11min 48s


In [8]:
%%time
gbtr_model_preds_2 = gbtr_model_2.transform(rmuf_test)
gbtr_model_preds_2.show(2)

+-----+--------------------+-----------------+
|label|            features|       prediction|
+-----+--------------------+-----------------+
|  1.0|[0.0,0.0,0.0,0.0,...|5.205399228048784|
|  1.0|[0.0,0.0,0.0,88.0...|4.049581997039118|
+-----+--------------------+-----------------+
only showing top 2 rows

CPU times: user 1.66 ms, sys: 2.47 ms, total: 4.13 ms
Wall time: 31.5 s


In [9]:
%%time
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(gbtr_model_preds_2)
print 'R-squared:', evaluator.evaluate(gbtr_model_preds_2, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(gbtr_model_preds_2, {evaluator.metricName: "mae"})

RMSE score: 1.60541041015
R-squared: 0.417746739554
Mean Absolute Error: 1.22559769904
CPU times: user 58 ms, sys: 9.13 ms, total: 67.1 ms
Wall time: 7min 36s


In [None]:
# Save and load model
gbtr_model_2.save(data_path + 'GBTRegD10Model_20m')
# sameModel = GBTRegressionModel.load(sc, 'hdfs:///user/andrew/GBTRegD10Model_20m')

In [None]:
sameModel = GBTRegressionModel.load(data_path + 'GBTRegD10Model_20m')

In [None]:
%%time
sameModel_preds_2 = sameModel.transform(rmuf_test)
sameModel_preds_2.show(2)

In [None]:
%%time
evaluator = RegressionEvaluator(predictionCol="prediction")
print 'RMSE score:', evaluator.evaluate(sameModel_preds_2)
print 'R-squared:', evaluator.evaluate(sameModel_preds_2, {evaluator.metricName: "r2"})
print 'Mean Absolute Error:', evaluator.evaluate(sameModel_preds_2, {evaluator.metricName: "mae"})

In [None]:
# %%time
# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# gbtr = GBTRegressor(seed = 42, maxMemoryInMB= 2048)
# grid = ParamGridBuilder() \
#         .addGrid(gbtr.maxDepth, [10, 20]) \
#         .addGrid(gbtr.subsamplingRate, [0.6, 0.7]) \
#         .addGrid(gbtr.stepSize, [0.01, 0.05, 0.1]) \
#         .build()
# evaluator = RegressionEvaluator(predictionCol="prediction", metricName="mae")
# cv = CrossValidator(estimator=gbtr, estimatorParamMaps=grid, evaluator=evaluator, seed = 42)
# cv_model = cv.fit(rmuf_sdf_train_2)
# evaluator.evaluate(cv_model.transform(rmuf_sdf_train_2))

In [None]:
# evaluator.evaluate(cv_model.transform(rmuf_sdf_test_2))

In [None]:
# params = [{p.name: v for p, v in m.items()} for m in cv_model.getEstimatorParamMaps()]
# [ps.update({cv_model.getEvaluator().getMetricName(): metric}) for ps, metric in zip(params, cv_model.avgMetrics)]
# params_df = pd.DataFrame(params)

In [None]:
# print 'Best Parameters:'
# params_df.iloc[np.where(np.argmin(params_df.metric))]

In [None]:
sc.stop()