## Задача 3. Факторизация матрицы (3 балла)

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

SMALL_DATA_PATH = 'file:///home/cloudera/Desktop/ml_data/ml_latest_small/ratings.csv'
BIG_DATA_PATH = 'file:///home/cloudera/Desktop/ml_data/ml_latest_big/ratings.csv'

In [2]:
def get_train_and_test_data(data_path, parts, seed=123):
    print(f'Use data path: "{data_path}"')
    
    ratings_df = sqlContext.read.load(
        data_path,
        format='com.databricks.spark.csv',
        header='true',
        inferSchema='true',
        sep=','
    )
    ratings_df.persist()
    ratings_df.show(5)

    df_train, df_test = ratings_df.randomSplit(parts, seed=seed)
    df_train.persist(), df_test.persist()
    print(f'Train data count: {df_train.count()}. Test data count: {df_test.count()}\n\n')
    
    return df_train, df_test

**Разделите данные с рейтингами на обучающее (train - 0.8) и тестовое подмножества (test - 0.2).**

In [3]:
parts = (0.8, 0.2)

small_df_train, small_df_test = get_train_and_test_data(SMALL_DATA_PATH, parts)
big_df_train, big_df_test = get_train_and_test_data(BIG_DATA_PATH, parts)

Use data path: "file:///home/cloudera/Desktop/ml_data/ml_latest_small/ratings.csv"
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows

Train data count: 80838. Test data count: 19998


Use data path: "file:///home/cloudera/Desktop/ml_data/ml_latest_big/ratings.csv"
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
+------+-------+------+----------+
only showing top 5 rows

Train data count: 22200602. Test data count: 5552842




**Oпределите среднее значение рейтинга в обучающем подмножестве.**

In [4]:
from pyspark.sql import functions as F


def get_mean_movie_rating(df_train):
    return df_train.select(F.mean('rating').alias('avg')).collect()[0]['avg']


small_mean_movie_rating = get_mean_movie_rating(small_df_train)
print(f'Mean rating in small data: {small_mean_movie_rating}\n')

big_mean_movie_rating = get_mean_movie_rating(big_df_train)
print(f'Mean rating in big data: {big_mean_movie_rating}')

Mean rating in small data: 3.4985155496177542

Mean rating in big data: 3.530478362703858


**Bычислите RMSE для тестового подмножества, если для всех значений из test предсказывается среднее значение рейтинга.**

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator

rmse_evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')


def fill_prediction_column(df_test, mean_movie_rating):
    df_test_with_prediction = df_test.withColumn('prediction', F.lit(mean_movie_rating))
    df_test_with_prediction.show(5)
    return df_test_with_prediction


small_df_test_with_prediction = fill_prediction_column(small_df_test, small_mean_movie_rating)
print(f'RMSE for small data: {rmse_evaluator.evaluate(small_df_test_with_prediction)}\n')

big_df_test_with_prediction = fill_prediction_column(big_df_test, big_mean_movie_rating)
print(f'RMSE for big data: {rmse_evaluator.evaluate(big_df_test_with_prediction)}')

+------+-------+------+---------+------------------+
|userId|movieId|rating|timestamp|        prediction|
+------+-------+------+---------+------------------+
|     1|      3|   4.0|964981247|3.4985155496177542|
|     1|      6|   4.0|964982224|3.4985155496177542|
|     1|     47|   5.0|964983815|3.4985155496177542|
|     1|    151|   5.0|964984041|3.4985155496177542|
|     1|    163|   5.0|964983650|3.4985155496177542|
+------+-------+------+---------+------------------+
only showing top 5 rows

RMSE for small data: 1.0425819010275184

+------+-------+------+----------+-----------------+
|userId|movieId|rating| timestamp|       prediction|
+------+-------+------+----------+-----------------+
|     1|    481|   3.5|1256677456|3.530478362703858|
|     1|   1091|   1.5|1256677471|3.530478362703858|
|     1|   1257|   4.5|1256677460|3.530478362703858|
|     1|   2478|   4.0|1256677239|3.530478362703858|
|     1|   2986|   2.5|1256677496|3.530478362703858|
+------+-------+------+----------

**Выберите модель ALS по минимальному значению RMSE. Для этого используйте kfolds c k=4.**

**Если какие-то элементы из тестового/валидационного подмножества не встречались в обучающем, то RMSE будет NaN.**

**Поэтому заранее уберите из тестового/валидационного подмножества такие элементы.**

In [10]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Параметры
ranks = (5, 10, 15)  # Kоличество факторов
regParams = (0.001, 0.01, 0.1, 1, 10)  # Pегуляризация
kfolds = 4

als = ALS(
    seed=123,
    maxIter=10,
    numUserBlocks=10,
    numItemBlocks=10,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
)
paramsGrid = (
    ParamGridBuilder()
        .addGrid(als.rank, ranks)
        .addGrid(als.regParam, regParams)
        .build()
)
cross_validator = CrossValidator(estimator=als,
                                 estimatorParamMaps=paramsGrid,
                                 evaluator=rmse_evaluator,
                                 numFolds=kfolds)

def calculate_rmse_for_best_als_model(df_train, df_test):
    cv_model = cross_validator.fit(df_train)
    print(f'Best rank: {cv_model.bestModel.rank}')
    # FIXME(a.telyshev): py4j.Py4JException: Method getRegParam([]) does not exist
    # print(f'Best regularization: {cv_model.bestModel._java_obj.getRegParam()}')
    
    predictions = cv_model.transform(df_test)
    predictions.show(5)
    
    return rmse_evaluator.evaluate(predictions)

In [11]:
print('Small data processing...')
print(f'RMSE: {calculate_rmse_for_best_als_model(small_df_train, small_df_test)}\n')

Small data processing...
Best rank: 10
+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   133|    471|   4.0| 843491793| 2.9822454|
|   599|    471|   2.5|1498518822| 2.6454818|
|   603|    471|   4.0| 954482443| 2.7023568|
|   217|    471|   2.0| 955943727| 3.4777524|
|   136|    471|   4.0| 832450058| 3.9017246|
+------+-------+------+----------+----------+
only showing top 5 rows

RMSE: 0.8838615498556199



In [None]:
print('Big data processing...')
print(f'RMSE: {calculate_rmse_for_best_als_model(big_df_train, big_df_test)}')

Big data processing...
