In [3]:
from pyspark.sql import SparkSession

MAX_MEMORY = '5g'
spark = SparkSession.builder.appName("241212_01_MLlib_ALS")\
            .config("spark.executer.memory",MAX_MEMORY)\
            .config("spark.driver.memory", MAX_MEMORY)\
            .getOrCreate()

## DATA LOAD

In [5]:
rating_df = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load('data/ratings.csv')

                                                                                

In [6]:
rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



## 전처리

In [8]:
rating_df = rating_df.select(["userId",'movieId','rating'])

## 데이터 세트 분할

In [10]:
train_ratio = 0.8
test_ratio = 0.2
train_df, test_df = rating_df.randomSplit([train_ratio, test_ratio], seed=42)

## ALS 모델 객체 생성

In [18]:
from pyspark.ml.recommendation import ALS

als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy='drop'
)

## 모델 학습

In [19]:
als_model = als.fit(train_df)

                                                                                

## 예측 확인

In [20]:
#userId, movieId > rating
predictions = als_model.transform(test_df)
predictions.show(5)



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 26480|    148|   2.0| 1.9439574|
|151614|    148|   1.0| 2.7431073|
| 28229|    148|   1.0|  2.403362|
|  6491|    148|   4.0| 2.4508655|
| 14831|    148|   3.0|  2.687292|
+------+-------+------+----------+
only showing top 5 rows



                                                                                

In [21]:
predictions.select("rating", "prediction").describe().show()



+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4998109|           4998109|
|   mean|3.5341648211353536|3.4117775723363426|
| stddev|1.0609230261741123|0.6413064384231564|
|    min|               0.5|        -4.1310825|
|    max|               5.0|          6.545404|
+-------+------------------+------------------+



                                                                                

## ALS 모델의 하이퍼파라미터 조정

In [16]:
#데이터가 적을 때 사용하는 전략, 데이터 없을 때 대체하는 방식
als.setColdStartStrategy('drop')

ALS_6f02f10c59d3

In [None]:
predictions = als_model.transform(test_df)
predictions.show(5)

## 평가

RMSE 측정

In [23]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',predictionCol='prediction')


In [25]:
rmse = evaluator.evaluate(predictions)
rmse

                                                                                

0.810656549900932

##  활용

1. 사용자id > 추천목록
2. 영화id > 사용자목록

In [None]:
#als_model.getRecommendForAllusers(3) #user별로 추천할 영화 3개 고르기

In [None]:
#als_model.recommandForAllItems(3) #item별로 추천할 유저 3명 고르기

In [28]:
from pyspark.sql.types import IntegerType

user_list = [4,11,26,66,]
user_df = spark.createDataFrame(user_list, IntegerType()).toDF("userid")
user_df.show()

+------+
|userid|
+------+
|     4|
|    11|
|    26|
|    66|
+------+



                                                                                

In [35]:
user_recommend_movies = als_model.recommendForUserSubset(user_df, 3)
user_recommend_movies.show()

                                                                                

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{194434, 5.60306...|
|     4|[{194434, 5.93477...|
|    11|[{203086, 5.50464...|
|    66|[{177209, 6.19961...|
+------+--------------------+



In [31]:
# movie-list load

In [36]:
movie_file = 'data/movies.csv'
movies_df = spark.read.csv(movie_file, inferSchema=True, header=True)

In [37]:
movies_df.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



In [38]:
# 66번 사용자 추천영화 목록
movies_list = user_recommend_movies.collect()[3].recommendations
movies_list

                                                                                

[Row(movieId=177209, rating=6.199610233306885),
 Row(movieId=203086, rating=6.101006031036377),
 Row(movieId=127252, rating=6.039139747619629)]

In [40]:
rec_df = spark.createDataFrame(movies_list)
rec_df.show()

+-------+-----------------+
|movieId|           rating|
+-------+-----------------+
| 177209|6.199610233306885|
| 203086|6.101006031036377|
| 127252|6.039139747619629|
+-------+-----------------+



In [41]:
# 영화정보와 추천영화 목록 조인

In [42]:
rec_df.createOrReplaceTempView('recommend')
movies_df.createOrReplaceTempView('movies')

In [44]:
query = '''
SELECT *
FROM movies join recommend on movies.movieId = recommend.movieId
ORDER BY rating DESC
'''
recommend_movies = spark.sql(query)
recommend_movies.show()

+-------+--------------------+--------------------+-------+-----------------+
|movieId|               title|              genres|movieId|           rating|
+-------+--------------------+--------------------+-------+-----------------+
| 177209|      Acı Aşk (2009)|               Drama| 177209|6.199610233306885|
| 203086|Truth and Justice...|               Drama| 203086|6.101006031036377|
| 127252|The Veil of Twili...|Crime|Fantasy|Mys...| 127252|6.039139747619629|
+-------+--------------------+--------------------+-------+-----------------+



In [45]:
spark.stop()