## 잠재요인기반 추천시스템 ALS 모델 만들기

In [6]:
from pyspark.sql import SparkSession
MAX_MEMORY = '5g'
spark = SparkSession.builder.appName('241213_02_MLlib_ALS')\
        .config("spark.executeor.memory", MAX_MEMORY)\
        .config("spark.driver.memory", MAX_MEMORY)\
        .getOrCreate()

In [7]:
rating_df = spark.read.format("csv")\
    .option("header", 'true')\
    .option('inferSchema', 'true')\
    .load('data/ratings.csv')

                                                                                

In [8]:
rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [10]:
rating_df.show(5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
+------+-------+------+----------+
only showing top 5 rows



In [27]:
rating_df.count()

169209

## 전처리

- timestamp column 제거
  

In [29]:
rating_df = rating_df.select('userId','movieId','rating')

## Dataset 분할

In [14]:
train_ratio = 0.8
test_ratio = 0.2
train_df, test_df = rating_df.randomSplit([train_ratio,test_ratio], seed = 42)

In [None]:
## ALS 모델 객체 생성

In [18]:
from pyspark.ml.recommendation import ALS

#깡통 ALS 모델 생성
als = ALS(
    maxIter = 5,
    regParam = 0.1,
    userCol = "userId",
    itemCol = "movieId",
    ratingCol = "rating"
)

## 모델 학습

In [21]:
als_model = als.fit(train_df)

## 예측 확인 

In [32]:
#userId, movieId -> rating
predictions = als_model.transform(test_df)
predictions.show(5)



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   897|    471|   4.0| 2.9620304|
|    12|    471|   4.0| 3.7224145|
|   346|    471|   5.0| 3.3885448|
|   846|    471|   3.0| 2.8563602|
|   318|    471|   4.0| 3.9682584|
+------+-------+------+----------+
only showing top 5 rows



                                                                                

In [33]:
predictions.select("rating","prediction").describe().show()



+-------+------------------+----------+
|summary|            rating|prediction|
+-------+------------------+----------+
|  count|             33908|     33908|
|   mean| 3.566857378789666|       NaN|
| stddev|1.0480526319009411|       NaN|
|    min|               0.5|0.07923436|
|    max|               5.0|       NaN|
+-------+------------------+----------+



                                                                                

## ALS 모델 hyperparameter 조정
- coldStartStrategy = 'drop'

In [34]:
als.setColdStartStrategy('drop')

ALS_8df216b69810

In [35]:
##userid, movieid > rating
predictions = als_model.transform(test_df)
predictions.show(5)



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   897|    471|   4.0| 2.9620304|
|    12|    471|   4.0| 3.7224145|
|   346|    471|   5.0| 3.3885448|
|   846|    471|   3.0| 2.8563602|
|   318|    471|   4.0| 3.9682584|
+------+-------+------+----------+
only showing top 5 rows



                                                                                

In [28]:
predictions.count()

                                                                                

33908

## 평가

- RMSE 측정

In [30]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')

In [31]:
rmse = evaluator.evaluate(predictions)
rmse

                                                                                

nan

## 활용

1. 사용자id -> 추천목록
2. 영화id -> 사용자목록

In [51]:
als_model.getRecommendForAllUsers(3) #user별로 추천할 영화 3개

AttributeError: 'ALSModel' object has no attribute 'getRecommendForAllUsers'

In [None]:
als_model.recommendForAllItems(3) #item별로 추천할 유저 3명 

In [38]:
from pyspark.sql.types import IntegerType

user_list = [65,78,81]
user_df = spark.createDataFrame(user_list, IntegerType()).toDF("userID")
user_df.show()

+------+
|userID|
+------+
|    65|
|    78|
|    81|
+------+



In [39]:
user_recommend_movies = als_model.recommendForUserSubset(user_df, 3)
user_recommend_movies.show()

                                                                                

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    65|[{127019, 6.08112...|
|    78|[{82591, 6.123962...|
|    81|[{3896, 4.564083}...|
+------+--------------------+



In [42]:
#movie list load 

movie_file = 'data/movies.csv'
movies_df= spark.read.csv(movie_file, inferSchema = True, header = True)

In [43]:
movies_df.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



In [49]:
#65번 사용자의 추천영화 목록
movies_list = user_recommend_movies.collect()[0].recommendations
movies_list

                                                                                

[Row(movieId=127019, rating=6.081121921539307),
 Row(movieId=198185, rating=5.965970039367676),
 Row(movieId=5792, rating=5.677191734313965)]

In [50]:
rec_df = spark.createDataFrame(movies_list)
rec_df.show()

+-------+-----------------+
|movieId|           rating|
+-------+-----------------+
| 127019|6.081121921539307|
| 198185|5.965970039367676|
|   5792|5.677191734313965|
+-------+-----------------+



In [53]:
#영화정보와 추천영화목록 조인 

rec_df.createOrReplaceTempView('recommend')
movies_df.createOrReplaceTempView('movies')

In [56]:
query = '''
select *
from movies join recommend on movies.movieId = recommend.movieId
order by rating desc
'''
spark.sql(query).show()

+-------+--------------------+-------------+-------+-----------------+
|movieId|               title|       genres|movieId|           rating|
+-------+--------------------+-------------+-------+-----------------+
| 127019|Line of Sight (2012)|  Documentary| 127019|6.081121921539307|
| 198185|   Twin Peaks (1989)|Drama|Mystery| 198185|5.965970039367676|
|   5792| Roger Dodger (2002)| Comedy|Drama|   5792|5.677191734313965|
+-------+--------------------+-------------+-------+-----------------+



In [4]:
spark.stop()