In [1]:
# [+] Create (or reuse) the SparkSession used throughout this notebook

from pyspark.sql import SparkSession
ss = (
    SparkSession.builder
    .appName('movie-recommendation')
    .getOrCreate()
)

In [2]:
# [+] Load the MovieLens ratings data
# ratings_short.csv: a reduced version holding only about 70k ratings from the original data
ratings_df = ss.read.csv(
    './data/ratings_short.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the first rows of the ratings data
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [4]:
# Print the column names and the types inferred while reading the CSV
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [5]:
# [+] Keep every column except the timestamp
ratings_df = ratings_df.select('userId', 'movieId', 'rating')

In [6]:
# [+] describe(): print basic summary statistics for the rating column
ratings_df.select('rating').describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|             71921|
|   mean|3.5821387355570695|
| stddev| 1.042406032579843|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [7]:
# [+] randomSplit(): split the ratings into a training set (80%) and a test set (20%)
# A fixed seed keeps the split stable across re-runs, so the RMSE and the
# recommendations shown further down can be compared from one run to the next.

train_df, test_df = ratings_df.randomSplit([0.8, 0.2], seed=42)

In [8]:
# [+] Import the recommendation algorithm (Alternating Least Squares)
# pyspark.ml.recommendation.ALS
from pyspark.ml.recommendation import ALS

In [9]:
# Configure the recommendation algorithm (ALS)
# - userCol / itemCol / ratingCol name the columns of the ratings DataFrame.
# - coldStartStrategy='drop' removes prediction rows that come out as NaN
#   (users or items not seen during training), so evaluation metrics stay finite.
# - seed fixes the random initialisation of the factors so that training is
#   repeatable across runs.

als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42
)

In [10]:
# [+] Train the model on the training dataset
model = als.fit(train_df)

In [11]:
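# # If an out-of-memory error occurs, run the code below instead of the first cell.
# # Restart the kernel first: getOrCreate() hands back an already-running session,
# # and the driver memory cannot be changed once its JVM has started.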
# from pyspark.sql import SparkSession

# MAX_MEMORY = '5g'
# ss = SparkSession.builder.appName('movie-recommendation')\
#     .config('spark.executor.memory', MAX_MEMORY)\
#     .config('spark.driver.memory', MAX_MEMORY)\
#     .getOrCreate()

In [12]:
# [+] Predict ratings for the test dataset (adds a 'prediction' column)
predictions = model.transform(test_df)

In [13]:
# Preview actual ratings next to the predicted ratings
predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   243|   1580|   3.0| 3.0759199|
|    31|   8638|   2.0| 2.4276896|
|   368|   1580|   3.5|  3.732476|
|    76|   1342|   3.5| 3.1450458|
|    76|   1959|   5.0| 3.4782035|
|   501|   1580|   5.0| 3.6485245|
|    12|    471|   4.0|   3.86801|
|   548|   1591|   2.5| 2.7636259|
|   548|   7982|   5.0| 3.5855055|
|    91|   8638|   3.0| 3.2730346|
|    91|  96488|   2.0| 3.2402313|
|   285|   1088|   4.0| 2.5594413|
|   285|   1959|   4.0| 2.1162264|
|   233|   1580|   5.0| 3.6286223|
|   416|   1645|   4.0| 2.7413275|
|   132|   1959|   4.0| 2.6624627|
|   355|  68135|   3.0| 3.1128476|
|   325|   1580|   3.0| 3.2281606|
|     1|   1088|   4.0| 3.0745966|
|   442|    471|   5.0|  3.360355|
+------+-------+------+----------+
only showing top 20 rows



In [14]:
# [+] Print summary statistics for the actual ratings and the predicted ratings
predictions.select('rating', 'prediction').describe().show()

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|             13665|             13665|
|   mean| 3.607610684229784| 3.429338112438593|
| stddev|1.0369176538122387|0.7555889584286211|
|    min|               0.5|       -0.11027547|
|    max|               5.0|         5.7190866|
+-------+------------------+------------------+



In [15]:
# Model evaluation metric: RMSE (Root Mean Squared Error)
from pyspark.ml.evaluation import RegressionEvaluator

# Compares the 'prediction' column against the 'rating' label column.
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='rating', metricName='rmse')

In [16]:
# [+] Measure the RMSE of the test-set predictions
rmse = evaluator.evaluate(predictions)

In [17]:
# Display the RMSE computed on the test-set predictions
rmse

0.9080325694594664

In [18]:
# [+] Use the trained model to recommend 3 items for every user
model.recommendForAllUsers(3).show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{3881, 5.20797},...|
|     2|[{3676, 5.4592395...|
|     3|[{8582, 4.878655}...|
|     4|[{1564, 5.048351}...|
|     5|[{8582, 5.110438}...|
|     6|[{127098, 5.53012...|
|     7|[{8582, 4.6796584...|
|     8|[{26003, 5.070140...|
|     9|[{193065, 5.90253...|
|    10|[{6530, 4.976697}...|
|    11|[{4546, 5.389875}...|
|    12|[{4903, 4.755283}...|
|    13|[{8582, 5.0066895...|
|    14|[{8235, 5.482001}...|
|    15|[{3881, 5.9195447...|
|    16|[{3881, 5.6412206...|
|    17|[{105504, 4.87697...|
|    18|[{5108, 5.1238074...|
|    19|[{141432, 4.89997...|
|    20|[{6286, 5.752571}...|
+------+--------------------+
only showing top 20 rows



In [19]:
# [+] Use the trained model to recommend 3 users for every item
model.recommendForAllItems(3).show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     12|[{198, 4.8726783}...|
|     26|[{252, 4.407937},...|
|     27|[{240, 4.305439},...|
|     28|[{235, 5.4227257}...|
|     31|[{198, 4.7027736}...|
|     34|[{87, 5.1358156},...|
|     44|[{240, 4.2506495}...|
|     65|[{153, 3.863797},...|
|     76|[{240, 4.403467},...|
|     78|[{87, 4.0492043},...|
|     81|[{274, 4.785816},...|
|     85|[{87, 4.681914}, ...|
|    101|[{87, 5.428907}, ...|
|    103|[{22, 4.4386964},...|
|    115|[{174, 4.032761},...|
|    155|[{22, 5.434549}, ...|
|    159|[{127, 4.238339},...|
|    183|[{240, 4.4472146}...|
|    193|[{199, 4.5595484}...|
|    210|[{240, 4.552015},...|
+-------+--------------------+
only showing top 20 rows



In [20]:
# Pick the specific user(s) to generate recommendations for
user_lst = [1]

In [21]:
from pyspark.sql.types import IntegerType

In [22]:
# Build a one-column DataFrame holding the selected user ids.
# The column is named 'userId', exactly like the userCol the ALS model was
# configured with, so the lookup does not depend on Spark's default
# case-insensitive column resolution.
users_df = ss.createDataFrame(user_lst, IntegerType()).toDF('userId')

In [23]:
# Show the DataFrame of selected users
users_df.show()

+------+
|userID|
+------+
|     1|
+------+



In [24]:
# recommendForUserSubset(): recommend items (here 5 each) for a specific group of users
user_recs = model.recommendForUserSubset(users_df, 5)

In [25]:
# Show the recommendations for the selected users (one row per user)
user_recs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{3881, 5.20797},...|
+------+--------------------+



In [26]:
# Bring the recommendation result back as Python objects:
# take the first (only) user's row and read its list of recommendation Rows
movies_lst = user_recs.collect()[0].recommendations

In [27]:
# Display the recommended (movieId, rating) Rows
movies_lst

[Row(movieId=3881, rating=5.207970142364502),
 Row(movieId=3470, rating=5.039545059204102),
 Row(movieId=7156, rating=5.002124786376953),
 Row(movieId=8327, rating=4.9655070304870605),
 Row(movieId=5767, rating=4.906437873840332)]

In [28]:
# Create a DataFrame from movies_lst (columns: movieId, rating)
recs_df = ss.createDataFrame(movies_lst)
recs_df.show()

+-------+------------------+
|movieId|            rating|
+-------+------------------+
|   3881| 5.207970142364502|
|   3470| 5.039545059204102|
|   7156| 5.002124786376953|
|   8327|4.9655070304870605|
|   5767| 4.906437873840332|
+-------+------------------+



In [29]:
# [+] Create a DataFrame for the movie metadata (movieId, title, genres)
movies_df = ss.read.csv('./data/movies_short.csv', inferSchema=True, header=True)
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [30]:
# [+] Register recs_df and movies_df as temporary views so they can be queried with SQL
for view_name, frame in (('recommendations', recs_df), ('movies', movies_df)):
    frame.createOrReplaceTempView(view_name)

In [31]:
# [+] Fetch the titles of the recommended movies with a SQL JOIN
# This is an inner join: recommended movies that have no matching row in
# `movies` are left out of the result.
ss.sql("SELECT * FROM movies \
        JOIN recommendations ON movies.movieID = recommendations.movieID \
        ORDER BY rating DESC").toPandas()

Unnamed: 0,movieId,title,genres,movieId.1,rating
0,8327,Dolls (2002),Drama|Romance,8327,4.965507


In [32]:
# Define the per-user movie recommendation service as a function:
#   1. write the query
#   2. write the recommendation function

# Joins the 'movies' temp view against the 'recommendations' temp view and
# sorts by the recommendation rating, best first. Both views must be
# registered before the query is run.
query = """
SELECT *
FROM movies
JOIN recommendations ON movies.movieID = recommendations.movieID
ORDER BY rating DESC
"""

In [33]:
def get_recommendations(user_id, num_recs):
    """Return the movies recommended for one user, joined with the movie details.

    Side effect: (re)registers the temp view 'recommendations' with this
    user's recommendations, because the notebook-level `query` joins the
    'movies' view against that view. The 'movies' temp view must already be
    registered.

    Args:
        user_id: Integer id of the user to recommend for.
        num_recs: Maximum number of movies to recommend.

    Returns:
        The Spark DataFrame produced by running `query`. Because the join is
        an inner join, recommended movies missing from 'movies' are not
        included.

    Raises:
        IndexError: If the model returns no recommendations for `user_id`.
    """
    # Name the column exactly like the model's userCol ('userId').
    users_df = ss.createDataFrame([user_id], IntegerType()).toDF('userId')
    users_recs_df = model.recommendForUserSubset(users_df, num_recs)

    rec_rows = users_recs_df.collect()
    if not rec_rows:
        # Same exception type as indexing the empty list, with a usable message.
        raise IndexError(f'no recommendations available for user_id={user_id}')

    recs_df = ss.createDataFrame(rec_rows[0].recommendations)
    # Without this registration the query would read whichever
    # 'recommendations' view was registered earlier in the session and return
    # stale results regardless of user_id / num_recs.
    recs_df.createOrReplaceTempView('recommendations')
    return ss.sql(query)

In [34]:
# Movie recommendations (up to 5) for user 1
recs = get_recommendations(1, 5)



In [35]:
# toPandas(): display the result as a pandas DataFrame
recs.toPandas()

Unnamed: 0,movieId,title,genres,movieId.1,rating
0,8327,Dolls (2002),Drama|Romance,8327,4.965507
