user_id: 사용자의 고유 ID.
movie_id: 영화의 고유 ID.
rating: 사용자가 영화에 준 평점.
timestamp: 평점이 기록된 시간.
# 별점(1, 2, 3, …)별로 평점 개수 카운트

### 0. 환경 설정 

In [21]:
from pyspark import SparkConf, SparkContext

# Configure Spark (local master, descriptive app name) and obtain the context.
# SparkContext.getOrCreate() reuses an already-running context when there is
# one, so re-running this cell does not fail; constructing SparkContext(...)
# directly raises an error if a context is already active.
conf = SparkConf().setMaster("local").setAppName("241204_02_MovieLens")
spark = SparkContext.getOrCreate(conf=conf)

### 1. 데이터 불러오기 

In [22]:
import os 
# Build the path to the ratings file, expected in the "data" folder of the
# current working directory.
filename = "u.data"
directory = os.path.join(os.getcwd(), "data")
filepath = os.path.join(directory, filename)

In [27]:
# Create the RDD from the local ratings file.
# The path is turned into a file:/// URI: OS-specific separators (e.g. Windows
# backslashes) become forward slashes, and any leading slash is dropped so the
# result is always "file:///<absolute path>" with exactly three slashes.
data = spark.textFile("file:///" + filepath.replace(os.sep, "/").lstrip("/"))

In [28]:
# Preview the first five raw lines of the RDD.
data.take(5)

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596']

### 2. 데이터 전처리

In [52]:
def Parse(row):
    """Split one tab-delimited ratings line into its four leading columns.

    Args:
        row: A single line of text whose columns are separated by tabs.

    Returns:
        A 4-tuple ``(user_id, movie_id, rating, timestamp)`` holding the
        first four columns, each left as an unconverted string.

    Raises:
        IndexError: If the line has fewer than four tab-separated columns.
    """
    columns = row.split('\t')
    return columns[0], columns[1], columns[2], columns[3]

In [33]:
# Apply the parsing function to every line through map().
total = data.map(Parse)
# Preview the first six parsed records.
total.take(6)

[('196', '242', '3', '881250949'),
 ('186', '302', '3', '891717742'),
 ('22', '377', '1', '878887116'),
 ('244', '51', '2', '880606923'),
 ('166', '346', '1', '886397596'),
 ('298', '474', '4', '884182806')]

### 3. user_id 별 영화 별점 파악

In [39]:
# Re-key each parsed record by user: (user_id, (movie_id, rating)).
user_movie_ratings = total.map(
    lambda record: (record[0], (record[1], record[2]))
)
user_movie_ratings.take(5)

[('196', ('242', '3')),
 ('186', ('302', '3')),
 ('22', ('377', '1')),
 ('244', ('51', '2')),
 ('166', ('346', '1'))]

In [78]:
# Group with groupByKey():
# collect each user's (movie_id, rating) pairs under that user's user_id.
grouped_by_user = user_movie_ratings.groupByKey()
# Preview the first three groups.
grouped_by_user.take(3)

[('196', <pyspark.resultiterable.ResultIterable at 0x7ff2c9262280>),
 ('186', <pyspark.resultiterable.ResultIterable at 0x7ff2c92622e0>),
 ('22', <pyspark.resultiterable.ResultIterable at 0x7ff2c9262340>)]

In [80]:
# Turn each group's values into a plain list so the (movie_id, rating) pairs
# are displayed instead of an opaque ResultIterable object.
grouped_by_user = grouped_by_user.mapValues(list)
grouped_by_user.take(5)

[('196',
  [('242', '3'),
   ('393', '4'),
   ('381', '4'),
   ('251', '3'),
   ('655', '5'),
   ('67', '5'),
   ('306', '4'),
   ('238', '4'),
   ('663', '5'),
   ('111', '4'),
   ('580', '2'),
   ('25', '4'),
   ('286', '5'),
   ('94', '3'),
   ('692', '5'),
   ('8', '5'),
   ('428', '4'),
   ('1118', '4'),
   ('70', '3'),
   ('66', '3'),
   ('257', '2'),
   ('108', '4'),
   ('202', '3'),
   ('340', '3'),
   ('287', '3'),
   ('116', '3'),
   ('382', '4'),
   ('285', '5'),
   ('1241', '3'),
   ('1007', '4'),
   ('411', '4'),
   ('153', '5'),
   ('13', '2'),
   ('762', '3'),
   ('173', '2'),
   ('1022', '4'),
   ('845', '4'),
   ('269', '3'),
   ('110', '1')]),
 ('186',
  [('302', '3'),
   ('566', '5'),
   ('250', '1'),
   ('148', '4'),
   ('263', '3'),
   ('470', '5'),
   ('983', '3'),
   ('281', '4'),
   ('385', '4'),
   ('588', '4'),
   ('406', '1'),
   ('925', '5'),
   ('977', '3'),
   ('322', '5'),
   ('53', '1'),
   ('333', '3'),
   ('591', '4'),
   ('742', '3'),
   ('770', '2'),

### 4. 영화별 평균 평점 

In [108]:
# Map every record to (movie_id, (rating, 1)): the rating is converted to a
# float and paired with a count of 1 so sums and counts can be reduced together.
movie_ratings = total.map(
    lambda record: (record[1], (float(record[2]), 1))
)

In [109]:
# Combine the values of identical keys element-wise, yielding
# (movie_id, (sum of ratings, number of ratings)).
sum_ratings = movie_ratings.reduceByKey(
    lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1])
)
sum_ratings.take(3)

[('242', (467.0, 117)), ('302', (1236.0, 297)), ('377', (28.0, 13))]

In [116]:
# Divide each movie's rating sum by its rating count, rounded to two decimals.
average_ratings = sum_ratings.mapValues(
    lambda sum_and_count: round(sum_and_count[0] / sum_and_count[1], 2)
)
average_ratings.collect()

[('242', 3.99),
 ('302', 4.16),
 ('377', 2.15),
 ('51', 3.46),
 ('346', 3.64),
 ('474', 4.25),
 ('265', 3.86),
 ('465', 3.56),
 ('451', 3.35),
 ('86', 3.94),
 ('257', 3.75),
 ('1014', 3.06),
 ('222', 3.66),
 ('40', 2.89),
 ('29', 2.67),
 ('785', 3.15),
 ('387', 3.38),
 ('274', 3.5),
 ('1042', 3.14),
 ('1184', 2.5),
 ('392', 3.54),
 ('486', 3.8),
 ('144', 3.87),
 ('118', 3.22),
 ('1', 3.88),
 ('546', 3.03),
 ('95', 3.81),
 ('768', 3.08),
 ('277', 3.46),
 ('234', 3.77),
 ('246', 3.94),
 ('98', 4.29),
 ('193', 3.92),
 ('88', 3.54),
 ('194', 4.06),
 ('1081', 2.75),
 ('603', 4.39),
 ('796', 3.08),
 ('32', 3.79),
 ('16', 3.21),
 ('304', 3.54),
 ('979', 3.2),
 ('564', 2.04),
 ('327', 3.38),
 ('201', 3.52),
 ('1137', 3.97),
 ('241', 3.55),
 ('4', 3.55),
 ('332', 3.46),
 ('100', 4.16),
 ('432', 3.77),
 ('322', 3.09),
 ('181', 4.01),
 ('196', 3.92),
 ('679', 3.05),
 ('384', 2.78),
 ('143', 3.77),
 ('423', 3.83),
 ('515', 4.2),
 ('20', 3.42),
 ('288', 3.44),
 ('219', 3.17),
 ('526', 3.83),
 ('919

In [20]:
# Shut down the SparkContext once the analysis is finished.
spark.stop()