# Configurando bibliotecas e dependencias

In [1]:
!pip install pyspark



In [2]:
!pip install findspark



In [None]:
# instalar as dependências
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz

In [None]:
# configurar as variáveis de ambiente
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# tornar o pyspark "importável"
import findspark
findspark.init('spark-2.4.4-bin-hadoop2.7')

In [8]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator #evaluation é a biblioteca para verificação da qualidade do modelo
from pyspark.ml.recommendation import ALS # ALS é o modelo de recomendação que será utilizadp
from pyspark.sql import Row #row é o formato que o ALS trabalha, row conterá o id do usuario, id filme, nota e timestamp

In [9]:
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [10]:
lines = spark.read.text("sample_movielens_ratings.txt").rdd

In [11]:
parts = lines.map(lambda row: row.value.split("::"))

In [15]:
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), \
                                     movieId=int(p[1]), \
                                     rating=float(p[2]), \
                                     timestamp=int(p[3])))

In [16]:
ratings = spark.createDataFrame(ratingsRDD)

In [19]:
lines.collect()

[Row(value='0::2::3::1424380312'),
 Row(value='0::3::1::1424380312'),
 Row(value='0::5::2::1424380312'),
 Row(value='0::9::4::1424380312'),
 Row(value='0::11::1::1424380312'),
 Row(value='0::12::2::1424380312'),
 Row(value='0::15::1::1424380312'),
 Row(value='0::17::1::1424380312'),
 Row(value='0::19::1::1424380312'),
 Row(value='0::21::1::1424380312'),
 Row(value='0::23::1::1424380312'),
 Row(value='0::26::3::1424380312'),
 Row(value='0::27::1::1424380312'),
 Row(value='0::28::1::1424380312'),
 Row(value='0::29::1::1424380312'),
 Row(value='0::30::1::1424380312'),
 Row(value='0::31::1::1424380312'),
 Row(value='0::34::1::1424380312'),
 Row(value='0::37::1::1424380312'),
 Row(value='0::41::2::1424380312'),
 Row(value='0::44::1::1424380312'),
 Row(value='0::45::2::1424380312'),
 Row(value='0::46::1::1424380312'),
 Row(value='0::47::1::1424380312'),
 Row(value='0::48::1::1424380312'),
 Row(value='0::50::1::1424380312'),
 Row(value='0::51::1::1424380312'),
 Row(value='0::54::1::1424380312

In [17]:
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|   3.0|1424380312|
|     0|      3|   1.0|1424380312|
|     0|      5|   2.0|1424380312|
|     0|      9|   4.0|1424380312|
|     0|     11|   1.0|1424380312|
|     0|     12|   2.0|1424380312|
|     0|     15|   1.0|1424380312|
|     0|     17|   1.0|1424380312|
|     0|     19|   1.0|1424380312|
|     0|     21|   1.0|1424380312|
|     0|     23|   1.0|1424380312|
|     0|     26|   3.0|1424380312|
|     0|     27|   1.0|1424380312|
|     0|     28|   1.0|1424380312|
|     0|     29|   1.0|1424380312|
|     0|     30|   1.0|1424380312|
|     0|     31|   1.0|1424380312|
|     0|     34|   1.0|1424380312|
|     0|     37|   1.0|1424380312|
|     0|     41|   2.0|1424380312|
+------+-------+------+----------+
only showing top 20 rows



In [20]:
(training, test) = ratings.randomSplit([0.8, 0.2]) #divide o df em porções para treinamento e teste

In [21]:
als = ALS(maxIter=5, \
          regParam=0.01, \
          userCol="userId", \
          itemCol="movieId", \
          ratingCol="rating", \
          coldStartStrategy="drop")

In [22]:
model = als.fit(training) #treina o modelo com o dataset de treinamento

In [23]:
predictions = model.transform(test) #aplica o modelo no conjunto de teste para fazer predições
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                               predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Erro médio quadrático = " + str(rmse))

Erro médio quadrático = 1.6240737641633205


In [24]:
userRec = model.recommendForAllUsers(10)

In [25]:
userRec.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{17, 4.5367002},...|
|    10|[{74, 4.766108}, ...|
|     0|[{46, 4.9702287},...|
|     1|[{62, 3.8224728},...|
|    21|[{53, 4.744372}, ...|
|    11|[{7, 7.4007297}, ...|
|    12|[{47, 6.8356557},...|
|    22|[{90, 5.7079144},...|
|     2|[{29, 5.2440696},...|
|    13|[{58, 4.086324}, ...|
|     3|[{32, 6.3329787},...|
|    23|[{7, 5.300976}, {...|
|     4|[{53, 5.718257}, ...|
|    24|[{69, 5.046017}, ...|
|    14|[{29, 5.1617775},...|
|     5|[{55, 4.7821045},...|
|    15|[{46, 5.0599613},...|
|    25|[{47, 3.547579}, ...|
|    26|[{23, 5.101212}, ...|
|     6|[{47, 4.928897}, ...|
+------+--------------------+
only showing top 20 rows



In [26]:
movieRecs = model.recommendForAllItems(10) #faz a transposta da matriz de ratings, a fim de recomendar usuários em potencial para itens específicos

In [27]:
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 4.7529573},...|
|     40|[{8, 4.223166}, {...|
|     10|[{3, 4.0368366}, ...|
|     50|[{11, 4.3936524},...|
|     80|[{23, 3.9274583},...|
|     70|[{14, 3.4263542},...|
|     60|[{3, 3.0335703}, ...|
|     90|[{22, 5.7079144},...|
|     30|[{3, 5.302606}, {...|
|      0|[{9, 3.3708017}, ...|
|     31|[{14, 3.1936545},...|
|     81|[{28, 4.7628627},...|
|     91|[{29, 4.1838617},...|
|      1|[{15, 3.7412543},...|
|     41|[{28, 4.432292}, ...|
|     61|[{16, 2.8496714},...|
|     51|[{19, 5.3332987},...|
|     21|[{26, 3.0027144},...|
|     11|[{18, 4.031199}, ...|
|     71|[{12, 3.4178684},...|
+-------+--------------------+
only showing top 20 rows



In [28]:
users = ratings.select(als.getUserCol()).distinct() #selecina os usuários que existem nesse universo

In [29]:
users.show()

+------+
|userId|
+------+
|    26|
|    29|
|    19|
|     0|
|    22|
|     7|
|    25|
|     6|
|     9|
|    27|
|    17|
|    28|
|     5|
|     1|
|    10|
|     3|
|    12|
|     8|
|    11|
|     2|
+------+
only showing top 20 rows



In [30]:
UserRecsOnlyItemId = userRec.select(userRec['userId'], \
                                    userRec['recommendations']['movieid'])

In [31]:
UserRecsOnlyItemId.show(10, False) #mostra somente as recomendações por usuário

+------+----------------------------------------+
|userId|recommendations.movieid                 |
+------+----------------------------------------+
|20    |[17, 22, 94, 20, 75, 62, 46, 36, 88, 13]|
|10    |[74, 87, 2, 40, 62, 95, 49, 85, 70, 4]  |
|0     |[46, 92, 12, 9, 19, 26, 91, 85, 32, 43] |
|1     |[62, 53, 75, 46, 85, 28, 22, 17, 94, 20]|
|21    |[53, 74, 2, 41, 62, 29, 58, 76, 85, 43] |
|11    |[7, 32, 30, 48, 18, 27, 23, 55, 79, 49] |
|12    |[47, 18, 17, 35, 64, 58, 27, 13, 55, 73]|
|22    |[90, 30, 75, 74, 88, 51, 62, 32, 69, 27]|
|2     |[29, 8, 39, 93, 51, 83, 37, 92, 63, 66] |
|13    |[58, 76, 93, 29, 53, 47, 41, 74, 43, 18]|
+------+----------------------------------------+
only showing top 10 rows

