In [1]:
# Install PySpark here
!pip install --force-reinstall pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=146cbcf86d2f01ac606af8b49d18a7ae87c263dc2bfc7d8fcc82fd49dd82c56e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
  

In [2]:
# Imports and Spark session setup for the MIND dataset notebook
import pandas as pd
import datetime as dt
from functools import reduce
from operator import add
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, TimestampType, StructType, StructField, IntegerType, StringType, FloatType, DoubleType, ArrayType
from pyspark.sql.functions import col, sqrt, desc, asc, split, explode, from_json, get_json_object, inline, array, monotonically_increasing_id, lit, min, max
# Build (or reuse) a local Spark session that uses every available core.
# NOTE(review): hadoop-aws 2.7.0 may not match the Hadoop version bundled with
# the installed PySpark, and no s3a:// path is read in the cells shown here --
# confirm the package is still needed.
spark = (
    SparkSession.builder
    .master("local[*]")
    # Extra JVM package resolved at session start-up.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    # On-heap memory for driver and executor.
    .config("spark.driver.memory", "12g")
    .config("spark.executor.memory", "12g")
    # Off-heap memory, enabled together with its size.
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size","12g")
    # Legacy calendar rebasing for INT96 timestamps in Parquet reads and writes.
    .config('spark.sql.parquet.int96RebaseModeInRead', 'LEGACY')
    .config('spark.sql.parquet.int96RebaseModeInWrite', 'LEGACY')
    # Number of partitions used for shuffles (joins, aggregations).
    .config("spark.sql.shuffle.partitions",64)
    .getOrCreate()
)


In [3]:
# Connect to Google Drive (Colab only)
from google.colab import drive
# Mounts Drive under /content/drive/; the CSV files read by the following cells
# live beneath this mount point.
drive.mount('/content/drive/')
# Project folder on Drive.
# NOTE(review): `dir` shadows the Python builtin of the same name and is not
# referenced by the cells shown here, which repeat the full path instead --
# consider renaming it and reusing it.
dir = '/content/drive/MyDrive/BTL Big Data & Business Intelligence'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Training interactions: read the CSV (header row, inferred column types) and
# keep only the user / item / click columns that the ALS model is fitted on.
training_set = (
    spark.read
    .csv(
        '/content/drive/MyDrive/BTL Big Data & Business Intelligence/train_set.csv',
        header=True,
        inferSchema=True,
    )
    .select("new_userid", "new_newsid", "click")
)

In [5]:
# Held-out interactions: same three columns as the training set, used later to
# score the fitted model.
test_set = (
    spark.read
    .csv(
        '/content/drive/MyDrive/BTL Big Data & Business Intelligence/test_set.csv',
        header=True,
        inferSchema=True,
    )
    .select("new_userid", "new_newsid", "click")
)

In [33]:
# Lookup table pairing each original userID with its new_userid value, the
# column the interaction tables are keyed on.
user_ids_map = (
    spark.read
    .csv(
        '/content/drive/MyDrive/BTL Big Data & Business Intelligence/user_ids_map.csv',
        header=True,
        inferSchema=True,
    )
    .select("userID", "new_userid")
)

In [35]:
# Lookup table pairing each original newsID (and its title) with its
# new_newsid value, the column the interaction tables are keyed on.
news_ids_map = (
    spark.read
    .csv(
        '/content/drive/MyDrive/BTL Big Data & Business Intelligence/news_ids_map.csv',
        header=True,
        inferSchema=True,
    )
    .select("newsID", "new_newsid", "title")
)

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Create ALS Model
# Matrix factorisation over (new_userid, new_newsid) with `click` as the rating.
als = ALS(
    maxIter=5,
    regParam=0.09,
    rank=25,
    userCol="new_userid",
    itemCol="new_newsid",
    ratingCol="click",
    # Drop rows whose prediction would be NaN (users/items unseen during
    # training) so that downstream metrics are not poisoned by NaN.
    coldStartStrategy="drop",
    # Constrain the learned factors to be non-negative.
    nonnegative=True,
    # Fixed seed: the factor initialisation is random, so without it every
    # re-run yields a different model and a different RMSE.
    # NOTE(review): outputs recorded before this seed was added came from an
    # unseeded run and will differ slightly after re-execution.
    seed=42,
)

# Fit the factor model on the training interactions.
model = als.fit(training_set)


In [7]:
# Score the held-out interactions with the fitted model, then report the
# root-mean-square error between the `prediction` and `click` columns.
predictions = model.transform(test_set)
evaluator = RegressionEvaluator(
    labelCol="click",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE={rmse}")

RMSE=0.26636793925597174


In [26]:
# Preview the first 20 scored rows (the defaults of DataFrame.show, spelled out).
predictions.show(n=20, truncate=True)


+----------+----------+-----+----------+
|new_userid|new_newsid|click|prediction|
+----------+----------+-----+----------+
|     33306|     18581|  1.0|0.68203014|
|     33306|     37944|  1.0| 0.7068195|
|     33306|     18326|  1.0|0.73179686|
|     33306|      2178|  0.0|0.71599245|
|     33306|     21107|  1.0|0.25126266|
|     33306|     14043|  1.0|0.69540066|
|     33306|     12762|  1.0|0.67338234|
|     33306|      1362|  1.0| 0.6826984|
|     33306|     24084|  1.0|0.48907375|
|      5908|     19441|  1.0| 0.9428393|
|      5908|     44185|  1.0|0.15582116|
|      5908|      2178|  0.0|0.98042923|
|      5908|     43718|  1.0|0.93135613|
|      5908|      5474|  1.0| 0.9165281|
|      5908|     16754|  1.0|0.91589534|
|      5908|     15014|  1.0|0.91610116|
|      5908|      9217|  1.0|  0.932705|
|      5908|      2099|  1.0|0.93663573|
|      5908|     36835|  0.0|       0.0|
|      5908|     12032|  1.0| 0.9158074|
+----------+----------+-----+----------+
only showing top 20 rows

In [30]:
def get_recs_top(x):
  """Return the top-`x` recommended news items for every user, one row per pair.

  Uses the module-level `model` (the fitted ALS model). The per-user
  `recommendations` array is exploded into one row per recommended item, the
  item's `new_newsid` is pulled out of the struct, and the struct column
  (which also carries the predicted rating) is dropped.

  Args:
    x: number of items to recommend per user.

  Returns:
    A Spark DataFrame with the user column plus a `new_newsid` column.
  """
  per_user = model.recommendForAllUsers(x)
  # One row per (user, recommended item) instead of one array per user.
  per_item = per_user.withColumn("recommendations", explode(col("recommendations")))
  # Keep only the item id from each recommendation struct.
  with_item_id = per_item.withColumn("new_newsid", col("recommendations")["new_newsid"])
  return with_item_id.drop("recommendations")


In [38]:
# Top-6 recommendations per user, translated from the remapped ids back to the
# original userID / newsID and enriched with the news title. Inner joins drop
# any recommendation whose id is missing from either lookup table.
item_recs = (
    get_recs_top(6)
    .join(user_ids_map, on=["new_userid"], how="inner")
    .join(news_ids_map, on=["new_newsid"], how="inner")
    .select("userID", "newsID", "title")
)


In [41]:
# Collect the recommendation table to the driver as a pandas DataFrame and save
# it as CSV in the current working directory. index=True is the pandas default,
# so the row index is written as the first, unnamed column.
recommendations_pdf = item_recs.toPandas()
recommendations_pdf.to_csv("recommendation.csv", index=True)