In [2]:
# Import dependencies
from __future__ import print_function

import sys
if sys.version >= '3':
    long = int

    
import pandas as pd
from pyspark.sql import SparkSession

# PySpark ML Imports
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# SparkContext
from pyspark import SparkContext

# PySpark ML Imports

from pprint import pprint


In [3]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()
  


In [4]:
df = pd.read_csv("Dataset.csv")
df.head(2)


Unnamed: 0,0,0.1,0.2,290
0,1,1,1,112.0
1,2,2,2,87.8


In [5]:
lines = spark.read.text("Dataset.csv").rdd
print(lines.take(10))


[Row(value='0,0,0,290'), Row(value='1,1,1,112'), Row(value='2,2,2,87.8'), Row(value='3,3,3,69.7'), Row(value='4,4,4,65.7'), Row(value='5,5,5,62'), Row(value='6,6,6,61.6'), Row(value='7,7,7,60.7'), Row(value='8,8,8,57.7'), Row(value='9,9,9,56.2')]


In [6]:
# Need to convert p[1] from str to int
parts = lines.map(lambda row: row.value.split(","))
print(parts.take(2))
type(parts)
# RDD mapped as int and float from Dataset

[['0', '0', '0', '290'], ['1', '1', '1', '112']]


pyspark.rdd.PipelinedRDD

In [7]:
ratingsRDD = parts.map(lambda p: Row(userId=int(p[1]),repoId=int(p[2]),repoCount=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
type(ratings) #pyspark.sql.dataframe.DataFrame
# print(ratings.head(10))

pyspark.sql.dataframe.DataFrame

In [8]:
# Implicit Rating

ratingsRDD = parts.map(lambda p: Row(user=int(p[1]),item=int(p[2]),rating=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])
als = ALS(rank=5, maxIter=5, alpha = 1.0, implicitPrefs=True, seed=0)
model = als.fit(ratings)

In [9]:
model.userFactors.orderBy("id").collect()

[Row(id=0, features=[0.6985026001930237, -0.13982200622558594, 0.4282741844654083, 1.874091625213623, 0.5279054045677185]),
 Row(id=1, features=[1.6072622537612915, -0.5426971912384033, -0.5080831050872803, -1.7013717889785767, 1.2187761068344116]),
 Row(id=2, features=[-0.8563839793205261, -1.9099193811416626, 1.8150508403778076, -0.7095034718513489, -0.13360075652599335]),
 Row(id=3, features=[-0.2773630619049072, 2.11722731590271, -1.2831660509109497, -0.5341355800628662, 1.064345121383667]),
 Row(id=4, features=[-0.5552871227264404, -1.5861384868621826, 0.722503662109375, 0.6926171779632568, 1.8143209218978882]),
 Row(id=5, features=[0.7414836287498474, 2.297808885574341, 0.6350209712982178, -0.22918884456157684, -1.2262318134307861]),
 Row(id=6, features=[1.0631933212280273, 0.5365792512893677, 1.6327977180480957, -1.0746279954910278, 1.6575722694396973]),
 Row(id=7, features=[0.342332124710083, 1.0564613342285156, -2.433532238006592, -0.17372813820838928, 0.3367983400821686]),
 R

In [None]:
test = test.drop("rating")


In [None]:
predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])

user_recs = model.recommendForAllUsers(test.count())
item_recs = model.recommendForAllItems(test.count())



In [None]:
user_recs.where(user_recs.user == 222).select("recommendations.item", "recommendations.rating").collect()


In [None]:
item_recs.where(item_recs.item == 2).select("recommendations.user", "recommendations.rating").collect()
