# [Link](http://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#module-pyspark.ml.recommendation)

In [18]:
# Import dependencies
from __future__ import print_function

import sys
if sys.version >= '3':
    long = int
    
import pandas as pd
from pyspark.sql import SparkSession

# PySpark ML Imports
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# SparkContext
from pyspark import SparkContext

# PySpark ML Imports

from pprint import pprint

In [19]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()
  


In [20]:
# Load data
df = pd.DataFrame()
df = pd.read_csv('TopStaredRepositories.csv')

In [21]:
# Remove columns not needed
#df.set_index(['Repository Name'])
COLUMNS_TO_REMOVE_LIST = ['Description',
                          'Last Update Date', 'Language', 
                          'Tags', 'Url','Gravatar' ,'Unnamed: 0']



for column in COLUMNS_TO_REMOVE_LIST :
    try:
        del df[column]
    except Exception:
        pass
print(df.head(2))

       Username Repository Name Number of Stars
0  freeCodeCamp    freeCodeCamp            290k
1          twbs       bootstrap            112k


In [22]:
# DF User
def get_metadata(df, key, val):
    #create a new column with index 
    df['index'] = df.index
    if key == "Username":
        return {str(row[key]): row[val] for _, row in df.iterrows()}
    else:
        return {row[key]: row[val] for _, row in df.iterrows()}

# embd2idx is a mapping. type(embd2idx) = dictionary    
emb2idxU = get_metadata(df, "index", "Username")

# print(emb2idxU)
len(emb2idxU)

# DF User
# Create DataFrame from 0...n
d = dict()
size = len(emb2idxU)
for x in range(0,size):
    d[x]=x
# print(d)

# Convert dict to DataFrame
dfU = pd.DataFrame.from_dict(d,orient='index')
# type(dfU)

# DF Repo
def get_metadata(df, key, val):
    #create a new column with index 
    df['index'] = df.index
    if key == "Repository Name":
        return {str(row[key]): row[val] for _, row in df.iterrows()}
    else:
        return {row[key]: row[val] for _, row in df.iterrows()}

# embd2idx is a mapping. type(embd2idx) = dictionary    
emb2idxR = get_metadata(df, "index", "Repository Name")

# print(emb2idx)
len(emb2idxR)

# DF Repo
# Create DataFrame from 0...n
d = dict()
size = len(emb2idxR)
for x in range(0,size):
    d[x]=x
# print(d)

# Convert dict to DataFrame
dfR = pd.DataFrame.from_dict(d,orient='index')
type(dfR)

pandas.core.frame.DataFrame

In [None]:
# type(emb2idxU) dict
# type(dfU) pandas.core.frame.DataFrame

In [23]:
df1 = df['Repository Name'].map(lambda x: x) # pandas.core.series.Series
df1 = df1.to_frame()                         # pandas.core.frame.DataFrame
df2 = df['Number of Stars'].map(lambda x: x.rstrip('k'))
df2 = df2.to_frame()

In [24]:
# df1.set_index(['Repository Name'])
df2.set_index(['Number of Stars'])
type(df2)

pandas.core.frame.DataFrame

In [25]:
#Combine the three data frames
# fdf = User, Repo, of stars
fdf = pd.concat([dfU,dfR, df2], axis=1, ignore_index=False,sort = False) #Sort Warning

#print(fdf)

In [26]:
fdf.to_csv('Preprocessing.csv')

In [27]:
# Remove top Row containing Coloumn names
with open("Preprocessing.csv",'r+') as f:
    with open("Dataset.csv",'w') as f1:
        f.readline() # skip header line
        for line in f:
            f1.write(line)

# Datasetcsv

In [28]:
df = pd.read_csv("Dataset.csv")
df.head(2)


Unnamed: 0,0,0.1,0.2,290
0,1,1,1,112.0
1,2,2,2,87.8


In [29]:
lines = spark.read.text("Dataset.csv").rdd
print(lines.take(3))


[Row(value='0,0,0,290'), Row(value='1,1,1,112'), Row(value='2,2,2,87.8')]


In [30]:
# Need to convert p[1] from str to int
parts = lines.map(lambda row: row.value.split(","))
print(parts.take(2))
type(parts)
# RDD mapped as int and float from Dataset

[['0', '0', '0', '290'], ['1', '1', '1', '112']]


pyspark.rdd.PipelinedRDD

In [31]:
ratingsRDD = parts.map(lambda p: Row(userId=int(p[1]),
                                     repoId=int(p[2]),
                                     repoCount=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
type(ratings) #pyspark.sql.dataframe.DataFrame
# print(ratings.head(10))

pyspark.sql.dataframe.DataFrame

In [32]:
# Implicit Rating

ratingsRDD = parts.map(lambda p: Row(user=int(p[1]),
                                     item=int(p[2]),
                                     rating=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])
als = ALS(rank=5, maxIter=5, alpha = 1.0, 
          implicitPrefs=True, seed=0)
model = als.fit(ratings)

In [33]:
# model.userFactors.orderBy("id").collect()

In [34]:
test = test.drop("rating")

In [35]:
predictions = sorted(model.transform(test).collect(), 
                     key=lambda r: r[0])

user_recs = model.recommendForAllUsers(3)
item_recs = model.recommendForAllItems(3)
# user_recs of type pyspark.sql.dataframe.DataFrame



In [123]:
user_recs

DataFrame[user: int, recommendations: array<struct<item:int,rating:float>>]

In [36]:
user_recs.where(user_recs.user == 222) \
         .select("recommendations.item", "recommendations.rating") \
         .collect()


[Row(item=[6, 44, 42], rating=[0.16403163969516754, 0.14122384786605835, 0.11740853637456894])]

In [49]:
item_recs.where(item_recs.item == 2)       \
         .select("recommendations.user",   \
                 "recommendations.rating") \
          .collect()


[Row(user=[2, 26, 13], rating=[0.8256587982177734, 0.5344247817993164, 0.5243312120437622])]

# Mapping back (In progress)

In [179]:
row_number = 4

In [180]:
user_recommended = user_recs.toPandas()
user_recommended

Unnamed: 0,user,recommendations
0,471,"[(0, 0.005226806737482548), (23, 0.00444022519..."
1,463,"[(59, 0.004731948487460613), (14, 0.0043658032..."
2,833,"[(0, 0.0003944472991861403), (22, 0.0002864262..."
3,496,"[(3, 0.004276900552213192), (7, 0.004190643783..."
4,148,"[(9, 0.2732253968715668), (21, 0.2586356997489..."
5,540,"[(13, 0.0027574943378567696), (2, 0.0025258106..."
6,392,"[(16, 0.009262772276997566), (65, 0.0079017588..."
7,243,"[(9, 0.12746505439281464), (21, 0.097083449363..."
8,623,"[(14, 0.001248658518306911), (62, 0.0012246754..."
9,737,"[(6, 0.0007072219159454107), (44, 0.0005289434..."


In [181]:
user_id = user_recommended.user[row_number]

In [172]:
type(user_id)

numpy.int32

In [182]:
# Usernme works
user_name = emb2idxU[user_id]
print(user_name)

zenorocha


In [176]:
repo_id = user_recommended.recommendations[row_number][0].item

In [178]:
repo_name = emb2idxR[repo_id]
repo_name

'jQuery-File-Upload'