# BDCC project 1 

_[Big Data and Cloud Computing](http://www.dcc.fc.up.pt/~edrdo/aulas/bdcc), DCC/FCUP_


## Code necessary to run from the command line 

In [1]:
if __name__ == "__main__" :
    # This block is required to run the program from the command line
    # in interface with a single Spark instance
    from pyspark import SparkContext
    from pyspark.sql import SparkSession
    
    spark = SparkSession\
        .builder\
        .appName("BDCCp1")\
        .master("local[*]")\
        .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel("WARN")

## Provided code - auxilliary functions

__You should not need to edit these.__

#### loadMovieLensData

In [2]:
from pyspark.sql import functions as F

def readCSV(file, debug=False):
    if debug:
      print('Reading ' + file)
    return spark.read.csv(file, inferSchema=True, header=True)

def readParquet(file, debug=False): 
    if debug:
       print('Reading ' + file)
    return spark.read.parquet(file)

def loadMovieLensData(path, format='parquet', debug=False):
    if format == 'parquet':
       movies = readParquet(path +'/movies.parquet', debug)
       ratings = readParquet(path +'/ratings.parquet', debug)
       tags = readParquet(path +'/tags.parquet', debug)
    else:
       movies = readCSV(path +'/movies.csv', debug)
       ratings = readCSV(path +'/ratings.csv', debug)
       tags = readCSV(path +'/tags.csv', debug)
    
    tags = tags.withColumn('tagl', F.explode(F.split(F.lower(F.col('tag')),'[ \*\+\&\/\%\-\$\#\'\)\(\[\[\],.!?;:\t\n"]+')))\
            .drop('tag')\
            .withColumnRenamed('tagl','tag')
    if (debug):
        print('> movies')
        movies.printSchema()
        movies.show()
        print('> ratings')
        ratings.printSchema()
        ratings.show()
        print('> tags')
        tags.printSchema()
        tags.show()
    return (movies, ratings, tags)

#### writeCSV / writeParquet (use them to write a data frame to CSV or Parquet format)

In [3]:
def writeCSV(df, path): 
    df.write.csv(path, header=True, mode='overwrite')

def writeParquet(df,path):
    df.write.parquet(path, mode='overwrite')


#### createTagListDF

In [4]:
def createTagListDF(csvTagList):
    return spark.createDataFrame([ (t,) for t in csvTagList.split(' ')], ['tag'])

#### Definition of functions available only in Spark 2.4 (GCP Spark instances run Spark 2.3) 

In [5]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType,IntegerType

# Define F.array_intersect if not defined (Spark version < 2.4)
if not hasattr(F,'array_intersect'):
  F.array_intersect = spark.udf\
    .register('array_intersect', 
       lambda x,y: list(set(x) & set(y)), ArrayType(IntegerType()))

# Define F.array_union if not defined (Spark version < 2.4)
if not hasattr(F,'array_union'):
  F.array_union = spark.udf\
    .register('array_union', 
       lambda x,y: list(set(x) | set(y)), ArrayType(IntegerType()))

# Define F.array_except if not defined (Spark version < 2.4)
if not hasattr(F,'array_except'):
  F.array_except = spark.udf\
    .register('array_except', 
       lambda x,y: list(set(x) - set(y)), ArrayType(IntegerType()))

## Functions to define 

__This is the section that will be evaluated.__

__Include your code for the various functions required in the assigment below.__

__You may include other auxilliary functions required for computation here
but NOT test code (see below).__



#### tfidfTags

In [6]:
# Auxiliary function to compute the tfidf of a given DF
def tfidf(data, term, document, debug=False):
    """Data is the Dataframe to apply tf-idf to.
    term is the name of the column containing the terms.
    document is the name of the column containing the documents"""
    
    f = data\
        .groupBy(term, document)\
            .agg(F.count(document)\
                 .alias('f')\
                )
    if debug:
        print('>>> TF-IDF Debugger')
        print('>>> Step 1 :: Compute number of times ' + term +
              'has been used in association to ' + document)
        f.show()
    
    f_max = f.groupBy(document)\
                .agg(F.max('f')\
                     .alias('f_max')\
                )
    f_f_max = f.join(f_max, document)
    if debug:
        print('>>> Step 2 :: Compute maximum absolute frequence of any' + term +
              'used for ' + document)
        f_f_max.show()
    
    tf = f_f_max\
            .withColumn('TF', f_f_max.f / f_f_max.f_max)
    if debug:
        print('>>> Step 3 :: TF value of' + term + 'for' + document)
        tf.show()
    
    n = data\
        .groupBy(term)\
        .agg(F.countDistinct(document)\
             .alias('n')\
        )
    tf_n = tf.join(n, term)
    if debug:
        print('>>> Step 4 :: Join with the number of ' + document +
              's with ' + term + ' at least once')
        tf_n.show()

    N = tags.select(document).distinct().count()
    idf = tf_n\
            .withColumn('IDF',  F.log2(N / tf_n.n))
    if debug:
        print('>>> Step 5 :: IDF value of ' + term +
              ' considering all ' + document + 's with ' + term)
        idf.show()
    
    tfidf = idf\
                .withColumn('TF_IDF',idf.TF * idf.IDF)
    if debug:
        print('>>> Step 6 :: TF-IDF value of ' + term + ' for ' + document)
        tfidf.show()
        print('>>> Finished TF-IDF processing')

    return tfidf

In [7]:
from pyspark.sql import functions as F

def tfidfTags(tags, debug=False):
    if debug:
        print('>> Step 1 :: Compute tfidf using "tag" as term and "movieId" as document')

    return tfidf(tags, 'tag', 'movieId', debug)
                

#### recommendByTag

In [8]:
from pyspark.sql import functions as F

def recommendByTag(singleTag, TFIDF_tags, movies, min_fmax=10, numberOfResults=10, debug=False):
    filt_tags = TFIDF_tags\
                        .filter((TFIDF_tags.tag == singleTag) &\
                                (TFIDF_tags.f_max >= min_fmax))\
                        .drop('tag', 'f', 'f_max', 'n', 'TF', 'IDF')
    if debug:
        print('>> Step 1 :: TFIDF of single tag & Filtered by >= ' + str(min_fmax))
        filt_tags.show()

    tags_movie = filt_tags.join(movies, 'movieId')
    if debug:
        print('>> Step 2 :: Join with the corresponding movie')
        tags_movie.show()

    rm_tag = tags_movie\
                .orderBy(['TF_IDF', 'title'], ascending=[0, 1])\
                .select('movieId', 'title', 'TF_IDF')\
                .limit(numberOfResults)
    if debug:
        print('>> Step 3 :: Limit to ' + str(numberOfResults) + ' ordered results')
        rm_tag.show()

    return rm_tag

#### recommendByTags

In [9]:
from pyspark.sql import functions as F

# Can it be done by using previous function recommendByTag?
# Even if possible is more computationally heavy
def recommendByTags(searchTags, TFIDF_tags, movies, min_fmax=10, numberOfResults=10, debug=False):
    searchTagsDF = createTagListDF(searchTags)
    if debug:
        print('>> Step 1 :: Search tags DF: ' + searchTags)
        searchTagsDF.show()

    filt_tags = TFIDF_tags\
                    .join(searchTagsDF, 'tag')\
                    .filter(F.col('f_max') >= min_fmax)
    if debug:
        print('>> Step 2 :: TFIDF of given tags & filtered by >=' + str(min_fmax))
        filt_tags.show()

    sum_tfidf = filt_tags\
                    .groupBy('movieID')\
                    .agg(F.sum('TF_IDF')\
                        .alias('SUM_TF_IDF')\
                    )
    if debug:
        print('>> Step 3 :: Sum of TF_IDF on same movies')
        sum_tfidf.show()

    tags_movie = sum_tfidf\
                    .join(movies, 'movieId')\
                    .orderBy(['SUM_TF_IDF', 'title'], ascending=[0, 1])\
                    .select('movieId', 'title', 'SUM_TF_IDF')\
                    .limit(numberOfResults)
    if debug:
        print('>> Step 4 :: Join with the corresponding movie & limit to ' +
              str(numberOfResults) + ' ordered results')
        tags_movie.show()

    return tags_movie

#### jiMovieSimilarity

In [10]:
def jiSimilarity(data, col_ref, col_set, debug=False):
    """"Data is the Dataframe to apply Jaccard Index to.
    col_ref is the name of the column of reference for the sets.
    col_ref should have as a last character a '1'.
    col_set is the name of the column to generate the sets from."""

    f1 = data\
                .groupBy(col_ref)\
                .agg(F.collect_set(data[col_set])\
                      .alias('f1')\
                    )
    if debug:
        print('>>> Jaccard Index debugger')
        print('>>> Step 1 :: ' + col_ref + ' & Set of ' + col_set +
              ' that are related with' + col_ref + ' (f1)')
        f1.show()

    col_ref_2 = col_ref[:-1] + '2'
    f2 = f1\
            .withColumnRenamed(col_ref, col_ref_2)\
            .withColumnRenamed('f1', 'f2')

    cross_prod = f1\
                .crossJoin(f2)\
                .filter(f1[col_ref] < f2[col_ref_2])
    if debug:
        print('>>> Step 2 :: Crossing different ' + col_ref[:-1] +
              ' and the respective sets of ' + col_set)
        cross_prod.show()

    i_u = cross_prod\
                .withColumn('i', F.size(\
                             F.array_intersect(cross_prod.f1,\
                                               cross_prod.f2)\
                                       )\
                           )\
                .withColumn('u', F.size(\
                           F.array_union(cross_prod.f1,\
                                         cross_prod.f2)\
                                       )\
                           )\
                .drop('f1', 'f2')
    if debug:
        print('>>> Step 3 :: Intersection between' + col_set +
              '(i) & Union between ' + col_set + ' (u)')
        i_u.show()

    ji = i_u\
            .withColumn('JI', i_u.i / i_u.u)
    if debug:
        print('>>> Step 4 :: Computed JI out of i & u')
        ji.show()
        print('>>> Finished Jaccard Index processing')

    return ji

In [11]:
from pyspark.sql import functions as F

def jiMovieSimilarity(ratings, minRatings=10, debug=False):
    liked_ratings = ratings\
                        .filter(ratings.rating >= 4.0)\
                        .withColumnRenamed('movieId', 'm1')
    if debug:
        print('>> Step 1 :: Filter ratings for liked movies & rename movieId to m1')
        liked_ratings.show()
        print('>> Step 2 :: Compute JI using "m1" as col_ref and "userId" as col_set')

    return jiSimilarity(liked_ratings, 'm1', 'userId', debug)

#### recommendBySimilarity

In [12]:
def getJiEntries(ji, entry_id, col_name_1, debug=False):
    """Gets the Entries of a Jaccard Index for the first two columns.
    Since in the given JI col1 < col2 we want to retrieve the DF were entry_id can be either
    on col1 or col2. 
    col_name_1 is the name of the first column in the JI.
    The col1 or col2 results will be returned in a column named
    col_name_1[:-1] (removing the digit of the given column name)"""

    col_name = col_name_1[:-1]
    col_name_2 = col_name + '2'
    
    col1_ji = ji\
                .filter(ji[col_name_1] == entry_id)\
                .drop(col_name_1, 'i', 'u')
    if debug:
        print('>>> GetJiEntries Debugger')
        print('>>> Step 1 :: Filter ji where col1 is' + str(entry_id))
        col1_ji.show()

    col2_ji = ji\
                .filter(ji[col_name_2] == entry_id)\
                .drop(col_name_2, 'i', 'u')
    if debug:
        print('>>> Step 2 :: Filter ji where col2 is' + str(entry_id))
        col2_ji.show()

    col_ji = col1_ji\
                .withColumnRenamed(col_name_2, col_name)\
                .union(\
                       col2_ji\
                           .withColumnRenamed(col_name_1, col_name)\
                      )
    if debug:
        print('>>> Step 3 :: Union of the two DFs presented before')
        col_ji.show()
        print('>>> Finished GetJiEntries processing')

    return col_ji

In [13]:
def recommendBySimilarity(movieId, movies, jiForMovies, numberOfResults=10, debug=False):
    if debug:
        print('>> Step 1 :: Get JI entries for the given Jaccard Index')

    ji_movieId = getJiEntries(jiForMovies, movieId, 'm1', debug)\
                    .withColumnRenamed('m', 'movieId')

    result = ji_movieId\
                    .join(movies, 'movieId')\
                    .select('movieId', 'title', 'JI')\
                    .orderBy('JI', ascending=False)\
                    .limit(numberOfResults)
    if debug:
        print('>> Step 2 :: Join with the respective movies and order results')
        result.show()
        
    return result

# Specify input data set and load it

In [14]:
# Load data
#bucket = 'gs://bdcc_up201503784_311' # Ed's bucket 
bucket = 'gs://bdcc_up201503316' # Foo's bucket 
#path = '/p1/data/'
path = '/p1/'
dataset = 'tiny2'
fullPath = bucket + path + dataset

(movies, ratings, tags) = \
  loadMovieLensData(fullPath, format='csv', debug=True)

Reading gs://bdcc_up201503316/p1/tiny2/movies.csv
Reading gs://bdcc_up201503316/p1/tiny2/ratings.csv
Reading gs://bdcc_up201503316/p1/tiny2/tags.csv
> movies
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)

+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
|      6|         Heat (1995)|
|      7|      Sabrina (1995)|
|      8| Tom and Huck (1995)|
|      9| Sudden Death (1995)|
|     10|    GoldenEye (1995)|
|     11|American Presiden...|
|     12|Dracula: Dead and...|
|     13|        Balto (1995)|
|     14|        Nixon (1995)|
|     15|Cutthroat Island ...|
|     16|       Casino (1995)|
|     17|Sense and Sensibi...|
|     18|   Four Rooms (1995)|
|     19|Ace Ventura: When...|
|     20|  Money Train (1995)|
+-------+--------------------+
only showing to

##  Test code 

__Include test code below that you may need here.__

__The initial contents are only meant as an example.__

__This section will NOT be evaluated.__

In [31]:
# Get TF-IDF for tags
tfidf = tfidfTags(tags, debug=True)

# tfidf.cache()
# tfidf.orderBy(['movieId','TF_IDF'],ascending=[1,0]).show()
tfidf.orderBy(['f','TF_IDF','movieId','tag'],ascending=[0,0,1,1]).show()



>>> TF-IDF Debugger
>>> Step 1 :: Compute number of times term has been used in association to document
+----------+-------+---+
|       tag|movieId|  f|
+----------+-------+---+
|      game|      2|  2|
|     moldy|      3|  1|
|alcoholism|     25|  1|
|     mafia|     16|  1|
|  williams|      2|  1|
| president|     14|  1|
|    austen|     17|  1|
|     robin|      2|  1|
|     pixar|      1|  2|
|     magic|      2|  1|
|   fantasy|      2|  1|
|    remake|      5|  1|
|      jane|     17|  1|
|       fun|      1|  1|
|  politics|     11|  1|
| pregnancy|      5|  1|
|    remake|      7|  1|
|    serial|     22|  1|
| president|     11|  1|
|     board|      2|  1|
+----------+-------+---+
only showing top 20 rows

>>> Step 2 :: Compute maximum absolute frequence of any term used for document
+-------+----------+---+-----+
|movieId|       tag|  f|f_max|
+-------+----------+---+-----+
|      2|      game|  2|    2|
|      3|     moldy|  1|    1|
|     25|alcoholism|  1|    1|
|    

In [32]:
# Recommend by tag 

rm = recommendByTag('pixar', tfidf, movies, min_fmax=1, debug=True)
rm.show()

rm = recommendByTag('politics', tfidf, movies, min_fmax=1)
rm.show()


rm = recommendByTag('remake', tfidf, movies, min_fmax=1)
rm.show()




>> Step 1 :: TFIDF of single tag & Filtered by *min_fmax*
+-------+------------------+
|movieId|            TF_IDF|
+-------+------------------+
|      1|3.5849625007211565|
+-------+------------------+

>> Step 2 :: Join with the corresponding movie
+-------+------------------+----------------+
|movieId|            TF_IDF|           title|
+-------+------------------+----------------+
|      1|3.5849625007211565|Toy Story (1995)|
+-------+------------------+----------------+

>> Step 3 :: Limit to *numberOfResults* ordered results
+-------+----------------+------------------+
|movieId|           title|            TF_IDF|
+-------+----------------+------------------+
|      1|Toy Story (1995)|3.5849625007211565|
+-------+----------------+------------------+

+-------+----------------+------------------+
|movieId|           title|            TF_IDF|
+-------+----------------+------------------+
|      1|Toy Story (1995)|3.5849625007211565|
+-------+----------------+------------------+



In [17]:
# Recommend by Tags

rm = recommendByTags('robin williams remake', tfidf, movies, min_fmax=1, debug=True)
rm.show()

rm = recommendByTags('pixar fantasy', tfidf, movies, min_fmax=1)
rm.show()

rm = recommendByTags('serial killer', tfidf, movies, min_fmax=1)
rm.show()

#rm = recommendByTags('hitchcock birds', tfidf, movies, numberOfResults=10)
#rm.show()




>> Step 1 :: Search tags DF: robin williams remake
+--------+
|     tag|
+--------+
|   robin|
|williams|
|  remake|
+--------+

>> Step 2 :: TFIDF of given tags & filter by *min_fmax*
+--------+-------+---+-----+---+---+------------------+------------------+
|     tag|movieId|  f|f_max| TF|  n|               IDF|            TF_IDF|
+--------+-------+---+-----+---+---+------------------+------------------+
|   robin|      2|  1|    2|0.5|  1|3.5849625007211565|1.7924812503605783|
|williams|      2|  1|    2|0.5|  1|3.5849625007211565|1.7924812503605783|
|  remake|      5|  1|    1|1.0|  2| 2.584962500721156| 2.584962500721156|
|  remake|      7|  1|    1|1.0|  2| 2.584962500721156| 2.584962500721156|
+--------+-------+---+-----+---+---+------------------+------------------+

>> Step 3 :: Sum of TF_IDF on same movies
+-------+------------------+
|movieID|        SUM_TF_IDF|
+-------+------------------+
|      5| 2.584962500721156|
|      7| 2.584962500721156|
|      2|3.5849625007211565

In [47]:
jiM = jiMovieSimilarity(ratings, debug=True)

#jiM.orderBy(['JI','m1','m2'], ascending=[0,1,1]).show()
jiM.orderBy(['i','JI','m1','m2'], ascending=[0,0,1,1]).show()




>> Step 1 :: Filter ratings for liked movies & rename movieId to m1
+---+------+------+
| m1|userId|rating|
+---+------+------+
|  1|     1|   4.0|
|  3|     1|   4.0|
|  6|     1|   4.0|
|  1|     5|   4.0|
| 21|     5|   4.0|
|  2|     6|   4.0|
|  3|     6|   5.0|
|  5|     6|   5.0|
|  6|     6|   4.0|
|  7|     6|   4.0|
| 11|     6|   4.0|
| 15|     6|   4.0|
| 16|     6|   4.0|
| 17|     6|   4.0|
| 22|     6|   5.0|
| 24|     6|   4.0|
|  1|     7|   4.5|
|  2|     8|   4.0|
| 11|     8|   4.0|
| 21|     8|   4.0|
+---+------+------+
only showing top 20 rows

>> Step 2 :: Compute JI using "m1" as col_ref and "userId" as col_set
>>> Jaccard Index debugger
>>> Step 1 :: *col_ref* & Set of *col_set* that are related with *col_ref* (f1)
+---+--------------------+
| m1|                  f1|
+---+--------------------+
| 12|     [380, 351, 276]|
| 22|[353, 99, 6, 42, ...|
|  1|[610, 277, 460, 2...|
| 13|           [20, 304]|
|  6|[610, 437, 577, 4...|
| 16|[610, 437, 483, 5...|
|  3|[

In [57]:
jiM.cache()

# Pulp Fiction
#sm = recommendBySimilarity(296, movies, jiM)
#sm.show()

# Fight club
#sm = recommendBySimilarity(2959, movies, jiM)
#sm.show()
    
# Shrek
#sm = recommendBySimilarity(4306, movies, jiM)
#sm.show()

# Toy Story
sm = recommendBySimilarity(1, movies, jiM, 10, True)
sm.show()

# Heat
sm = recommendBySimilarity(6, movies, jiM)
sm.show()

# Leaving Las Vegas
sm = recommendBySimilarity(25, movies, jiM)
sm.show()

>> Step 1 :: Get JI entries for the given Jaccard Index
>>> GetJiEntries Debugger
>>> Step 1 :: Filter ji where col1 is the given entry_id
+---+--------------------+
| m2|                  JI|
+---+--------------------+
| 12|0.013513513513513514|
| 22|0.013071895424836602|
| 13|0.006756756756756757|
|  6| 0.14285714285714285|
| 16|  0.0967741935483871|
|  3| 0.07142857142857142|
| 20|                 0.0|
|  5|0.046052631578947366|
| 19| 0.06451612903225806|
| 15|                 0.0|
| 17| 0.10650887573964497|
|  9|0.027210884353741496|
|  8|0.006802721088435374|
| 23|0.006666666666666667|
|  7|             0.04375|
| 10| 0.10160427807486631|
| 25|  0.1111111111111111|
| 24|0.033112582781456956|
| 21| 0.12941176470588237|
| 11| 0.09826589595375723|
+---+--------------------+
only showing top 20 rows

>>> Step 2 :: Filter ji where col2 is the given entry_id
+---+---+
| m1| JI|
+---+---+
+---+---+

>>> Step 3 :: Union of the two DFs presented before
+---+--------------------+
|  m|     

# Extended Functionalities

### tfidfMovies

In [25]:
def tfidfMoviesAndTags(movies, tags, debug=False):
    movie_title_w = movies\
                .withColumn('word',\
                    F.explode(F.split(F.col('title'), '( \([0-9]{4}\))| '))\
                           )\
                .filter(F.col('word') != '')\
                .drop('title')
    if debug:
        print('>> Step 1 :: Associate to each movie the words belonging to its title')
        movie_title_w.show()

    # Union keeps duplicates - intended
    movie_w = movie_title_w\
                    .union(tags\
                              .withColumnRenamed('tag', 'word')\
                              .drop('userId'))
    if debug:
        print('>> Step 2 :: Union of previous DF with the given tags')
        movie_w.orderBy('movieId').show()
        print('>> Step 3 :: Compute tfidf using "word" as term and "movieId" as document')

    return tfidf(movie_w, 'word', 'movieId', debug)

### jiTagSimilarity

In [None]:
def jiTagSimilarity():
    # TODO
    # Can use jiSimilarity function
    
    return None

# The recommendation functino can use the getJiEntries functino

### jiUserSimiliarity

In [15]:
# Calculate the Jaccard similarity between users based
# on what films they rate (independently of the value of the rating itself). 
def jiUserSimilarity(ratings, debug=False):
    u1_rat = ratings\
                .withColumnRenamed('userId', 'u1')
    if debug:
        print('>> Step 1 :: Rename userId to u1')
        u1_rat.show()
        print('>> Step 2 :: Compute JI using "u1" as col_ref and "movieId" as col_set')

    return jiSimilarity(u1_rat, 'u1', 'movieId', debug)

### recommendByUserSimilarity

In [24]:
from pyspark.sql.functions import array, lit

# Given an array returns a new array with the lit function applied to every member
def get_lit_array_from(arr):
    ret = [ lit(item) for item in arr[0] ]
    return array(ret)
    

# Given the id of a user, recommend the top-rated film per each of the most n
# similar users to user u, as long as u has not yet rated or tagged the movies at stake.
def recommendByUserSimilarity(userID, ratings, movies, jiForUsers, numberSimilarUsers=10, debug=False):
    related_users = getJiEntries(jiForUsers, userID, 'u1')\
                            .orderBy('JI', ascending=False)
    if debug:
        print('>> Step 1 :: Get the top related users ' \
              'to User' + str(userID))
        related_users.show()

    # Missing here the movies that u1 also tagged
    u1_mu1 = ratings\
                .filter(ratings.userId == userID)\
                .agg(F.collect_set(ratings.movieId)\
                      .alias('movies')\
                    )
    if debug:
        print('>> Step 2 :: Set of movies rated by user1 (mu1)')
        u1_mu1.show()

    relatedUsers_movies = related_users\
            .join(ratings, ratings.userId == related_users.u)\
            .groupBy('u')\
            .agg(F.collect_set('movieId')\
            .alias('movies'))
    
    if debug:
        print('>> Step 3 :: Join of the DFs with ratings')
        relatedUsers_movies.show()

    arr = get_lit_array_from(u1_mu1.collect()[0])
    relatedUsers_exceptMovies = relatedUsers_movies \
                            .withColumn("except_movies", arr)
    
    if debug:
        print('>> Step 4 :: Join of the DFs with movies')
        relatedUsers_exceptMovies.show()
    
    potencialMovies = F.array_except( \
                    relatedUsers_exceptMovies.movies \
                    ,relatedUsers_exceptMovies.except_movies \
                    )
    
    relatedUsers_potencialMovies=relatedUsers_exceptMovies \
                                .withColumn("potencialMovies", potencialMovies) \
                                .drop('movies', 'except_movies')

    if debug:
        print('>> Step 5 :: Potencial Movies Only')
        relatedUsers_potencialMovies.show()

        
        # Need to remove from the related users movies the ones u1 rated/ tagged,
    # and only then can I limit to (numberSimilar Users)
            #        escolher o com maior rating
    return 

# Test Code for Extended Fuctionalities

In [26]:
# Tests for tfidfMovies

tfidfMT = tfidfMoviesAndTags(movies, tags, debug=True)

tfidfMT.orderBy(['f','TF_IDF','movieId','word'],ascending=[0,0,1,1]).show()

>> Step 1 :: Associate to each movie the words belonging to its title
+-------+--------+
|movieId|    word|
+-------+--------+
|      1|     Toy|
|      1|   Story|
|      2| Jumanji|
|      3|Grumpier|
|      3|     Old|
|      3|     Men|
|      4| Waiting|
|      4|      to|
|      4|  Exhale|
|      5|  Father|
|      5|      of|
|      5|     the|
|      5|   Bride|
|      5|    Part|
|      5|      II|
|      6|    Heat|
|      7| Sabrina|
|      8|     Tom|
|      8|     and|
|      8|    Huck|
+-------+--------+
only showing top 20 rows

>> Step 2 :: Union of previous DF with the given tags
+-------+--------+
|movieId|    word|
+-------+--------+
|      1|     Toy|
|      1|   pixar|
|      1|   Story|
|      1|   pixar|
|      1|     fun|
|      2|williams|
|      2|    game|
|      2|   board|
|      2| Jumanji|
|      2|   robin|
|      2|    game|
|      2| fantasy|
|      2|   magic|
|      3|Grumpier|
|      3|     Old|
|      3|     Men|
|      3|     old|
|      3|   mo

In [59]:
# TODO - Tests for jiTagSimilarity

In [25]:
# Tests for jiUserSimilarity
jiU = jiUserSimilarity(ratings, debug=True)

#jiU.orderBy(['JI','m1','m2'], ascending=[0,1,1]).show()
#jiU.orderBy(['i','JI','m1','m2'], ascending=[0,0,1,1]).show()

jiU.cache()

# User 1
sm = recommendByUserSimilarity(1, ratings, movies, jiU, 10, True)
#sm.show()

# Heat
#sm = recommendByUserSimilarity(6, movies, jiU)
#sm.show()

# Leaving Las Vegas
#sm = recommendByUserSimilarity(25, movies, jiU)
#sm.show()



>> Step 1 :: Rename userId to u1
+-------+---+------+
|movieId| u1|rating|
+-------+---+------+
|      1|  1|   4.0|
|      3|  1|   4.0|
|      6|  1|   4.0|
|     21|  4|   3.0|
|      1|  5|   4.0|
|     21|  5|   4.0|
|      2|  6|   4.0|
|      3|  6|   5.0|
|      4|  6|   3.0|
|      5|  6|   5.0|
|      6|  6|   4.0|
|      7|  6|   4.0|
|      8|  6|   3.0|
|     10|  6|   3.0|
|     11|  6|   4.0|
|     13|  6|   3.0|
|     15|  6|   4.0|
|     16|  6|   4.0|
|     17|  6|   4.0|
|     19|  6|   2.0|
+-------+---+------+
only showing top 20 rows

>> Step 2 :: Compute JI using "u1" as col_ref and "movieId" as col_set
>>> Jaccard Index debugger
>>> Step 1 :: u1 & Set of movieId that are related withu1 (f1)
+---+--------------------+
| u1|                  f1|
+---+--------------------+
|471|                 [1]|
|243|                [10]|
| 31|[1, 5, 17, 10, 7,...|
|137|                 [1]|
|451|[1, 5, 17, 6, 7, 25]|
|580|[1, 16, 6, 10, 25...|
|458|      [5, 2, 21, 10]|
|588|[