# BDCC project 1 

_[Big Data and Cloud Computing](http://www.dcc.fc.up.pt/~edrdo/aulas/bdcc), DCC/FCUP_


## Code necessary to run from the command line 

In [7]:
if __name__ == "__main__":
    # Needed when the program is launched from the command line: obtain a
    # Spark session named "BDCCp1" on a local master and expose it (and its
    # SparkContext) under the names the rest of the file expects.
    from pyspark import SparkContext
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .appName("BDCCp1")
        .master("local[*]")
        .getOrCreate()
    )
    sc = spark.sparkContext
    # Limit Spark's own log output to warnings and above.
    sc.setLogLevel("WARN")

## Provided code - auxiliary functions

__You should not need to edit these.__

#### loadMovieLensData

In [1]:
from pyspark.sql import functions as F

def readCSV(file, debug=False):
    """Read a CSV file into a DataFrame using the global ``spark`` session.

    The file is read with ``header=True`` and ``inferSchema=True``.

    :param file: path of the CSV file
    :param debug: when true, print the path before reading
    :return: whatever ``spark.read.csv`` returns for that file
    """
    if debug:
        print('Reading ' + file)
    csv_options = dict(inferSchema=True, header=True)
    return spark.read.csv(file, **csv_options)

def readParquet(file, debug=False):
    """Read a Parquet file into a DataFrame using the global ``spark`` session.

    :param file: path of the Parquet file
    :param debug: when true, print the path before reading
    :return: whatever ``spark.read.parquet`` returns for that file
    """
    if debug:
        print('Reading ' + file)
    reader = spark.read
    return reader.parquet(file)

def loadMovieLensData(path, format='parquet', debug=False):
    """Load the movies, ratings and tags data sets stored under ``path``.

    The files ``movies``, ``ratings`` and ``tags`` are read with the
    ``.parquet`` extension when ``format == 'parquet'``; any other value
    selects the ``.csv`` files.

    Every tag string is lower-cased and split on runs of blanks/punctuation;
    each resulting token becomes a row of its own.  The tokens are stored in a
    column named ``tag`` that replaces the original one.

    :param path: directory (or URI prefix) holding the three files
    :param format: ``'parquet'`` (default); anything else means CSV
    :param debug: when true, print the files being read plus the schema and
        a sample of each DataFrame
    :return: tuple ``(movies, ratings, tags)``
    """
    if format == 'parquet':
        movies = readParquet(path + '/movies.parquet', debug)
        ratings = readParquet(path + '/ratings.parquet', debug)
        tags = readParquet(path + '/tags.parquet', debug)
    else:
        movies = readCSV(path + '/movies.csv', debug)
        ratings = readCSV(path + '/ratings.csv', debug)
        tags = readCSV(path + '/tags.csv', debug)

    # Raw string: the backslashes are meant for the regex engine, not for
    # Python.  Written as a plain literal, sequences such as '\*' or '\&' are
    # invalid string escapes and trigger warnings on recent Python versions.
    # The character class is the same as before (blank, punctuation, tab and
    # newline).
    tag_delimiters = r'[ \*\+\&\/\%\-\$\#\'\)\(\[\],.!?;:\t\n"]+'
    tokens = F.explode(F.split(F.lower(F.col('tag')), tag_delimiters))
    tags = (
        tags
        .withColumn('tagl', tokens)
        .drop('tag')
        .withColumnRenamed('tagl', 'tag')
    )

    if debug:
        print('> movies')
        movies.printSchema()
        movies.show()
        print('> ratings')
        ratings.printSchema()
        ratings.show()
        print('> tags')
        tags.printSchema()
        tags.show()
    return (movies, ratings, tags)

#### writeCSV / writeParquet (use them to write a data frame to CSV or Parquet format)

In [2]:
def writeCSV(df, path):
    """Write ``df`` to ``path`` in CSV format with a header line.

    Whatever already exists at ``path`` is overwritten.

    :param df: DataFrame to save
    :param path: output location
    """
    writer = df.write
    writer.csv(path, header=True, mode='overwrite')

def writeParquet(df,path):
    """Write ``df`` to ``path`` in Parquet format.

    Whatever already exists at ``path`` is overwritten.

    :param df: DataFrame to save
    :param path: output location
    """
    writer = df.write
    writer.parquet(path, mode='overwrite')


#### createTagListDF

In [3]:
def createTagListDF(csvTagList):
    """Build a one-column DataFrame (column ``tag``) from a tag string.

    The string is cut at every single space character; each piece becomes one
    row.  Consecutive spaces therefore yield empty-string rows.

    :param csvTagList: tags separated by single spaces
    :return: DataFrame created through the global ``spark`` session
    """
    rows = []
    for token in csvTagList.split(' '):
        rows.append((token,))
    return spark.createDataFrame(rows, ['tag'])

#### Definition of functions available only in Spark 2.4 (GCP Spark instances run Spark 2.3) 

In [4]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType,IntegerType

# Fallbacks for F.array_intersect / F.array_union, installed only when the
# running Spark version does not provide them (Spark version < 2.4).
#
# Limitations of the fallbacks: they are declared to return arrays of integers,
# and since they go through Python sets the element order of the result is
# unspecified.  A null input array yields a null result instead of raising
# TypeError inside the UDF.
# The UDF bodies are kept as lambdas on purpose, as in the original code.

# Define F.array_intersect if not defined (Spark version < 2.4)
if not hasattr(F, 'array_intersect'):
    F.array_intersect = spark.udf.register(
        'array_intersect',
        lambda x, y: None if x is None or y is None else list(set(x) & set(y)),
        ArrayType(IntegerType()))

# Define F.array_union if not defined (Spark version < 2.4)
if not hasattr(F, 'array_union'):
    F.array_union = spark.udf.register(
        'array_union',
        lambda x, y: None if x is None or y is None else list(set(x) | set(y)),
        ArrayType(IntegerType()))

## Functions to define 

__This is the section that will be evaluated.__

__Include your code for the various functions required in the assignment below.__

__You may include other auxiliary functions required for computation here
but NOT test code (see below).__



#### tfidfTags

In [64]:
from pyspark.sql import functions as F

def tfidfTags(tags, debug=False):
    """Compute the TF-IDF weight of every (tag, movie) pair found in ``tags``.

    For a tag t and a movie m the result holds:

    * ``f``      - number of rows of ``tags`` with tag t and movie m
    * ``f_max``  - largest ``f`` among all tags of movie m
    * ``TF``     - ``f / f_max``
    * ``n``      - number of distinct movies that carry tag t
    * ``IDF``    - ``log2(N / n)``, N being the number of distinct movies
      in ``tags``
    * ``TF_IDF`` - ``TF * IDF``

    Computing N runs a Spark action (``count``) when this function is called.

    :param tags: DataFrame with at least the columns ``tag`` and ``movieId``
    :param debug: when true, print and show each intermediate DataFrame
    :return: DataFrame with columns tag, movieId, f, f_max, TF, n, IDF, TF_IDF
    """
    def show(label, df):
        # Labelled output, in the same '> name' style as loadMovieLensData.
        if debug:
            print('> ' + label)
            df.show()

    # f: how many times each tag was given to each movie.
    f = tags.groupBy('tag', 'movieId').agg(F.count('movieId').alias('f'))
    show('f', f)

    # f_max: count of the most frequent tag of each movie.
    f_max = f.groupBy('movieId').agg(F.max('f').alias('f_max'))
    f_f_max = f.join(f_max, 'movieId')
    show('f, f_max', f_f_max)

    tf = f_f_max.withColumn('TF', F.col('f') / F.col('f_max'))
    show('TF', tf)

    # n: number of distinct movies per tag.  Project onto (tag, movieId)
    # explicitly so the de-duplication cannot be defeated by any other column
    # the input may carry besides userId.
    n = (
        tags
        .select('tag', 'movieId')
        .distinct()
        .groupBy('tag')
        .agg(F.count('movieId').alias('n'))
    )
    tf_n = tf.join(n, 'tag')
    show('TF, n', tf_n)

    # N: number of distinct movies that have at least one tag.
    N = tags.select('movieId').distinct().count()
    idf = tf_n.withColumn('IDF', F.log2(F.lit(N) / F.col('n')))
    show('IDF (N = ' + str(N) + ')', idf)

    tfidf = idf.withColumn('TF_IDF', F.col('TF') * F.col('IDF'))
    show('TF_IDF', tfidf)

    return tfidf
                

#### recommendByTag

In [None]:
from pyspark.sql import functions as F

def recommendByTag(singleTag, TFIDF_tags, movies, min_fmax=10, numberOfResults=10, debug=False):
    """Recommend movies for a single tag -- NOT IMPLEMENTED YET.

    The body is still a placeholder: every argument is ignored and ``None``
    is returned, so the result cannot be used as a DataFrame.

    NOTE(review): the intended meaning of the parameters is not visible in
    this file; confirm against the assignment text before implementing.
    """
    # TODO
    
    return None 

#### recommendByTags

In [None]:
from pyspark.sql import functions as F

def recommendByTags(searchTags, TFIDF_tags, movies, min_fmax=10, numberOfResults=10, debug=False):
    """Recommend movies for several tags -- NOT IMPLEMENTED YET.

    Only the first step exists: ``searchTags`` is turned into a DataFrame via
    ``createTagListDF`` (and shown when ``debug`` is true).  That DataFrame is
    not used any further and the function returns ``None``.

    NOTE(review): the intended meaning of the remaining parameters is not
    visible in this file; confirm against the assignment text.
    """
    # One row per tag of the search string.
    searchTagsDF = createTagListDF(searchTags)
    if debug:
        print('> Search tags DF: ' + searchTags)
        searchTagsDF.show()
    # TODO
        
    return None 

#### jiMovieSimilarity

In [None]:
from pyspark.sql import functions as F

def jiMovieSimilarity(ratings, minRatings=10, debug=False):
  """Movie similarity computation -- NOT IMPLEMENTED YET.

  Placeholder: all arguments are ignored and ``None`` is returned.

  NOTE(review): "ji" presumably stands for Jaccard index -- confirm against
  the assignment text before implementing.
  """
  # TODO
  return None

#### recommendBySimilarity

In [None]:
def recommendBySimilarity(movieId, movies, jiForMovies, numberOfResults=10, debug=False):
    """Recommend movies similar to ``movieId`` -- NOT IMPLEMENTED YET.

    Placeholder: all arguments are ignored and ``None`` is returned.

    NOTE(review): ``jiForMovies`` is presumably the result of
    ``jiMovieSimilarity`` -- confirm against the assignment text.
    """
    # TODO
        
    return None

# Specify input data set and load it

In [6]:
# Location of the input data set: <bucket><path><dataset>
bucket = 'gs://bdcc_up201503784_311' # Ed's bucket
path = '/p1/data/'
dataset = 'tiny1'
fullPath = ''.join([bucket, path, dataset])

# Load the CSV version of the data set; debug=True prints the files read and
# the schema and a sample of each DataFrame.
movies, ratings, tags = loadMovieLensData(fullPath, format='csv', debug=True)

Reading gs://bdcc_up201503784_311/p1/data/tiny1/movies.csv
Reading gs://bdcc_up201503784_311/p1/data/tiny1/ratings.csv
Reading gs://bdcc_up201503784_311/p1/data/tiny1/tags.csv
> movies
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)

+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
+-------+--------------------+

> ratings
root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)

+-------+------+------+
|movieId|userId|rating|
+-------+------+------+
|      1|     1|   4.0|
|      3|     1|   4.0|
|      1|     5|   4.0|
|      2|     6|   4.0|
|      3|     6|   5.0|
|      4|     6|   3.0|
|      5|     6|   5.0|
|      1|     7|   4.5|
|      2|     8|   4.0|
|      4|    14|   3.0|
|      

##  Test code 

__Include any test code you may need below.__

__The initial contents are only meant as an example.__

__This section will NOT be evaluated.__

In [67]:
# Get TF-IDF for tags
tfidf = tfidfTags(tags, debug=False)

# tfidf.cache()
# Show the result ordered by movieId (ascending) and, within each movie,
# by TF_IDF (descending).
tfidf.orderBy(['movieId','TF_IDF'],ascending=[1,0]).show()
# tfidf.orderBy(['f','TF_IDF','movieId','tag'],ascending=[0,0,1,1]).show()



+---------+-------+---+-----+---+---+---+------+
|      tag|movieId|  f|f_max| TF|  n|IDF|TF_IDF|
+---------+-------+---+-----+---+---+---+------+
|    pixar|      1|  2|    2|1.0|  1|2.0|   2.0|
|      fun|      1|  1|    2|0.5|  1|2.0|   1.0|
|     game|      2|  2|    2|1.0|  1|2.0|   2.0|
|    robin|      2|  1|    2|0.5|  1|2.0|   1.0|
| williams|      2|  1|    2|0.5|  1|2.0|   1.0|
|  fantasy|      2|  1|    2|0.5|  1|2.0|   1.0|
|    board|      2|  1|    2|0.5|  1|2.0|   1.0|
|    magic|      2|  1|    2|0.5|  1|2.0|   1.0|
|      old|      3|  1|    1|1.0|  1|2.0|   2.0|
|    moldy|      3|  1|    1|1.0|  1|2.0|   2.0|
|   remake|      5|  1|    1|1.0|  1|2.0|   2.0|
|pregnancy|      5|  1|    1|1.0|  1|2.0|   2.0|
+---------+-------+---+-----+---+---+---+------+



In [None]:
# Recommend by tag
# NOTE: recommendByTag is still a stub that returns None, so every rm.show()
# in this cell raises AttributeError until the function is implemented.

rm = recommendByTag('cartoon', tfidf, movies)
rm.show()

# Same search with min_fmax lowered from its default (10) to 1.
rm = recommendByTag('cartoon', tfidf, movies, min_fmax=1)
rm.show()


rm = recommendByTag('cruise', tfidf, movies)
rm.show()




In [None]:




# Recommend by several tags (search string = tags separated by single spaces).
# NOTE: recommendByTags is still a stub that returns None, so every rm.show()
# in this cell raises AttributeError until the function is implemented.
rm = recommendByTags('tom hanks cruise', tfidf, movies, numberOfResults=20)
rm.show()

rm = recommendByTags('tom hanks airport', tfidf, movies, numberOfResults=20)
rm.show()

rm = recommendByTags('tom hanks', tfidf, movies, numberOfResults=20)
rm.show()

rm = recommendByTags('hitchcock birds', tfidf, movies, numberOfResults=10)
rm.show()




In [None]:
# NOTE: jiMovieSimilarity is still a stub, so jiM is None at this point; the
# display line below stays commented out until it is implemented.
jiM = jiMovieSimilarity(ratings)

#jiM.orderBy(['JI','m1','m2'], ascending=[0,1,1]).show()




In [None]:
#jiM.cache()

# Pulp Fiction
#sm = recommendBySimilarity(296, movies, jiM)
#sm.show()

# Fight club
#sm = recommendBySimilarity(2959, movies, jiM)
#sm.show()
    
# Shrek
#sm = recommendBySimilarity(4306, movies, jiM)
#sm.show()
