In [None]:
import math

def getLine(row):
	"""Split a movie row into its index and its list of tags.

	Args:
		row: A 3-item sequence ``(index, name, tags)``. ``tags`` is a
			'|'-separated string, or None when the field is missing.

	Returns:
		A tuple ``(index, tags)`` where ``tags`` is a list of strings. The
		list is empty when the tags field is None.
	"""
	index, _name, tags = row
	# A missing field (None) has no tags; calling .split on it would raise
	# AttributeError. Presumably the CSV reader yields None for empty
	# fields -- confirm against the input data.
	tagList = tags.split('|') if tags is not None else []
	return (index, tagList)

def getTag(line):
	"""Emit one ``(tag, 1)`` pair for every tag of a parsed line.

	Args:
		line: A 2-item sequence ``(index, tags)`` where ``tags`` is an
			iterable of tags.

	Returns:
		A list of ``(tag, 1)`` tuples, in the same order as ``tags``.
	"""
	_, movieTags = line
	pairs = []
	for tag in movieTags:
		pairs.append((tag, 1))
	return pairs

def count(x, y):
	"""Return ``x + y``; used as the combining function for reduceByKey."""
	total = x + y
	return total

class IDFScore(object):
	"""Turn a movie's tag list into a fixed-length vector of IDF scores.

	Attributes:
		tagCount: dict mapping each tag to the number of rows it occurs in.
		N: total number of rows, used as the IDF numerator.
		tagList: all known tags in sorted order; this fixes the column
			order of every vector returned by ``get``.
	"""

	def __init__(self, tagCount, N):
		"""Store the tag frequencies and the total row count.

		Args:
			tagCount: Iterable of ``(tag, count)`` pairs, or a mapping.
			N: Total number of rows.
		"""
		self.tagCount = dict(tagCount)
		self.N = N
		self.tagList = sorted(self.tagCount)

	def get(self, line):
		"""Build the IDF vector for one parsed line.

		Args:
			line: A 2-item sequence ``(index, tags)``.

		Returns:
			A list ``[index, s_1, ..., s_k]`` with one score per entry of
			``self.tagList``: ``log2(N / tagCount[tag])`` when the tag is
			present in ``tags``, otherwise ``0.0``.
		"""
		index, tags = line
		tags = frozenset(tags)
		# Force float division: with two ints, Python 2's `/` floors the
		# ratio and would truncate the score before the logarithm.
		total = float(self.N)
		indexIdf = [index]
		for tag in self.tagList:
			if tag in tags:
				indexIdf.append(math.log(total / self.tagCount[tag], 2))
			else:
				indexIdf.append(0.)
		return indexIdf

In [None]:
# Load the movies CSV; the first line of the file is treated as the header.
data = spark.read.csv('/user/hz333/data/project/movies.csv', header=True)

In [None]:
# Parse every row: (idx, name, tags) => (idx, [tag])
line = data.rdd.map(getLine)

# Count the rows containing each tag:
# (idx, [tag]) => (tag, 1) => (tag, count), collected to the driver.
tags = line.flatMap(getTag)
tagCount = tags.reduceByKey(count).collect()

# The second argument is the total number of rows in the input.
IDF = IDFScore(tagCount, data.rdd.count())

# (idx, [tag]) => [idx, idf_1, ..., idf_k]
# A tag is assumed to appear at most once per movie, so IDF equals TF-IDF.
movieIDF = line.map(IDF.get)

In [None]:
# Preview the first computed row as a sanity check.
movieIDF.take(1)

In [None]:
# Convert the RDD of IDF rows to a DataFrame; samplingRatio=1 is passed so
# that schema inference looks at every row rather than a sample.
movieIDFCSV = spark.createDataFrame(movieIDF, samplingRatio=1)

# Collapse to one partition before writing, with no header line.
(movieIDFCSV
	.repartition(1)
	.write
	.option('header', 'false')
	.csv('/user/hz333/data/project/mMetaProfi.csv'))