# DataPreparation
###WARNING!! This notebook will take up at least 6GB memory. 
###Kung-hsiang, Huang 11/24/2017

This notebook will generate a json dataframe containing key name **index** and tf-idfed **song**. 

In [1]:
from pyspark.sql.types import StringType
from pyspark import SQLContext
from collections import namedtuple
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import re
sqlContext = SQLContext(sc)


#compute the length of song
size_ = udf(lambda xs: len(xs), IntegerType())

#filter out null song value  note: filter == where, dunno why do they create two functions...
raw_df = sqlContext.jsonFile("data/lyrics.json").where( (col('lyrics')).isNotNull() ).filter(size_(col('lyrics')) >0)

# .na.drop(subset=[""])
# raw_df.select(raw_df['lyrics'], F.where((raw_df['index'] == 160))).show()
raw_df
raw_df.show()

artist          genre index lyrics               song                 year
beyonce-knowles Pop   0     Oh baby, how you ... ego-remix            2009
beyonce-knowles Pop   1     playin' everythin... then-tell-me         2009
beyonce-knowles Pop   2     If you search
For... honesty              2009
beyonce-knowles Pop   3     Oh oh oh I, oh oh... you-are-my-rock      2009
beyonce-knowles Pop   4     Party the people,... black-culture        2009
beyonce-knowles Pop   5     I heard
Church be... all-i-could-do-wa... 2009
beyonce-knowles Pop   6     This is just anot... once-in-a-lifetime   2009
beyonce-knowles Pop   7     Waiting, waiting,... waiting              2009
beyonce-knowles Pop   8     [Verse 1:]
I read... slow-love            2009
beyonce-knowles Pop   9     N-n-now, honey
Yo... why-don-t-you-lov... 2009
beyonce-knowles Pop   10    I lay alone awake... save-the-hero        2009
beyonce-knowles Pop   11    Hello hello baby ... telephone            2009
beyonce-knowles Pop   12 

In [2]:
STOPWORDS_PATH = 'data/stopwords.txt'
stopwords = set(sc.textFile(STOPWORDS_PATH).collect())

In [3]:
split_regex = r'\W+'
def tokenize(string):
    """ An implementation of input string tokenization that excludes stopwords
    Args:
        string (str): input string
    Returns:
        list: a list of tokens without stopwords
    """
    
    regexed = [ token.lower() for token in re.split(split_regex, string) if len(token)]
    return  [token for token in regexed if token not in stopwords]#get rid of empty stuff

In [4]:
def tf(tokens):
    """ Compute TF
    Args:
        tokens (list of str): input list of tokens from tokenize
    Returns:
        dictionary: a dictionary of tokens to its TF values
    """
    result = {}
    for token in tokens:
        if token not in result:
            result[token] = 1
        else:
            result[token] += 1
    for key in result:
        result[key] /= float(len(tokens))
    return result

In [5]:
def idfs(corpus):
    """ Compute IDF
    Args:
        corpus (RDD): input corpus
    Returns:
        RDD: a RDD of (token, IDF value)
    """
    N = corpus.map(lambda x:  1).reduce(lambda a,b: a+b) + 0.0
    uniqueTokens = corpus.flatMap(lambda x: [(x[1][i], x[0]) for i in range(len(x[1]))])
    tokenCountPairTuple = uniqueTokens.groupByKey().mapValues(lambda x: list(set(x))).mapValues(lambda x: len(x))
    tokenSumPairTuple = tokenCountPairTuple.reduceByKey(lambda a,b: a+b).mapValues(lambda x: N/x)
    return (tokenSumPairTuple)

In [6]:
def tfidf(tokens, idfs):
    """ Compute TF-IDF
    Args:
        tokens (list of str): input list of tokens from tokenize
        idfs (dictionary): record to IDF value
    Returns:
        dictionary: a dictionary of records to TF-IDF values
    """
    # tf returns a dictionary of token to its tf
    tfs = tf(tokens)
    #multiply the two dictionaries
    tfIdfDict = dict((k, v * idfs[k]) for k, v in tfs.items() if k in idfs)
    return tfIdfDict

In [7]:

# get rid of special character and split, then tokenize
raw_to_token = raw_df.select('index','lyrics').rdd.map(lambda x: (x[0],re.sub(r"[^a-zA-Z0-9]+", ' ', x[1]).split(" ") ))\
.map(lambda x: (x[0],  tokenize(' '.join(x[1])) ))
raw_to_token.take(1)

[(0,
  [u'oh',
   u'baby',
   u'know',
   u'm',
   u'gonna',
   u'cut',
   u'right',
   u'chase',
   u'women',
   u'made',
   u'like',
   u'think',
   u'created',
   u'special',
   u'purpose',
   u'know',
   u'special',
   u'feel',
   u'baby',
   u'let',
   u'get',
   u'lost',
   u'need',
   u'call',
   u'work',
   u'cause',
   u're',
   u'boss',
   u'real',
   u'want',
   u'show',
   u'feel',
   u'consider',
   u'lucky',
   u'big',
   u'deal',
   u'well',
   u'got',
   u'key',
   u'heart',
   u'ain',
   u'gonna',
   u'need',
   u'd',
   u'rather',
   u'open',
   u'body',
   u'show',
   u'secrets',
   u'didn',
   u'know',
   u'inside',
   u'need',
   u'lie',
   u'big',
   u'wide',
   u'strong',
   u'won',
   u'fit',
   u'much',
   u'tough',
   u'talk',
   u'like',
   u'cause',
   u'back',
   u'got',
   u'big',
   u'ego',
   u'huge',
   u'ego',
   u'love',
   u'big',
   u'ego',
   u'much',
   u'walk',
   u'like',
   u'cause',
   u'back',
   u'usually',
   u'm',
   u'humble',
   u'right'

In [14]:
#select the top 10000 words as dictionary


{u'parcheggi': 2.960749550461994e-08,
 u'vivemos': 3.256824505508193e-07,
 u'transend': 5.921499100923988e-08,
 u'fjate': 1.7764497302771964e-07,
 u'woods': 4.725356282537342e-05,
 u'wannaaaa': 2.960749550461994e-08,
 u'spiders': 6.572864002025626e-06,
 u'woody': 4.4411243256929904e-06,
 u'trawling': 1.1842998201847975e-07,
 u'kalmadan': 5.921499100923988e-08,
 u'godaan': 5.921499100923988e-08,
 u'verplant': 1.1842998201847975e-07,
 u'spidery': 8.882248651385982e-08,
 u'takarja': 2.960749550461994e-08,
 u'canes': 2.6646745954157944e-06,
 u'scutter': 5.921499100923988e-08,
 u'igual': 3.813445420995048e-05,
 u'caned': 2.0725246853233957e-07,
 u'iguau': 2.960749550461994e-08,
 u'xodo': 2.960749550461994e-08,
 u'trecenis': 2.960749550461994e-08,
 u'daseba': 2.960749550461994e-08,
 u'spingono': 2.960749550461994e-08,
 u'raouf': 2.960749550461994e-08,
 u'pigment': 8.882248651385982e-07,
 u'bollwerk': 2.960749550461994e-08,
 u'disjoncter': 2.960749550461994e-08,
 u'czasach': 2.960749550461994

In [15]:
vocabList = sorted(tokenTF, key=tokenTF.get, reverse=True)[:NUM_FEATURES ]
vocabList

[u'm',
 u'love',
 u'like',
 u'know',
 u're',
 u'll',
 u'oh',
 u'got',
 u'get',
 u'one',
 u'go',
 u'time',
 u'see',
 u'never',
 u've',
 u'let',
 u'want',
 u'baby',
 u'come',
 u'yeah',
 u'way',
 u'cause',
 u'say',
 u'back',
 u'make',
 u'take',
 u'life',
 u'la',
 u'feel',
 u'away',
 u'right',
 u'gonna',
 u'd',
 u'heart',
 u'need',
 u'man',
 u'could',
 u'ain',
 u'day',
 u'tell',
 u'night',
 u'que',
 u'world',
 u'wanna',
 u'girl',
 u'give',
 u'de',
 u'think',
 u'keep',
 u'still',
 u'good',
 u'us',
 u'eyes',
 u'won',
 u'little',
 u'chorus',
 u'said',
 u'every',
 u'around',
 u'well',
 u'ya',
 u'y',
 u'would',
 u'look',
 u'mind',
 u'long',
 u'always',
 u'find',
 u'ever',
 u'nothing',
 u'home',
 u'die',
 u'things',
 u'hey',
 u'gone',
 u'yo',
 u'better',
 u'gotta',
 u'hold',
 u'hear',
 u'o',
 u'everything',
 u'shit',
 u'nigga',
 u'n',
 u'live',
 u'really',
 u'put',
 u'much',
 u'new',
 u'something',
 u'try',
 u'light',
 u'head',
 u'alone',
 u'tonight',
 u'leave',
 u'call',
 u'stop',
 u'believe',


In [17]:
import numpy as np
#select the top 10000 words as dictionary for vectorization
NUM_FEATURES = 10000
def storeVocablist(raw_to_token):
    
    tokenTF = tf(raw_to_token.flatMap(lambda x: x[1]).collect())
    vocabList = np.array(sorted(tokenTF, key=tokenTF.get, reverse=True)[:NUM_FEATURES ])
    np.save('vocabList',vocabList)
storeVocablist(raw_to_tokenenenenken)

In [10]:
def vectorizeDict(weightDict, vocabList):
    vector = []
    for vocab in vocabList:
        if vocab not in weightDict:
            vector.append(0.0)
        else:
            vector.append(weightDict[vocab])
    return vector

In [8]:
# compute IDFs 
idfsWeights = idfs(raw_to_token).collectAsMap()
idfsBroadcast = sc.broadcast(idfsWeights)

In [9]:
# compute tf-idf value
WeightsRDD = raw_to_token.map(lambda x: (x[0], tfidf(x[1], idfsBroadcast.value)))

In [18]:
#save as tfidfweight as parquet

from pyspark.sql.types import Row
def storeTFIDF(WeightsRDD):
    WeightsRDD.map(lambda x: Row(**{'index': x[0], 'song': x[1]})).toDF().save("tfidf.parquet")

storeTFIDF(WeightsRDD)
# .toDF().save("tfidf.parquet")
# ()

In [10]:
import json
from pyspark.sql.types import Row
WeightsRDD.cache()
del raw_df
del raw_to_token
del idfsBroadcast
def storeTFIDFJSON(WeightsRDD):
    with open('tfidf.json', 'w') as fp:
        json.dump(WeightsRDD.map(lambda x: Row(**{'index': x[0], 'song': x[1]})).toDF().toJSON().collect(), fp)
storeTFIDFJSON(WeightsRDD)

ERROR:py4j.java_gateway:Error while sending or receiving.
Traceback (most recent call last):
  File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 479, in send_command
    raise Py4JError("Answer from Java side is empty")
Py4JError: Answer from Java side is empty
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 425, in start
    self.socket.connect((self.address, self.port))
  File "/usr/lib/python2.7/socket.py", line 224, in meth
    return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 425, in start
    se

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 38944)
----------------------------------------


Traceback (most recent call last):
  File "/usr/lib/python2.7/SocketServer.py", line 295, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python2.7/SocketServer.py", line 321, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python2.7/SocketServer.py", line 334, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python2.7/SocketServer.py", line 649, in __init__
    self.handle()
  File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/accumulators.py", line 231, in handle
    num_updates = read_int(self.rfile)
  File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/serializers.py", line 528, in read_int
    raise EOFError
EOFError


Py4JNetworkError: An error occurred while trying to connect to the Java server

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 425, in start
    self.socket.connect((self.address, self.port))
  File "/usr/lib/python2.7/socket.py", line 224, in meth
    return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 425, in start
    self.socket.connect((self.address, self.port))
  File "/usr/lib/python2.7/socket.py", line 224, in meth
    return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/usr/loc

In [14]:
import json
with open('tfidf.json', 'w') as fp:
    json.dump(tfidfJSON, fp)

# tfidfJSON.take(64)

TypeError: MapPartitionsRDD[59] at mapPartitions at DataFrame.scala:862 is not JSON serializable

In [6]:
import json

def storeGenre(raw_df ):
    genreJSON = raw_df.select('index', 'genre').rdd.map(lambda x: ",".join(map(str, x))).coalesce(1).saveAsTextFile("genre.csv")
    
storeGenre(raw_df)

In [17]:
sqlContext.parquetFile('tfidf.parquet/part-r-00001.parquet').rdd.take(5)

[Row(index=0, song={u'consider': 1.9795992632861004, u'show': 0.12019839090349117, u'love': 0.046626448026155135, u'find': 0.0383851169595696, u'arrogant': 6.732597494443321, u'cut': 0.16438833480932563, u'fit': 1.1587486741359423, u'created': 1.1724005981703025, u'better': 0.05249284753271387, u'choose': 0.3408482942049, u'smile': 0.10363367323611604, u'piano': 1.3412077848891035, u'must': 0.08072563031266997, u'real': 0.0657410303029705, u'get': 0.01870294566289694, u'beat': 0.12407487399667466, u'big': 0.8772036843352329, u'lucky': 0.5074569753274444, u'gonna': 0.07386805137567491, u'every': 0.03580603164334555, u'know': 0.06272702799121586, u'decide': 0.6313763667026698, u'd': 0.02921745104684622, u'like': 0.16074835828504208, u'lost': 0.061263331405808856, u'didn': 0.08861567041620844, u'leave': 0.052345355986203414, u'thighs': 1.6748579973861464, u'right': 0.0538874569144151, u'deal': 0.32372880120865294, u'back': 0.24871702521535313, u'see': 0.015965072417415634, u'blues': 0.385