# DataPreparation
###WARNING!! This notebook will take up at least 6GB memory. 
###Kung-hsiang, Huang 11/24/2017

This notebook will generate a json dataframe containing key name **index** and tf-idfed **song**. 

In [7]:
from pyspark.sql.types import StringType
from pyspark import SQLContext
from collections import namedtuple
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import re
sqlContext = SQLContext(sc)


#compute the length of song
size_ = udf(lambda xs: len(xs), IntegerType())

#filter out null song value  note: filter == where, dunno why do they create two functions...
raw_df = sqlContext.jsonFile("data/lyrics.json").where( (col('lyrics')).isNotNull() ).filter(size_(col('lyrics')) >0)

# .na.drop(subset=[""])
# raw_df.select(raw_df['lyrics'], F.where((raw_df['index'] == 160))).show()
raw_df
raw_df.show()

artist          genre index lyrics               song                 year
beyonce-knowles Pop   0     Oh baby, how you ... ego-remix            2009
beyonce-knowles Pop   1     playin' everythin... then-tell-me         2009
beyonce-knowles Pop   2     If you search
For... honesty              2009
beyonce-knowles Pop   3     Oh oh oh I, oh oh... you-are-my-rock      2009
beyonce-knowles Pop   4     Party the people,... black-culture        2009
beyonce-knowles Pop   5     I heard
Church be... all-i-could-do-wa... 2009
beyonce-knowles Pop   6     This is just anot... once-in-a-lifetime   2009
beyonce-knowles Pop   7     Waiting, waiting,... waiting              2009
beyonce-knowles Pop   8     [Verse 1:]
I read... slow-love            2009
beyonce-knowles Pop   9     N-n-now, honey
Yo... why-don-t-you-lov... 2009
beyonce-knowles Pop   10    I lay alone awake... save-the-hero        2009
beyonce-knowles Pop   11    Hello hello baby ... telephone            2009
beyonce-knowles Pop   12 

In [8]:
STOPWORDS_PATH = 'data/stopwords.txt'
stopwords = set(sc.textFile(STOPWORDS_PATH).collect())

In [9]:
split_regex = r'\W+'
def tokenize(string):
    """ An implementation of input string tokenization that excludes stopwords
    Args:
        string (str): input string
    Returns:
        list: a list of tokens without stopwords
    """
    
    regexed = [ token.lower() for token in re.split(split_regex, string) if len(token)]
    return  [token for token in regexed if token not in stopwords]#get rid of empty stuff

In [10]:
def tf(tokens):
    """ Compute TF
    Args:
        tokens (list of str): input list of tokens from tokenize
    Returns:
        dictionary: a dictionary of tokens to its TF values
    """
    result = {}
    for token in tokens:
        if token not in result:
            result[token] = 1
        else:
            result[token] += 1
    for key in result:
        result[key] /= float(len(tokens))
    return result

In [11]:
def idfs(corpus):
    """ Compute IDF
    Args:
        corpus (RDD): input corpus
    Returns:
        RDD: a RDD of (token, IDF value)
    """
    N = corpus.map(lambda x:  1).reduce(lambda a,b: a+b) + 0.0
    uniqueTokens = corpus.flatMap(lambda x: [(x[1][i], x[0]) for i in range(len(x[1]))])
    tokenCountPairTuple = uniqueTokens.groupByKey().mapValues(lambda x: list(set(x))).mapValues(lambda x: len(x))
    tokenSumPairTuple = tokenCountPairTuple.reduceByKey(lambda a,b: a+b).mapValues(lambda x: N/x)
    return (tokenSumPairTuple)

In [12]:
def tfidf(tokens, idfs):
    """ Compute TF-IDF
    Args:
        tokens (list of str): input list of tokens from tokenize
        idfs (dictionary): record to IDF value
    Returns:
        dictionary: a dictionary of records to TF-IDF values
    """
    # tf returns a dictionary of token to its tf
    tfs = tf(tokens)
    #multiply the two dictionaries
    tfIdfDict = dict((k, v * idfs[k]) for k, v in tfs.items() if k in idfs)
    return tfIdfDict

In [13]:

# get rid of special character and split, then tokenize
raw_to_token = raw_df.select('index','lyrics').rdd.map(lambda x: (x[0],re.sub(r"[^a-zA-Z0-9]+", ' ', x[1]).split(" ") ))\
.map(lambda x: (x[0],  tokenize(' '.join(x[1])) ))
raw_to_token.take(1)

[(0,
  [u'oh',
   u'baby',
   u'know',
   u'm',
   u'gonna',
   u'cut',
   u'right',
   u'chase',
   u'women',
   u'made',
   u'like',
   u'think',
   u'created',
   u'special',
   u'purpose',
   u'know',
   u'special',
   u'feel',
   u'baby',
   u'let',
   u'get',
   u'lost',
   u'need',
   u'call',
   u'work',
   u'cause',
   u're',
   u'boss',
   u'real',
   u'want',
   u'show',
   u'feel',
   u'consider',
   u'lucky',
   u'big',
   u'deal',
   u'well',
   u'got',
   u'key',
   u'heart',
   u'ain',
   u'gonna',
   u'need',
   u'd',
   u'rather',
   u'open',
   u'body',
   u'show',
   u'secrets',
   u'didn',
   u'know',
   u'inside',
   u'need',
   u'lie',
   u'big',
   u'wide',
   u'strong',
   u'won',
   u'fit',
   u'much',
   u'tough',
   u'talk',
   u'like',
   u'cause',
   u'back',
   u'got',
   u'big',
   u'ego',
   u'huge',
   u'ego',
   u'love',
   u'big',
   u'ego',
   u'much',
   u'walk',
   u'like',
   u'cause',
   u'back',
   u'usually',
   u'm',
   u'humble',
   u'right'

In [14]:
#select the top 10000 words as dictionary


{u'parcheggi': 2.960749550461994e-08,
 u'vivemos': 3.256824505508193e-07,
 u'transend': 5.921499100923988e-08,
 u'fjate': 1.7764497302771964e-07,
 u'woods': 4.725356282537342e-05,
 u'wannaaaa': 2.960749550461994e-08,
 u'spiders': 6.572864002025626e-06,
 u'woody': 4.4411243256929904e-06,
 u'trawling': 1.1842998201847975e-07,
 u'kalmadan': 5.921499100923988e-08,
 u'godaan': 5.921499100923988e-08,
 u'verplant': 1.1842998201847975e-07,
 u'spidery': 8.882248651385982e-08,
 u'takarja': 2.960749550461994e-08,
 u'canes': 2.6646745954157944e-06,
 u'scutter': 5.921499100923988e-08,
 u'igual': 3.813445420995048e-05,
 u'caned': 2.0725246853233957e-07,
 u'iguau': 2.960749550461994e-08,
 u'xodo': 2.960749550461994e-08,
 u'trecenis': 2.960749550461994e-08,
 u'daseba': 2.960749550461994e-08,
 u'spingono': 2.960749550461994e-08,
 u'raouf': 2.960749550461994e-08,
 u'pigment': 8.882248651385982e-07,
 u'bollwerk': 2.960749550461994e-08,
 u'disjoncter': 2.960749550461994e-08,
 u'czasach': 2.960749550461994

In [15]:
vocabList = sorted(tokenTF, key=tokenTF.get, reverse=True)[:NUM_FEATURES ]
vocabList

[u'm',
 u'love',
 u'like',
 u'know',
 u're',
 u'll',
 u'oh',
 u'got',
 u'get',
 u'one',
 u'go',
 u'time',
 u'see',
 u'never',
 u've',
 u'let',
 u'want',
 u'baby',
 u'come',
 u'yeah',
 u'way',
 u'cause',
 u'say',
 u'back',
 u'make',
 u'take',
 u'life',
 u'la',
 u'feel',
 u'away',
 u'right',
 u'gonna',
 u'd',
 u'heart',
 u'need',
 u'man',
 u'could',
 u'ain',
 u'day',
 u'tell',
 u'night',
 u'que',
 u'world',
 u'wanna',
 u'girl',
 u'give',
 u'de',
 u'think',
 u'keep',
 u'still',
 u'good',
 u'us',
 u'eyes',
 u'won',
 u'little',
 u'chorus',
 u'said',
 u'every',
 u'around',
 u'well',
 u'ya',
 u'y',
 u'would',
 u'look',
 u'mind',
 u'long',
 u'always',
 u'find',
 u'ever',
 u'nothing',
 u'home',
 u'die',
 u'things',
 u'hey',
 u'gone',
 u'yo',
 u'better',
 u'gotta',
 u'hold',
 u'hear',
 u'o',
 u'everything',
 u'shit',
 u'nigga',
 u'n',
 u'live',
 u'really',
 u'put',
 u'much',
 u'new',
 u'something',
 u'try',
 u'light',
 u'head',
 u'alone',
 u'tonight',
 u'leave',
 u'call',
 u'stop',
 u'believe',


In [17]:
import numpy as np
#select the top 10000 words as dictionary for vectorization
NUM_FEATURES = 10000
def storeVocablist(raw_to_token):
    
    tokenTF = tf(raw_to_token.flatMap(lambda x: x[1]).collect())
    vocabList = np.array(sorted(tokenTF, key=tokenTF.get, reverse=True)[:NUM_FEATURES ])
    np.save('vocabList',vocabList)
storeVocablist(raw_to_tokenenenenken)

In [10]:
def vectorizeDict(weightDict, vocabList):
    vector = []
    for vocab in vocabList:
        if vocab not in weightDict:
            vector.append(0.0)
        else:
            vector.append(weightDict[vocab])
    return vector

In [14]:
# compute IDFs 
idfsWeights = idfs(raw_to_token).collectAsMap()
idfsBroadcast = sc.broadcast(idfsWeights)

In [15]:
# compute tf-idf value
WeightsRDD = raw_to_token.map(lambda x: (x[0], tfidf(x[1], idfsBroadcast.value)))

In [18]:
#save as tfidfweight as parquet

from pyspark.sql.types import Row
def storeTFIDF(WeightsRDD):
    WeightsRDD.map(lambda x: Row(**{'index': x[0], 'song': x[1]})).toDF().save("tfidf.parquet")

storeTFIDF(WeightsRDD)
# .toDF().save("tfidf.parquet")
# ()

In [27]:
import json
from pyspark.sql.types import Row
def storeTFIDFJSON(WeightsRDD):
    with open('tfidf.json', 'w') as fp:
        json.dump(WeightsRDD.map(lambda x: Row(**{'index': x[0], 'song': x[1]})).toDF().toJSON().collect(), fp)
storeTFIDFJSON(WeightsRDD)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 19.0 failed 1 times, most recent failure: Lost task 6.0 in stage 19.0 (TID 75, localhost): java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:2271)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:113)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:140)
	at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
	at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:44)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:80)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)


In [14]:
import json
with open('tfidf.json', 'w') as fp:
    json.dump(tfidfJSON, fp)

# tfidfJSON.take(64)

TypeError: MapPartitionsRDD[59] at mapPartitions at DataFrame.scala:862 is not JSON serializable

In [6]:
import json

def storeGenre(raw_df ):
    genreJSON = raw_df.select('index', 'genre').rdd.map(lambda x: ",".join(map(str, x))).coalesce(1).saveAsTextFile("genre.csv")
    
storeGenre(raw_df)

In [25]:
# sqlContext.parquetFile('tfidf.parquet/part-r-00001.parquet').toJSON().take(5)

['{"index":0,"song":{"maybe":0.09452214997758902,"find":0.0383851169595696,"feel":0.07906581298462564,"working":0.3771449511585,"body":0.10629863169278966,"huge":16.63589839605567,"women":0.3602608460602784,"choose":0.3408482942049,"oh":0.02286534002282442,"show":0.12019839090349117,"got":0.09087769421166395,"consider":1.9795992632861004,"real":0.0657410303029705,"want":0.023853521834594153,"killing":0.3896804280451435,"secrets":0.580198248241276,"decide":0.6313763667026698,"lucky":0.5074569753274444,"big":0.8772036843352329,"ego":17.106725709151583,"blues":0.38592074173596796,"must":0.08072563031266997,"fit":1.1587486741359423,"re":0.014838085144046161,"cut":0.16438833480932563,"every":0.03580603164334555,"confident":5.171044463412741,"site":3.9192642474857378,"piano":1.3412077848891035,"walk":0.35777772647520545,"talk":0.3643629454460953,"well":0.03729357210293007,"m":0.056335993880746595,"smile":0.10363367323611604,"yet":0.16560943666312114,"didn":0.08861567041620844,"strong":0.5030