In [1]:
from pyspark.sql.types import StringType
from pyspark import SQLContext, SparkContext
from collections import namedtuple
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, Row, ArrayType, StringType
import re
from pyspark.sql import SparkSession

import numpy as np

from pyspark.mllib.feature import HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import  HashingTF,Tokenizer

In [2]:
spark = SparkSession \
    .builder \
    .appName("COMP 4651") \
    .getOrCreate()

In [3]:
# sqlContext = SQLContext(sc)



#compute the length of song
size_ = udf(lambda xs: len(xs), IntegerType())

#filter out null song value  note: filter == where, dunno why do they create two functions...
raw_df = spark.read.json("data/lyrics.json").where( (col('lyrics')).isNotNull() ).filter(size_(col('lyrics')) >0)

# .na.drop(subset=[""])
# raw_df.select(raw_df['lyrics'], F.where((raw_df['index'] == 160))).show()
raw_df
raw_df.show()

+---------------+-----+-----+--------------------+--------------------+----+
|         artist|genre|index|              lyrics|                song|year|
+---------------+-----+-----+--------------------+--------------------+----+
|beyonce-knowles|  Pop|    0|Oh baby, how you ...|           ego-remix|2009|
|beyonce-knowles|  Pop|    1|playin' everythin...|        then-tell-me|2009|
|beyonce-knowles|  Pop|    2|If you search
For...|             honesty|2009|
|beyonce-knowles|  Pop|    3|Oh oh oh I, oh oh...|     you-are-my-rock|2009|
|beyonce-knowles|  Pop|    4|Party the people,...|       black-culture|2009|
|beyonce-knowles|  Pop|    5|I heard
Church be...|all-i-could-do-wa...|2009|
|beyonce-knowles|  Pop|    6|This is just anot...|  once-in-a-lifetime|2009|
|beyonce-knowles|  Pop|    7|Waiting, waiting,...|             waiting|2009|
|beyonce-knowles|  Pop|    8|[Verse 1:]
I read...|           slow-love|2009|
|beyonce-knowles|  Pop|    9|N-n-now, honey
Yo...|why-don-t-you-lov...|2009|

In [4]:
# change genre to numeric
from pyspark.sql.functions import udf



genreList = raw_df.select('genre').distinct().rdd.map(lambda x:x.genre).collect()
genreList

def changeToNumeric(genre):
    return genreList.index(genre)

udf_changeToNumeric = udf(changeToNumeric, IntegerType()) # if the function returns an int
raw_df = raw_df.withColumn("label", udf_changeToNumeric('genre'))
raw_df.show()

+---------------+-----+-----+--------------------+--------------------+----+-----+
|         artist|genre|index|              lyrics|                song|year|label|
+---------------+-----+-----+--------------------+--------------------+----+-----+
|beyonce-knowles|  Pop|    0|Oh baby, how you ...|           ego-remix|2009|    6|
|beyonce-knowles|  Pop|    1|playin' everythin...|        then-tell-me|2009|    6|
|beyonce-knowles|  Pop|    2|If you search
For...|             honesty|2009|    6|
|beyonce-knowles|  Pop|    3|Oh oh oh I, oh oh...|     you-are-my-rock|2009|    6|
|beyonce-knowles|  Pop|    4|Party the people,...|       black-culture|2009|    6|
|beyonce-knowles|  Pop|    5|I heard
Church be...|all-i-could-do-wa...|2009|    6|
|beyonce-knowles|  Pop|    6|This is just anot...|  once-in-a-lifetime|2009|    6|
|beyonce-knowles|  Pop|    7|Waiting, waiting,...|             waiting|2009|    6|
|beyonce-knowles|  Pop|    8|[Verse 1:]
I read...|           slow-love|2009|    6|
|bey

In [5]:
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="lyrics", outputCol="tokenized_song")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features",numFeatures = 100)
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
raw_df = raw_df.select(col("index"), col("lyrics"), col('label'))
# Fit the pipeline to training documents.

#sample without replacement
training_df = raw_df.sample(False,0.7)
model = pipeline.fit(training_df)

In [6]:
# Prediction

#sample without replacement
validation_df = raw_df.sample(False,0.1)
prediction = model.transform(validation_df)
selected = prediction.select("index", "prediction")
combined_df = validation_df.select('index','label').join(selected, validation_df.index == selected.index)

# for row in combined_df.collect():
#     rid, prob, prediction = row
#     print("(%d) --> prob=%s, prediction=%f" % (rid, str(prob), prediction))
    
num_correct = combined_df.select( (combined_df.label == combined_df.prediction).alias('correct') ).rdd.map(lambda x: 1 if x.correct == True else 0).reduce(lambda a,b:a+b)
accuracy = (num_correct + 0.0) / combined_df.count()
accuracy

0.4646639128802103

In [12]:
# split_regex = r'\W+'
# STOPWORDS_PATH = 'data/stopwords.txt'
# stopwords = set(spark.read.text(STOPWORDS_PATH).rdd.map(lambda x: x.value).collect())
# def tokenize(string):
#     """ An implementation of input string tokenization that excludes stopwords
#     Args:
#         string (str): input string
#     Returns:
#         list: a list of tokens without stopwords
#     """
    
#     regexed = [ token.lower() for token in re.split(split_regex, string) if len(token)]
#     return  [token for token in regexed if token not in stopwords]#get rid of empty stuff
# udf_tokenize = udf(tokenize, ArrayType(StringType())) # if the function returns an int
# raw_df = raw_df.withColumn("tokenized_song", udf_changeToNumeric('lyrics'))
# raw_df.show(4)

+---------------+-----+-----+--------------------+---------------+----+-------------+
|         artist|genre|index|              lyrics|           song|year|numeric_genre|
+---------------+-----+-----+--------------------+---------------+----+-------------+
|beyonce-knowles|  Pop|    0|Oh baby, how you ...|      ego-remix|2009|            6|
|beyonce-knowles|  Pop|    1|playin' everythin...|   then-tell-me|2009|            6|
|beyonce-knowles|  Pop|    2|If you search
For...|        honesty|2009|            6|
|beyonce-knowles|  Pop|    3|Oh oh oh I, oh oh...|you-are-my-rock|2009|            6|
+---------------+-----+-----+--------------------+---------------+----+-------------+
only showing top 4 rows



In [7]:

# # get rid of special character and split, then tokenize
# raw_to_token = raw_df.select('index','lyrics').rdd.map(lambda x: (x[0],re.sub(r"[^a-zA-Z0-9]+", ' ', x[1]).split(" ") ))\
# .map(lambda x: (x[0],  tokenize(' '.join(x[1])) ))
# raw_to_token.take(1)

In [8]:


# hashingTF = HashingTF()
# tf = hashingTF.transform()

# # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# # First to compute the IDF vector and second to scale the term frequencies by IDF.
# tf.cache()
# idf = IDF().fit(tf)
# tfidf = idf.transform(tf)
# #
# # spark.mllib's IDF implementation provides an option for ignoring terms
# # which occur in less than a minimum number of documents.
# # In such cases, the IDF for these terms is set to 0.
# # This feature can be used by passing the minDocFreq value to the IDF constructor.
# idfIgnore = IDF(minDocFreq=2).fit(tf)
# tfidfIgnore = idfIgnore.transform(tf)

TypeError: transform() takes at least 2 arguments (1 given)

In [3]:
spark.read.parquet('tfidf.parquet').toDF('index','song').createOrReplaceTempView('tfidfTable')

In [4]:
tfidfDf = spark.read.table('tfidfTable')

In [5]:
#loadVocabList
vocabList = np.load('vocabList.npy')
vocabList[:10]

array([u'm', u'love', u'like', u'know', u're', u'll', u'oh', u'got',
       u'get', u'one'],
      dtype='<U16')

In [6]:
#vectorize vocab
def vectorizeDict(weightDict):
    global vocabList
    
    vector = []
    for vocab in vocabList:
        if vocab not in weightDict:
            vector.append(0.0)
        else:
            vector.append(weightDict[vocab])
    return vector

In [7]:
tfidfDf = tfidfDf.rdd.map(lambda x: (x[0], vectorizeDict(x[1])) ).toDF().select(col("_1").alias("index"), col("_2").alias("song"))

In [8]:
#all the types of genre
genreDf = spark.read.csv('genre.csv').select(col("_c0").alias("index"), col("_c1").alias("genre"))


In [None]:
combinedDf = tfidfDf.join(genreDf, tfidfDf.index == genreDf.index)

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint


combinedDf.cache()
parsedData = combinedDf.rdd.sample(False, 0.01).map(lambda x: LabeledPoint (x[2], x[1]))

# Build the model
model = LogisticRegressionWithLBFGS.train(parsedData)

# # Evaluating the model on training data
# labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
# print("Training Error = " + str(trainErr))
