In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, VectorAssembler, StringIndexer
import pandas as pd

In [2]:
spark = SparkSession.builder.appName('genre_N-B').getOrCreate()

In [3]:
lyric_data = spark.read.format('csv').option('header','true').load('genre_NB_wgenre.csv')
lyric_data.show()

+---+-----+--------------------+
|_c0|genre|     body_text_clean|
+---+-----+--------------------+
|  0|  Rap|drake scorpion fe...|
|  1|  Rap|drake scorpion ni...|
|  2|  Rap|drake scorpion go...|
|  3|  Rap|drake scorpion no...|
|  5|  Rap|drake scorpion el...|
|  7|  Rap|drake scorpion im...|
|  9|  Rap|drake scorpion mo...|
| 10|  Rap|drake scorpion ca...|
| 11|  Rap|drake scorpion sa...|
| 12|  Rap|drake feat jayz s...|
| 13|  Rap|drake scorpion ho...|
| 14|  Rap|drake scorpion pe...|
| 15|  Rap|drake scorpion su...|
| 17|  Rap|drake scorpion fi...|
| 19|  Rap|drake scorpion th...|
| 20|  Rap|drake scorpion bl...|
| 21|  Rap|drake feat michae...|
| 22|  Rap|drake feat static...|
| 23|  Rap|drake scorpion fi...|
| 24|  Rap|drake scorpion 14...|
+---+-----+--------------------+
only showing top 20 rows



In [4]:
from pyspark.sql.functions import length
data = lyric_data.withColumn('length', length(lyric_data['body_text_clean']))
data.show()

+---+-----+--------------------+------+
|_c0|genre|     body_text_clean|length|
+---+-----+--------------------+------+
|  0|  Rap|drake scorpion fe...|   610|
|  1|  Rap|drake scorpion ni...|   699|
|  2|  Rap|drake scorpion go...|   406|
|  3|  Rap|drake scorpion no...|   804|
|  5|  Rap|drake scorpion el...|   335|
|  7|  Rap|drake scorpion im...|   450|
|  9|  Rap|drake scorpion mo...|   520|
| 10|  Rap|drake scorpion ca...|   392|
| 11|  Rap|drake scorpion sa...|   653|
| 12|  Rap|drake feat jayz s...|   464|
| 13|  Rap|drake scorpion ho...|   626|
| 14|  Rap|drake scorpion pe...|   302|
| 15|  Rap|drake scorpion su...|   359|
| 17|  Rap|drake scorpion fi...|   251|
| 19|  Rap|drake scorpion th...|   373|
| 20|  Rap|drake scorpion bl...|   414|
| 21|  Rap|drake feat michae...|   324|
| 22|  Rap|drake feat static...|   564|
| 23|  Rap|drake scorpion fi...|   521|
| 24|  Rap|drake scorpion 14...|   573|
+---+-----+--------------------+------+
only showing top 20 rows



In [6]:
pos_to_neg_number = StringIndexer(inputCol="genre", outputCol='label')
tokenizer = Tokenizer(inputCol='body_text_clean', outputCol='token_lyrics')
hashingTF = HashingTF(inputCol='token_lyrics', outputCol='hash_lyrics')
idf = IDF(inputCol='hash_lyrics', outputCol='idf_lyrics')

In [7]:
from pyspark.ml.linalg import Vector

clean_up = VectorAssembler(inputCols=['idf_lyrics', 'length'], outputCol='features')

In [8]:
from pyspark.ml import Pipeline

data_prep_pipline = Pipeline(stages=[pos_to_neg_number, tokenizer, hashingTF, idf, clean_up])

In [9]:
cleaner = data_prep_pipline.fit(data)
cleaned = cleaner.transform(data)

In [11]:
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|(262145,[1353,907...|
|  2.0|(262145,[2437,538...|
|  2.0|(262145,[4914,576...|
|  2.0|(262145,[2437,306...|
|  2.0|(262145,[8342,231...|
|  2.0|(262145,[4402,107...|
|  2.0|(262145,[7597,834...|
|  2.0|(262145,[1576,410...|
|  2.0|(262145,[11160,13...|
|  2.0|(262145,[8342,106...|
|  2.0|(262145,[1353,232...|
|  2.0|(262145,[2089,392...|
|  2.0|(262145,[976,5325...|
|  2.0|(262145,[2325,690...|
|  2.0|(262145,[1353,648...|
|  2.0|(262145,[7617,217...|
|  2.0|(262145,[31308,47...|
|  2.0|(262145,[7218,834...|
|  2.0|(262145,[1353,834...|
|  2.0|(262145,[4914,678...|
+-----+--------------------+
only showing top 20 rows



In [13]:
(training, testing) = cleaned.randomSplit([0.7, 0.3])

In [14]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1.0, modelType='multinomial')
genre_predictor = nb.fit(training)

In [16]:
test_results = genre_predictor.transform(testing)
test_results.show()

+----+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| _c0|   genre|     body_text_clean|length|label|        token_lyrics|         hash_lyrics|          idf_lyrics|            features|       rawPrediction|         probability|prediction|
+----+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|1002|Pop/Rock|disturb immort de...|   465|  0.0|[disturb, immort,...|(262144,[5795,683...|(262144,[5795,683...|(262145,[5795,683...|[-3490.4995428511...|[1.0,5.9743810316...|       0.0|
|1006|Pop/Rock|disturb immort de...|   276|  0.0|[disturb, immort,...|(262144,[991,5795...|(262144,[991,5795...|(262145,[991,5795...|[-1698.4449226687...|[1.0,8.4608128113...|       0.0|
| 101|Pop/Rock|5 second summer y...|   260|  0.0|[5, second, summ

In [17]:
testing = testing.rdd

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
f1_eval = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='f1')
precision_eval = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='weightedPrecision')
recall_eval = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='weightedRecall')
acc = acc_eval.evaluate(test_results)
f1 = f1_eval.evaluate(test_results)
precision = precision_eval.evaluate(test_results)
recall = recall_eval.evaluate(test_results)

print(f"The accuracy of the model is: {acc}")
print(f"The f1 of the model is: {f1}")
print(f"The precision of the model is: {precision}")
print(f"The recall of the model is: {recall}")

The accuracy of the model is: 0.7746005046257359
The f1 of the model is: 0.7222035910804357
The precision of the model is: 0.7608718634663412
The recall of the model is: 0.7746005046257359


In [20]:
import pickle

f = open('NB_genre_classifier.pickle', 'wb')
pickle.dump(genre_predictor, f)
f.close()

Py4JError: An error occurred while calling o210.__getstate__. Trace:
py4j.Py4JException: Method __getstate__([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)

