In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, VectorAssembler, StringIndexer
import pandas as pd

In [2]:
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('N-B')
    .getOrCreate()
)

In [3]:
# Read the cleaned lyrics CSV, taking the column names from the file's header row.
lyric_data = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('more_lyrics_cleaned.csv')
)
lyric_data.show()

+---+--------+--------------------+-------------+
|_c0|explicit|     body_text_clean|explicit text|
+---+--------+--------------------+-------------+
|  0|       1|drake feel trap t...|          yes|
|  1|       1|dj khale feat jus...|          yes|
|  2|       1|maroon 5 feat car...|          yes|
|  3|       1|cardi b feat bad ...|          yes|
|  4|       1|6ix9in feat nicki...|          yes|
|  5|       0|kenni chesney get...|           no|
|  6|       0|imagin dragon nat...|           no|
|  7|       1|post malon better...|          yes|
|  8|       1|florida georgia l...|          yes|
|  9|       0|demi lovato sober...|           no|
| 10|       1|5 second summer y...|          yes|
| 11|       0|lauren daigl say ...|           no|
| 12|       1|brett young merci...|          yes|
| 13|       1|juic wrld lucid d...|          yes|
| 14|       1|khalid normani lo...|          yes|
| 15|       1|taylor swift deli...|          yes|
| 16|       1|jason aldean feat...|          yes|


In [4]:
from pyspark.sql.functions import length

# Add a 'length' column holding the length of each cleaned lyric string.
lyric_length = length(lyric_data['body_text_clean'])
data = lyric_data.withColumn('length', lyric_length)
data.show()

+---+--------+--------------------+-------------+------+
|_c0|explicit|     body_text_clean|explicit text|length|
+---+--------+--------------------+-------------+------+
|  0|       1|drake feel trap t...|          yes|   601|
|  1|       1|dj khale feat jus...|          yes|   872|
|  2|       1|maroon 5 feat car...|          yes|   570|
|  3|       1|cardi b feat bad ...|          yes|   906|
|  4|       1|6ix9in feat nicki...|          yes|   617|
|  5|       0|kenni chesney get...|           no|   388|
|  6|       0|imagin dragon nat...|           no|   408|
|  7|       1|post malon better...|          yes|   538|
|  8|       1|florida georgia l...|          yes|   355|
|  9|       0|demi lovato sober...|           no|   281|
| 10|       1|5 second summer y...|          yes|   474|
| 11|       0|lauren daigl say ...|           no|   282|
| 12|       1|brett young merci...|          yes|   289|
| 13|       1|juic wrld lucid d...|          yes|   525|
| 14|       1|khalid normani lo

In [5]:
# Define the individual feature-engineering stages; they are chained into a
# Pipeline further down in the notebook.

# Encode the 'explicit text' strings as numeric labels. Alphabetical ordering pins
# the mapping to 'no' -> 0.0 and 'yes' -> 1.0. The StringIndexer default
# (frequencyDesc) gives 0.0 to whichever class is most common, so the encoding
# could silently flip if the class balance of the input file changes.
pos_to_neg_number = StringIndexer(
    inputCol='explicit text',
    outputCol='label',
    stringOrderType='alphabetAsc',
)

# Split each cleaned lyric string into a list of tokens.
tokenizer = Tokenizer(inputCol='body_text_clean', outputCol='token_lyrics')

# Hash the tokens into a term-frequency vector (default number of features).
hashingTF = HashingTF(inputCol='token_lyrics', outputCol='hash_lyrics')

# Re-weight the term frequencies by inverse document frequency.
idf = IDF(inputCol='hash_lyrics', outputCol='idf_lyrics')

In [6]:
# Assemble the final feature vector.
from pyspark.ml.linalg import Vector

# Concatenate the TF-IDF vector and the lyric length into a single 'features' column.
feature_columns = ['idf_lyrics', 'length']
clean_up = VectorAssembler(inputCols=feature_columns, outputCol='features')


In [7]:
# Build the data-preparation pipeline (it is fitted in a later cell).
from pyspark.ml import Pipeline

# Stage order matters: label indexing, tokenising, hashing, IDF weighting,
# and finally assembling the feature vector.
preprocessing_stages = [pos_to_neg_number, tokenizer, hashingTF, idf, clean_up]
data_prep_pipeline = Pipeline(stages=preprocessing_stages)

In [8]:
# Fit the preprocessing pipeline on the full dataset, then apply the fitted
# stages to that same dataset to produce the 'label' and 'features' columns.
# NOTE(review): the pipeline is fitted on all rows before the train/test split
# performed in a later cell, so the fitted stages (e.g. the IDF weights) are
# learned from rows that end up in the test set -- consider fitting on the
# training split only.
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)

In [9]:
# Preview only the two columns the classifier consumes.
cleaned.select('label', 'features').show()


+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262145,[1353,907...|
|  1.0|(262145,[2437,705...|
|  1.0|(262145,[5381,844...|
|  1.0|(262145,[1133,243...|
|  1.0|(262145,[1353,247...|
|  0.0|(262145,[1353,177...|
|  0.0|(262145,[1879,576...|
|  1.0|(262145,[991,3924...|
|  1.0|(262145,[2437,110...|
|  0.0|(262145,[4914,576...|
|  1.0|(262145,[9129,112...|
|  0.0|(262145,[4900,147...|
|  1.0|(262145,[24150,25...|
|  1.0|(262145,[4016,631...|
|  1.0|(262145,[976,2325...|
|  1.0|(262145,[2325,306...|
|  1.0|(262145,[1353,243...|
|  1.0|(262145,[3763,759...|
|  0.0|(262145,[1879,576...|
|  0.0|(262145,[3091,988...|
+-----+--------------------+
only showing top 20 rows



In [10]:
# Split the prepared data into roughly 70% training and 30% testing rows.
# A fixed seed keeps the split -- and therefore the metrics reported below --
# repeatable across "Restart & Run All"; without it every run samples differently.
# NOTE(review): reproducibility also assumes the input file and its partitioning
# are unchanged between runs.
(training, testing) = cleaned.randomSplit([0.7, 0.3], seed=42)

In [11]:
# Train a multinomial Naive Bayes classifier on the training split.
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(
    modelType='multinomial',
    smoothing=1.0,
)
explicit_predictor = nb.fit(training)

In [12]:
# Score the held-out testing rows with the fitted model; the captured output
# shows the added 'rawPrediction', 'probability' and 'prediction' columns.
test_results = explicit_predictor.transform(testing)
# Displays every column of the scored rows, which makes for a very wide table.
test_results.show()

+----+--------+--------------------+-------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| _c0|explicit|     body_text_clean|explicit text|length|label|        token_lyrics|         hash_lyrics|          idf_lyrics|            features|       rawPrediction|         probability|prediction|
+----+--------+--------------------+-------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   0|       1|drake feel trap t...|          yes|   601|  1.0|[drake, feel, tra...|(262144,[1353,907...|(262144,[1353,907...|(262145,[1353,907...|[-3526.9459410295...|[1.53020773254419...|       1.0|
|   1|       1|dj khale feat jus...|          yes|   872|  1.0|[dj, khale, feat,...|(262144,[2437,705...|(262144,[2437,705...|(262145,[2437,705...|[-6552.6312131449...|[2.86325470314075...|       

In [13]:
# NOTE(review): this rebinds `testing` from the DataFrame returned by randomSplit
# to its underlying RDD. After this cell runs, re-running the earlier
# `explicit_predictor.transform(testing)` cell would receive an RDD instead of a
# DataFrame, and running this cell a second time would presumably fail as well.
# Nothing visible below uses the RDD -- confirm whether this cell is still
# needed, or bind the RDD to a separate name.
testing = testing.rdd

In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Every evaluator compares the same label/prediction columns; only the metric differs.
column_args = {'labelCol': 'label', 'predictionCol': 'prediction'}

acc_eval = MulticlassClassificationEvaluator(metricName='accuracy', **column_args)
f1_eval = MulticlassClassificationEvaluator(metricName='f1', **column_args)
precision_eval = MulticlassClassificationEvaluator(metricName='weightedPrecision', **column_args)
recall_eval = MulticlassClassificationEvaluator(metricName='weightedRecall', **column_args)

# Score the held-out predictions once per metric, in the same order as the originals.
acc, f1, precision, recall = [
    evaluator.evaluate(test_results)
    for evaluator in (acc_eval, f1_eval, precision_eval, recall_eval)
]

# Precision and recall here are the class-weighted variants requested above.
print(f"The accuracy of the model is: {acc}")
print(f"The f1 of the model is: {f1}")
print(f"The precision of the model is: {precision}")
print(f"The recall of the model is: {recall}")


The accuracy of the model is: 0.8385518590998043
The f1 of the model is: 0.8300037689058399
The precision of the model is: 0.8394750421576185
The recall of the model is: 0.8385518590998043


### Logistic Regression

Coefficients: (262145,[],[])
Intercept: (262145,[],[])


AttributeError: 'BinaryLogisticRegressionTrainingSummary' object has no attribute 'r2'