# Main Task

## Libraries

In [6]:
!pip install pyspark


Defaulting to user installation because normal site-packages is not writeable


In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col, split
from pyspark.sql.functions import avg
from pyspark.sql.functions import desc
from pyspark.sql.functions import count

## Instantiating SparkSession & Load the Dataset

In [8]:
spark = SparkSession.builder \
    .appName("Spotify Data Analysis") \
    .getOrCreate()

In [9]:
df = spark.read.parquet("data/spotify.parquet")

## Check the Schema

In [10]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- album: string (nullable = true)
 |-- album_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- artist_ids: string (nullable = true)
 |-- track_number: long (nullable = true)
 |-- disc_number: long (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: long (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: long (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_ms: long (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- year: long (nullable = true)
 |-- release_date: string (nullable = true)





1. **`id`**: The unique identifier for each track on Spotify. This is typically a string of characters that Spotify uses to uniquely identify a song in their system.

2. **`name`**: The name of the track. This is a string that represents the title of the song.

3. **`album`**: The name of the album on which the track appears. This helps in identifying collections of songs released together by an artist.

4. **`album_id`**: Similar to the track ID, this is a unique identifier for the album. Useful for linking tracks to specific albums.

5. **`artists`**: The name of the artist or artists who performed or created the track. This field might contain multiple names if the track has multiple artists.

6. **`artist_ids`**: A unique identifier or identifiers for the artist or artists. Like `artists`, this can include multiple IDs separated by a delimiter if there are multiple artists.

7. **`track_number`**: The track's position on its album. This is a numerical order of where the track appears on its respective album.

8. **`disc_number`**: The number of the disc on which the track appears if the album has multiple discs.

9. **`explicit`**: A boolean value indicating whether the track has explicit content. True means there is explicit content; false means there isn't.

10. **`danceability`**: A measure from 0.0 to 1.0 representing how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity.

11. **`energy`**: A measure from 0.0 to 1.0 that represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.

12. **`key`**: The key the track is in. Integers map to pitches using standard Pitch Class notation.

13. **`loudness`**: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks.

14. **`mode`**: Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0.

15. **`speechiness`**: Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g., talk show, audio book, poetry), the closer to 1.0 the attribute value.

16. **`acousticness`**: A measure from 0.0 to 1.0 of how acoustic a track is. A score of 1.0 means the track is most likely an acoustic one.

17. **`instrumentalness`**: Predicts whether a track contains no vocals. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content.

18. **`liveness`**: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live.

19. **`valence`**: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g., happy, cheerful, euphoric).

20. **`tempo`**: The overall estimated tempo of a track in beats per minute (BPM).

21. **`duration_ms`**: The duration of the track in milliseconds.

22. **`time_signature`**: An estimated overall time signature of a track. The time signature (meter) is a notational convention to specify how many beats are in each bar (or measure).

23. **`year`**: The year in which the track was released.

24. **`release_date`**: T of insights can be extracted from each attribute.he exact date on which the track was released.

In [21]:
df.show(1)

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+----+------------+
|                  id|   name|               album|            album_id|             artists|          artist_ids|track_number|disc_number|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|time_signature|year|release_date|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+----+------------+
|7lmeHLHBe4nmXzuXc...|Testify|The Battle Of Los...|2eia0myWFgoHuttJy...|['Rage Against Th...|['2d0hyoQ5

## Preprocess the Data

In [28]:
df = df.withColumn("release_date", to_date("release_date", "yyyy-MM-dd"))

In [29]:
df = df.withColumn("artists", split(col("artists"), ";"))

In [30]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- album: string (nullable = true)
 |-- album_id: string (nullable = true)
 |-- artists: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- artist_ids: string (nullable = true)
 |-- track_number: long (nullable = true)
 |-- disc_number: long (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: long (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: long (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_ms: long (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- year: long (nullable = true)
 |-- release_date: date (nullable 

In [27]:
df.show(1)

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+----+------------+--------------------+
|                  id|   name|               album|            album_id|             artists|          artist_ids|track_number|disc_number|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|time_signature|year|release_date|       artists_array|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+----+------------+--------------------+
|7lmeHLHBe4nmXzuXc...|Testify|The Battle

## Data Aggregation, filtering and Transformation

#### Average Features by Year (danceability, energy, tempo):

In [37]:
average_features_by_year = df.groupBy("year").agg(
    avg("danceability").alias("avg_danceability"),
    avg("energy").alias("avg_energy"),
    avg("tempo").alias("avg_tempo")
).orderBy("year")
average_features_by_year.show()


+----+-------------------+-------------------+------------------+
|year|   avg_danceability|         avg_energy|         avg_tempo|
+----+-------------------+-------------------+------------------+
|   0|             0.7655|             0.5016|115.98100000000002|
|1900| 0.4838571428571428|0.28219285714285713|128.72857142857143|
|1908| 0.6041052631578946|0.47369473684210534|131.79973684210526|
|1909|            0.61756|0.48479999999999995|111.96863999999998|
|1917| 0.5069423076923076|0.25368711538461536|118.50019230769229|
|1920|             0.5626|0.20966399999999996|         110.69708|
|1923|            0.62888|0.23725200000000005|120.64765999999997|
|1924| 0.6391388888888889| 0.3607916666666667|124.76100000000002|
|1925| 0.5031632653061224|0.15212653061224485|113.51040816326527|
|1926| 0.6359999999999999|0.26212336448598117|113.50233644859814|
|1927| 0.6635624999999999| 0.3003770833333333|115.02968750000001|
|1928| 0.5984912280701754|0.29645701754385967|111.89632456140346|
|1929|    

#### Count of Tracks per Artist:

In [42]:
tracks_per_artist = df.groupBy("artists").count().orderBy(desc("count"))
tracks_per_artist.show()

+--------------------+-----+
|             artists|count|
+--------------------+-----+
|[['Various Artist...| 1753|
|[['Vitamin String...| 1577|
|[['Aretha Frankli...| 1209|
|[['The City of Pr...| 1042|
|[["Dan Gibson's S...|  997|
|     [['Bob Dylan']]|  913|
|      [['The Fall']]|  812|
|[["Pickin' On Ser...|  774|
|  [['Dolly Parton']]|  747|
|[['Guided By Voic...|  739|
|[['Dionne Warwick']]|  728|
| [['Fleetwood Mac']]|  696|
| [['Elvis Presley']]|  645|
|[['Revolt Product...|  627|
|[['Mannheim Steam...|  619|
|   [['Udo Jürgens']]|  615|
|[['Emmylou Harris']]|  607|
| [['Barry Manilow']]|  605|
|    [['Al Jarreau']]|  597|
|[['Grant Macdonal...|  587|
+--------------------+-----+
only showing top 20 rows



## Exploding Array Columns

## Investigating Top-K Records

In [45]:
k = 10

### Top-k Tracks by Danceability

In [48]:
top_k_danceable_tracks_ids = df.select("id", "danceability").orderBy(desc("danceability")).limit(10)
top_k_danceable_tracks_ids.show()

+--------------------+------------+
|                  id|danceability|
+--------------------+------------+
|3BoKFYyanZpH1TPfl...|         1.0|
|58zc19A4974ofVuf9...|       0.994|
|6JiORXg2PysThO2l3...|       0.993|
|0U0M7HdfsKbyvPfSt...|       0.992|
|5u16CUz0AZUl7pVnW...|       0.991|
|0P1E4YNHBmOTMMS4r...|       0.991|
|5OpZKcjLyzTy82V0s...|        0.99|
|1gHafreKtfnubZ3A7...|        0.99|
|7CIN4HxftOh4KEwfL...|       0.988|
|2epnNWDJdDACv6wFL...|       0.988|
+--------------------+------------+



### Top-k Artists with the Highest Average Danceability

In [56]:
top_danceable_artists = df.groupBy("artists").agg(avg("danceability").alias("avg_danceability")).orderBy(desc("avg_danceability")).limit(k)
top_danceable_artists.show()

+--------------------+----------------+
|             artists|avg_danceability|
+--------------------+----------------+
|[['Andrea HEinste...|           0.992|
|[['Petey Pablo', ...|           0.987|
|           [['Gen']]|           0.987|
|[['Audio Soul Pro...|           0.986|
|[['LILDRUGHILL', ...|           0.984|
|[['Father', 'Ethe...|           0.984|
|[['Super Flu', 'A...|           0.984|
|        [['Baiyon']]|           0.984|
|[['Fusion Groove ...|           0.983|
|[['Tela', 'AK', '...|           0.981|
+--------------------+----------------+



### Top-k Recent Tracks

In [51]:
top_k_recent_tracks_ids = df.select("id", "release_date").orderBy(desc("release_date")).limit(k)
top_k_recent_tracks_ids.show()

+--------------------+------------+
|                  id|release_date|
+--------------------+------------+
|2nfBUx5R5q3BFZgmH...|  2020-12-18|
|6BFF8inuJZQkUi5oa...|  2020-12-18|
|0wZVfNaHdM89qAvNx...|  2020-12-18|
|5EepKUm7NmkPL3O3R...|  2020-12-18|
|0N4dnVnn5J6aiqrNp...|  2020-12-11|
|5ppAcbt2Y7gOhpklB...|  2020-12-11|
|1To9beWJvBwakDgyX...|  2020-12-11|
|7p3Kie6LLKqOARpi6...|  2020-12-11|
|640uPRDZr35rSH1Eg...|  2020-12-11|
|6U42lylN4GFcawK5g...|  2020-12-11|
+--------------------+------------+



### Top-k Longest Tracks

In [54]:
top_k_longest_tracks_ids = df.select("id", "duration_ms").orderBy(desc("duration_ms")).limit(k)
top_k_longest_tracks_ids.show()

+--------------------+-----------+
|                  id|duration_ms|
+--------------------+-----------+
|4aHjDZQr8QcEGhQ1M...|    6061090|
|2va3aUhF7lw90dQA3...|    6054655|
|4IGVZfgW44fYYPyPH...|    5764624|
|6gwP9VJp0SNlzhGRc...|    5713196|
|2tFaZzQHauQzezlYF...|    5679399|
|07o6WfMt7dGQrx4yd...|    5646226|
|4frGUT4FTk5lkQEhc...|    5645108|
|3M1In6GNpQ1q3boy2...|    5577278|
|1NFOGheRUq9Nlf5T8...|    5531591|
|1mFE7qiOOY9TjIdlz...|    5440375|
+--------------------+-----------+



### Top-k Most Frequently Appearing Artists

In [55]:
top_artists = df.groupBy("artists").agg(count("id").alias("track_count")).orderBy(desc("track_count")).limit(k)
top_artists.show()

+--------------------+-----------+
|             artists|track_count|
+--------------------+-----------+
|[['Various Artist...|       1753|
|[['Vitamin String...|       1577|
|[['Aretha Frankli...|       1209|
|[['The City of Pr...|       1042|
|[["Dan Gibson's S...|        997|
|     [['Bob Dylan']]|        913|
|      [['The Fall']]|        812|
|[["Pickin' On Ser...|        774|
|  [['Dolly Parton']]|        747|
|[['Guided By Voic...|        739|
+--------------------+-----------+



In [57]:
spark.stop()