In [1]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
spotify_data =pd.read_csv('music_streaming.csv')


In [3]:
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15517 entries, 0 to 15516
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         15517 non-null  object 
 1   Track Name          15517 non-null  object 
 2   Popularity          15123 non-null  float64
 3   danceability        15517 non-null  float64
 4   energy              15517 non-null  float64
 5   key                 13774 non-null  float64
 6   loudness            15517 non-null  float64
 7   mode                15517 non-null  int64  
 8   speechiness         15517 non-null  float64
 9   acousticness        15517 non-null  float64
 10  instrumentalness    11930 non-null  float64
 11  liveness            15517 non-null  float64
 12  valence             15517 non-null  float64
 13  tempo               15517 non-null  float64
 14  duration_in min/ms  15517 non-null  float64
 15  time_signature      15517 non-null  int64  
 16  Genr

In [4]:
columns_to_impute = spotify_data.columns.drop(['Artist Name', 'Track Name'])
spotify_data_for_imputation = spotify_data[columns_to_impute]

# Initialize IterativeImputer
imputer = IterativeImputer(random_state=0)

# Perform imputation
spotify_data_imputed = imputer.fit_transform(spotify_data_for_imputation)

# Convert the result back to DataFrame
spotify_data_imputed = pd.DataFrame(spotify_data_imputed, columns=columns_to_impute)

# Concatenate 'Artist Name' and 'Track Name' back to the DataFrame
spotify_data_imputed[['Artist Name', 'Track Name']] = spotify_data[['Artist Name', 'Track Name']]


In [5]:
spotify_data_imputed.isnull().sum()

Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
Genre                 0
Artist Name           0
Track Name            0
dtype: int64

In [6]:
# Handling Duplicates
spotify_data_imputed.drop_duplicates(inplace=True)


In [7]:
spotify_data_imputed.to_csv('cleaned_spotify_dataset.csv', index=False)


In [8]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=52ae3b9d5ab23c1e46867f3413ca2e7462eb22a5c2a1ff08f0ac583883d71ece
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [9]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("GenrePopularityAnalysis") \
    .getOrCreate()

# Load the dataset into a Spark DataFrame
spotify_imputed_df = spark.read.csv("cleaned_spotify_dataset.csv", header=True, inferSchema=True)

# Register the DataFrame as a temporary view
spotify_imputed_df.createOrReplaceTempView("spotify_imputed")


In [10]:
# Query to find the genre with the highest average popularity
highest_avg_popularity_genre = spark.sql("""
    SELECT Genre, AVG(Popularity) AS AvgPopularity
    FROM spotify_imputed
    GROUP BY Genre
    ORDER BY AvgPopularity DESC
    LIMIT 1
""")

# Show the result
highest_avg_popularity_genre.show()

+-----+------------------+
|Genre|     AvgPopularity|
+-----+------------------+
|  4.0|56.727300938594105|
+-----+------------------+



In [11]:
artist_song_count = spark.sql("""
    SELECT `Artist Name`, COUNT(*) AS SongCount
    FROM spotify_imputed
    WHERE `duration_in min/ms` > 5
    GROUP BY `Artist Name`
    ORDER BY SongCount DESC
""")

# Show the result
artist_song_count.show()

+--------------------+---------+
|         Artist Name|SongCount|
+--------------------+---------+
|  The Rolling Stones|       32|
|                  U2|       27|
|           Metallica|       27|
|             Nirvana|       22|
|      The Black Keys|       22|
|               AC/DC|       22|
|        Led Zeppelin|       21|
|            Coldplay|       20|
|The Smashing Pump...|       18|
|       Fleetwood Mac|       18|
|           Pearl Jam|       18|
|         The Killers|       17|
|           Aerosmith|       17|
|         Arcade Fire|       16|
|            Deftones|       16|
|           The Doors|       16|
|Creedence Clearwa...|       16|
|            The Cure|       15|
|           Van Halen|       15|
|      Arctic Monkeys|       15|
+--------------------+---------+
only showing top 20 rows



In [12]:
genre_song_count = spark.sql("""
    SELECT Genre, COUNT(*) AS SongCount
    FROM spotify_imputed
    GROUP BY Genre
    ORDER BY SongCount DESC
""")

# Show the result
genre_song_count.show()

+-----+---------+
|Genre|SongCount|
+-----+---------+
| 10.0|     4264|
|  6.0|     2263|
|  9.0|     1828|
|  8.0|     1704|
|  1.0|     1268|
|  5.0|     1210|
|  2.0|     1182|
|  0.0|      586|
|  7.0|      465|
|  4.0|      376|
|  3.0|      371|
+-----+---------+



In [13]:
top_artists = spark.sql("""
    SELECT `Artist Name`, AVG(Popularity) AS AvgPopularity
    FROM spotify_imputed
    GROUP BY `Artist Name`
    ORDER BY AvgPopularity DESC
    LIMIT 10
""")

# Show the result
top_artists.show()

+--------------------+-------------+
|         Artist Name|AvgPopularity|
+--------------------+-------------+
|            Måneskin|        100.0|
|The Kid LAROI, Ju...|         97.0|
|       Doja Cat, SZA|         95.0|
|     Los Legendarios|         95.0|
|Justin Bieber, Da...|         95.0|
|The Weeknd, Arian...|         94.0|
|          Nio Garcia|         93.0|
|Riton, Nightcrawl...|         92.0|
|                 ATB|         91.0|
|          Tion Wayne|         90.0|
+--------------------+-------------+



In [14]:
party_songs_recommendation = spark.sql("""
    SELECT `Artist Name`, `Track Name`, energy, danceability, valence
    FROM spotify_imputed
    WHERE energy >= 0.8 AND danceability >= 0.8 AND valence >= 0.8
    ORDER BY rand()
    LIMIT 5
""")

party_songs_recommendation.show()

+--------------------+--------------------+------+------------+-------+
|         Artist Name|          Track Name|energy|danceability|valence|
+--------------------+--------------------+------+------------+-------+
|Interplanetary Cr...|       Supreme Level| 0.846|       0.824|  0.887|
|               Cameo|             Word Up|  0.84|       0.878|  0.882|
|               Riton|Friday (feat. Muf...| 0.862|       0.824|  0.801|
|        David Banner|         Like A Pimp| 0.846|       0.844|   0.91|
|  Traveling Wilburys|Tweeter And The M...| 0.861|       0.803|  0.899|
+--------------------+--------------------+------+------------+-------+



In [15]:
from pyspark.sql.functions import col, avg, count, desc


In [16]:
# a) Which genre has the highest average popularity?
highest_avg_popularity_genre = spotify_imputed_df.groupBy("Genre").agg(avg("Popularity").alias("AvgPopularity")) \
    .orderBy(desc("AvgPopularity")).limit(1)
highest_avg_popularity_genre.show()

+-----+------------------+
|Genre|     AvgPopularity|
+-----+------------------+
|  4.0|56.727300938594105|
+-----+------------------+



In [17]:
# b) Display which artists have recorded the most number of songs with a duration of more than 5 minutes
most_songs_duration_gt_5 = spotify_imputed_df.filter(col("duration_in min/ms") > 5).groupBy("Artist Name").agg(count("*").alias("SongCount")).orderBy(desc("SongCount")).limit(20)
most_songs_duration_gt_5.show()

+--------------------+---------+
|         Artist Name|SongCount|
+--------------------+---------+
|  The Rolling Stones|       32|
|                  U2|       27|
|           Metallica|       27|
|      The Black Keys|       22|
|             Nirvana|       22|
|               AC/DC|       22|
|        Led Zeppelin|       21|
|            Coldplay|       20|
|The Smashing Pump...|       18|
|       Fleetwood Mac|       18|
|           Pearl Jam|       18|
|         The Killers|       17|
|           Aerosmith|       17|
|         Arcade Fire|       16|
|            Deftones|       16|
|           The Doors|       16|
|Creedence Clearwa...|       16|
|            The Cure|       15|
|           Van Halen|       15|
|                Beck|       15|
+--------------------+---------+



In [18]:
songs_in_genre=spotify_imputed_df.groupBy("Genre").agg(count("*").alias("SongCount")).orderBy("SongCount", ascending=False)
songs_in_genre.show()

+-----+---------+
|Genre|SongCount|
+-----+---------+
| 10.0|     4264|
|  6.0|     2263|
|  9.0|     1828|
|  8.0|     1704|
|  1.0|     1268|
|  5.0|     1210|
|  2.0|     1182|
|  0.0|      586|
|  7.0|      465|
|  4.0|      376|
|  3.0|      371|
+-----+---------+



In [19]:
dominant_artists = spotify_imputed_df.groupBy("Artist Name").agg(avg("Popularity").alias("AvgPopularity")).orderBy(desc("AvgPopularity")).limit(20)
dominant_artists.show()


+--------------------+-------------+
|         Artist Name|AvgPopularity|
+--------------------+-------------+
|            Måneskin|        100.0|
|The Kid LAROI, Ju...|         97.0|
|       Doja Cat, SZA|         95.0|
|     Los Legendarios|         95.0|
|Justin Bieber, Da...|         95.0|
|The Weeknd, Arian...|         94.0|
|          Nio Garcia|         93.0|
|Riton, Nightcrawl...|         92.0|
|                 ATB|         91.0|
|             Cardi B|         90.0|
|          Tion Wayne|         90.0|
|    Dua Lipa, DaBaby|         90.0|
|          The Weeknd|        89.25|
|Galantis, David G...|         89.0|
|  Majestic, Boney M.|         89.0|
|        Nathan Evans|         89.0|
|                 SZA|         89.0|
|    Trinidad Cardona|         88.0|
|  Travis Scott, HVME|         88.0|
|        Bella Poarch|         88.0|
+--------------------+-------------+



In [20]:
fun_party_songs = spotify_imputed_df.filter((col("energy") >= 0.8) & (col("danceability") >= 0.7) & (col("valence") >= 0.8)) \
    .orderBy(desc("Popularity")).select("Artist Name", "Track Name", "energy", "danceability", "valence").limit(5)
fun_party_songs.show()


+--------------------+--------------------+------+------------+-------+
|         Artist Name|          Track Name|energy|danceability|valence|
+--------------------+--------------------+------+------------+-------+
|               Riton|Friday (feat. Muf...| 0.862|       0.824|  0.801|
|Riton, Nightcrawl...|Friday (feat. Muf...| 0.862|       0.824|  0.801|
|          Joel Corry|Head & Heart (fea...| 0.874|       0.734|  0.905|
|    Dua Lipa, DaBaby|Levitating (feat....| 0.825|       0.702|  0.915|
|        Daddy Yankee|            PROBLEMA|   0.8|       0.773|  0.842|
+--------------------+--------------------+------+------------+-------+



In [28]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


In [32]:
spark = SparkSession.builder \
    .appName("GenrePopularityAnalysis") \
    .getOrCreate()

# Load the cleaned dataset into a Spark DataFrame
spotify_imputed_df = spark.read.csv("cleaned_spotify_dataset.csv", header=True, inferSchema=True)

In [33]:
feature_columns = ['Popularity', 'danceability', 'energy', 'key', 'loudness', 'mode',
                   'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                   'valence', 'tempo', 'duration_in min/ms', 'time_signature']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
spotify_imputed_df = assembler.transform(spotify_imputed_df)

In [34]:
(training_data, testing_data) = spotify_imputed_df.randomSplit([0.8, 0.2], seed=42)


In [35]:
classifiers = [
    RandomForestClassifier(labelCol="Genre", featuresCol="features", seed=42),
    DecisionTreeClassifier(labelCol="Genre", featuresCol="features", seed=42),
    LogisticRegression(labelCol="Genre", featuresCol="features", maxIter=10)
]


In [39]:
evaluator = MulticlassClassificationEvaluator(labelCol="Genre", metricName="accuracy")
best_classifier = None
best_accuracy = 0.0

for classifier in classifiers:
    # Train the classifier
    model = classifier.fit(training_data)

    # Make predictions on the testing data
    predictions = model.transform(testing_data)

    # Evaluate the accuracy
    accuracy = evaluator.evaluate(predictions)

    # Print the accuracy for each classifier
    print("{} Accuracy: {:.2f}%".format(classifier.__class__.__name__, accuracy * 100))

    # Check if this classifier has the highest accuracy so far
    if accuracy > best_accuracy:
        best_classifier = classifier
        best_accuracy = accuracy

RandomForestClassifier Accuracy: 44.88%
DecisionTreeClassifier Accuracy: 44.48%
LogisticRegression Accuracy: 48.49%


In [40]:
# Print the best classifier based on accuracy
print("\nBest Classifier based on Accuracy: {}".format(best_classifier.__class__.__name__))


Best Classifier based on Accuracy: LogisticRegression


In [27]:
spark.stop()
