In [1]:
import cv2
import numpy as np
import pandas as pd

from sklearn import cluster
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import lightgbm
from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt

In [2]:
# Forward slash instead of "songs\data.csv.zip": the backslash form only resolves
# on Windows, and "\d" is an invalid string escape that newer Pythons warn about.
df = pd.read_csv("songs/data.csv.zip", compression="zip")
df

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.9950,['Carl Woitschach'],0.708,158648,0.1950,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563000,10,0.1510,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.7790,1928
1,0.9940,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901000,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.6040,['Seweryn Goszczyński'],0.749,104300,0.2200,0,6L63VW0PibdM1HDSBoqnoM,0.000000,5,0.1190,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.9290,107.177,0.8800,1928
3,0.9950,['Francisco Canaro'],0.781,180760,0.1300,0,6M94FkXd15sOAOQYRnWPN8,0.887000,1,0.1110,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.7200,1928
4,0.9900,"['Frédéric Chopin', 'Vladimir Horowitz']",0.210,687733,0.2040,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908000,11,0.0980,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169904,0.1730,"['DripReport', 'Tyga']",0.875,163800,0.4430,1,4KppkflX7I3vJQk7urOJaS,0.000032,1,0.0891,-7.461,1,Skechers (feat. Tyga) - Remix,75,2020-05-15,0.1430,100.012,0.3060,2020
169905,0.0167,"['Leon Bridges', 'Terrace Martin']",0.719,167468,0.3850,0,1ehhGlTvjtHo2e4xJFB0SZ,0.031300,8,0.1110,-10.907,1,Sweeter (feat. Terrace Martin),64,2020-06-08,0.0403,128.000,0.2700,2020
169906,0.5380,"['Kygo', 'Oh Wonder']",0.514,180700,0.5390,0,52eycxprLhK3lPcRLbQiVk,0.002330,7,0.1080,-9.332,1,How Would I Know,70,2020-05-29,0.1050,123.700,0.1530,2020
169907,0.0714,"['Cash Cash', 'Andy Grammer']",0.646,167308,0.7610,0,3wYOGJYD31sLRmBgCvWxa4,0.000000,1,0.2220,-2.557,1,I Found You,70,2020-02-28,0.0385,129.916,0.4720,2020


In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169909 entries, 0 to 169908
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      169909 non-null  float64
 1   artists           169909 non-null  object 
 2   danceability      169909 non-null  float64
 3   duration_ms       169909 non-null  int64  
 4   energy            169909 non-null  float64
 5   explicit          169909 non-null  int64  
 6   id                169909 non-null  object 
 7   instrumentalness  169909 non-null  float64
 8   key               169909 non-null  int64  
 9   liveness          169909 non-null  float64
 10  loudness          169909 non-null  float64
 11  mode              169909 non-null  int64  
 12  name              169909 non-null  object 
 13  popularity        169909 non-null  int64  
 14  release_date      169909 non-null  object 
 15  speechiness       169909 non-null  float64
 16  tempo             16

In [4]:
# De-duplicate on the track title, keeping the first row seen for each name.
# Performed in place so that every later cell works on the reduced frame.
df.drop_duplicates(subset=["name"], keep="first", inplace=True)

# Handle on the title column; a later cell copies it into "song_name".
name = df["name"]

In [5]:
# Audio descriptors that span the clustering feature space.
col_features = [
    "liveness", "danceability", "energy",
    "valence", "loudness", "instrumentalness",
]

# Min-max scale the selected columns so they share a common range before
# distances are computed.
X = MinMaxScaler().fit_transform(df[col_features])

# Two-cluster k-means with a fixed seed; build the estimator first, then fit it.
kmean = cluster.KMeans(n_clusters=2, init="k-means++", random_state=42)
kmean.fit(X)

# Attach each row's cluster id to the frame.
df["kmeans"] = kmean.labels_

In [6]:
# Add a "song_name" column populated from `name`, the title Series taken from
# df["name"] in an earlier cell (values are matched by index label).
df["song_name"] = name

df

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,mode,name,popularity,release_date,speechiness,tempo,valence,year,kmeans,song_name
0,0.9950,['Carl Woitschach'],0.708,158648,0.1950,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563000,10,0.1510,...,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.7790,1928,1,Singende Bataillone 1. Teil
1,0.9940,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901000,8,0.0763,...,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928,1,"Fantasiestücke, Op. 111: Più tosto lento"
2,0.6040,['Seweryn Goszczyński'],0.749,104300,0.2200,0,6L63VW0PibdM1HDSBoqnoM,0.000000,5,0.1190,...,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.9290,107.177,0.8800,1928,0,Chapter 1.18 - Zamek kaniowski
3,0.9950,['Francisco Canaro'],0.781,180760,0.1300,0,6M94FkXd15sOAOQYRnWPN8,0.887000,1,0.1110,...,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.7200,1928,1,Bebamos Juntos - Instrumental (Remasterizado)
4,0.9900,"['Frédéric Chopin', 'Vladimir Horowitz']",0.210,687733,0.2040,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908000,11,0.0980,...,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169901,0.2640,"['Meek Mill', 'Roddy Ricch']",0.744,167845,0.7020,1,0j2CNrgtalXRGIvHMO2vzh,0.000000,7,0.1200,...,0,Letter To Nipsey (feat. Roddy Ricch),66,2020-01-27,0.2880,91.885,0.3380,2020,0,Letter To Nipsey (feat. Roddy Ricch)
169903,0.2100,"['LEGADO 7', 'Junior H']",0.795,218501,0.5850,0,52Cpyvd2dKb6XRn313nH87,0.000001,8,0.1120,...,1,Ojos De Maniaco,68,2020-02-28,0.0374,97.479,0.9340,2020,0,Ojos De Maniaco
169904,0.1730,"['DripReport', 'Tyga']",0.875,163800,0.4430,1,4KppkflX7I3vJQk7urOJaS,0.000032,1,0.0891,...,1,Skechers (feat. Tyga) - Remix,75,2020-05-15,0.1430,100.012,0.3060,2020,0,Skechers (feat. Tyga) - Remix
169905,0.0167,"['Leon Bridges', 'Terrace Martin']",0.719,167468,0.3850,0,1ehhGlTvjtHo2e4xJFB0SZ,0.031300,8,0.1110,...,1,Sweeter (feat. Terrace Martin),64,2020-06-08,0.0403,128.000,0.2700,2020,0,Sweeter (feat. Terrace Martin)


In [7]:
# Group the rows by cluster label, then show the per-column row counts per group.
cluster_ = df.groupby("kmeans")
cluster_.count()

Unnamed: 0_level_0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,song_name
kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459,108459
1,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481,24481


In [8]:
# Target: the cluster label. pop() also removes the column from df in place, so
# this cell cannot be re-run on the same kernel without re-creating "kmeans".
y = df.pop("kmeans")
# Features: every remaining column except the object-dtype text/identifier ones.
x = df.drop(columns=["name", "artists", "id", "release_date", "song_name"])

# Seed the shuffle (same seed as the KMeans fit) so the split, and therefore the
# reported train/test scores, are reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [9]:
# LightGBM classifier with default settings, trained to predict the cluster label.
# Note: x still contains the six columns the clustering was computed from.
model = LGBMClassifier()
model.fit(x_train, y_train)

# (score on the training split, score on the held-out split)
(model.score(x_train, y_train), model.score(x_test, y_test))

(1.0, 0.99846547314578)

In [11]:
# Within each group of `cluster_`, order the rows by popularity, highest first.
cluster_data = cluster_.apply(lambda x: x.sort_values(["popularity"], ascending=False))

# Move the outermost index level (the group key) back into an ordinary column.
# NOTE(review): `cluster_` was created in an earlier cell, before "kmeans" was
# popped from df; this step presumably relies on that column no longer being in
# the frame — confirm before reordering or re-running cells.
cluster_data.reset_index(level=0, inplace=True)

cluster_data

Unnamed: 0,kmeans,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,...,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,song_name
87949,0,0.7310,"['Powfu', 'beabadoobee']",0.726,173333,0.4310,0,7eJMfftS33KTjuF7lTsMCx,0.00000,8,...,-8.765,0,death bed (coffee for your head) (feat. beabad...,97,2020-02-08,0.1350,144.026,0.348,2020,death bed (coffee for your head) (feat. beabad...
87941,0,0.2330,"['THE SCOTTS', 'Travis Scott', 'Kid Cudi']",0.716,165978,0.5370,1,39Yp9wwQiSRIDOvrVg7mbk,0.00000,0,...,-7.648,0,THE SCOTTS,96,2020-04-24,0.0514,129.979,0.280,2020,THE SCOTTS
87852,0,0.0686,"['Surf Mesa', 'Emilee']",0.674,176547,0.7740,0,62aP9fBQKYKxi7PDXwcUAS,0.00188,11,...,-7.567,0,ily (i love you baby) (feat. Emilee),95,2019-11-26,0.0892,112.050,0.330,2019,ily (i love you baby) (feat. Emilee)
87844,0,0.3050,"['BENEE', 'Gus Dapperton']",0.863,223480,0.6310,1,4nK5YrxbMGZstTLbvj6Gxw,0.00003,7,...,-4.689,1,Supalonely,95,2019-11-15,0.0534,128.977,0.817,2019,Supalonely
87951,0,0.0264,['6ix9ine'],0.611,132303,0.6880,1,4NhDYoQTYCdWHTvlbGVgwo,0.00000,1,...,-5.688,1,GOOBA,94,2020-05-08,0.3410,178.462,0.393,2020,GOOBA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97922,1,0.9960,['Francisco Canaro'],0.690,147133,0.2860,0,7GTLZMze5vPEMsnRO6VjPX,0.88700,3,...,-14.151,1,Caminito del Taller - Remasterizado,0,1925-07-12,0.1680,122.462,0.574,1925,Caminito del Taller - Remasterizado
97927,1,0.9960,['Ignacio Corsini'],0.566,112800,0.0272,0,7eYF7fkjQ2X7c4St7ECTIv,0.86900,7,...,-25.649,0,Reproches - Remasterizado,0,1925-04-25,0.1280,83.379,0.448,1925,Reproches - Remasterizado
97929,1,0.9960,['Francisco Canaro'],0.546,161893,0.4180,0,7loMx5eIfy4z4naKQDg3Th,0.94100,0,...,-13.399,1,Galleguita - Remasterizado,0,1925-07-12,0.0653,124.246,0.704,1925,Galleguita - Remasterizado
97931,1,0.9960,"['Francisco Canaro', 'Azucena Maizani']",0.554,166693,0.1570,0,7qd9OT1kay1XO1lbUukc7g,0.89700,4,...,-13.650,0,Amores de Carnaval - Remasterizado,0,1925-08-02,0.1500,78.346,0.405,1925,Amores de Carnaval - Remasterizado


In [12]:
# Forward slash: the original "songs\culster_data.csv" only targets the songs
# folder on Windows and contains the invalid string escape "\c".
# NOTE(review): "culster" looks like a typo for "cluster"; the file name is kept
# unchanged in case something else reads this path.
cluster_data.to_csv("songs/culster_data.csv", index=False)

In [14]:
def cluster_analysis(data_path="songs/data.csv.zip", n_clusters=2, random_state=42):
    """Cluster songs with Birch, then train a classifier to predict the cluster.

    The CSV is loaded and de-duplicated by track name, six audio features are
    min-max scaled and clustered with ``cluster.Birch``, and an
    ``LGBMClassifier`` is fitted on the non-text columns to predict the
    resulting cluster label.

    The function is self-contained: it does not read any notebook-level
    variable, so it gives the same answer on a fresh kernel.

    Parameters
    ----------
    data_path : str
        Path to the zipped CSV. Written with a forward slash so it resolves on
        every platform.
    n_clusters : int
        Number of clusters requested from Birch.
    random_state : int or None
        Seed for the train/test shuffle; pass ``None`` for an unseeded split.

    Returns
    -------
    tuple of float
        ``(train_score, test_score)`` as returned by ``model.score``.
    """
    songs = pd.read_csv(data_path, compression="zip").drop_duplicates(subset=["name"])

    col_features = [
        "liveness",
        "danceability",
        "energy",
        "valence",
        "loudness",
        "instrumentalness",
    ]
    scaled = MinMaxScaler().fit_transform(songs[col_features])
    # The labels are used directly as the target instead of being written into
    # the frame and popped back out again.
    labels = cluster.Birch(n_clusters=n_clusters).fit(scaled).labels_

    # Classifier inputs: everything except the text/identifier columns.
    x = songs.drop(columns=["name", "artists", "id", "release_date"])

    x_train, x_test, y_train, y_test = train_test_split(
        x, labels, test_size=0.25, random_state=random_state
    )

    model = LGBMClassifier().fit(x_train, y_train)

    return model.score(x_train, y_train), model.score(x_test, y_test)


cluster_analysis()

(1.0, 0.9976831653377464)