In [1]:
import pandas as pd
import numpy as np
import json
import re
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('./../data/data.csv')
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [3]:
data_by_artist = pd.read_csv('./../data/data_by_artist.csv')
data_by_artist.head()

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""Cats"" 1981 Original London Cast",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""Cats"" 1983 Broadway Cast",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,"""Fiddler On The Roof” Motion Picture Chorus",0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,"""Fiddler On The Roof” Motion Picture Orchestra",0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [4]:
data_by_genres = pd.read_csv('./../data/data_by_genres.csv')
data_by_genres.head()

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7


In [5]:
data_by_year = pd.read_csv('./../data/data_by_year.csv')
data_by_year.head()

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10
2,1,1923,0.957247,0.577341,177942.362162,0.262406,0.371733,0.227462,-14.129211,0.093949,114.01073,0.625492,5.389189,0
3,1,1924,0.9402,0.549894,191046.707627,0.344347,0.581701,0.235219,-14.231343,0.092089,120.689572,0.663725,0.661017,10
4,1,1925,0.962607,0.573863,184986.92446,0.278594,0.418297,0.237668,-14.146414,0.111918,115.521921,0.621929,2.604317,5


In [6]:
data_w_genres = pd.read_csv('./../data/data_w_genres.csv')
data_w_genres.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5,1,7


In [7]:
data.shape, data_by_artist.shape, data_by_genres.shape, data_by_year.shape, data_w_genres.shape

((170653, 19), (28680, 15), (2973, 14), (100, 14), (28680, 16))

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [9]:
data_w_genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28680 entries, 0 to 28679
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres            28680 non-null  object 
 1   artists           28680 non-null  object 
 2   acousticness      28680 non-null  float64
 3   danceability      28680 non-null  float64
 4   duration_ms       28680 non-null  float64
 5   energy            28680 non-null  float64
 6   instrumentalness  28680 non-null  float64
 7   liveness          28680 non-null  float64
 8   loudness          28680 non-null  float64
 9   speechiness       28680 non-null  float64
 10  tempo             28680 non-null  float64
 11  valence           28680 non-null  float64
 12  popularity        28680 non-null  float64
 13  key               28680 non-null  int64  
 14  mode              28680 non-null  int64  
 15  count             28680 non-null  int64  
dtypes: float64(11), int64(3), object(2)
memo

In [10]:
data.isna().sum()

valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

In [11]:
data_w_genres.isna().sum()

genres              0
artists             0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
popularity          0
key                 0
mode                0
count               0
dtype: int64

Буду работать с этими двумя датафреймами. Данные вроде чистинькие - нанов нет, хотя есть несколько проблемок, которые мы решим позже.

Основной датасет data содержит довольно много полезный инфы, но все-таки в нем нет самого главного - жанров песен. Но что-то похожее (почти) содержится в data_w_genres, где есть жанры по артистам, было бы неплохо для артистов достать жанры и смержить это с основным датасетом.

In [12]:
spotify_df = data.copy()

In [13]:
data_w_genres['genres'].values[0]

"['show tunes']"

Первая проблема - список жанров представлен строкой, а не листом. Как обнаружится далее, с артистами в основном датасете та же проблема. Исправляем.

In [14]:
data_w_genres['str_genres'] = data_w_genres['genres'].apply(lambda x: [re.sub(' ','_',i) for i in re.findall(r"'([^']*)'", x)])

In [15]:
data_w_genres['str_genres'].values[0]

['show_tunes']

In [16]:
spotify_df['new_artists'] = spotify_df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))

In [17]:
spotify_df['artists'][0]

"['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']"

In [18]:
spotify_df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,new_artists
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954,"[Sergei Rachmaninoff, James Levine, Berliner P..."
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936,[Dennis Day]
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339,[KHP Kridhamardawa Karaton Ngayogyakarta Hadin...
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109,[Frank Parker]
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665,[Phil Regan]


In [19]:
spotify_df['new_artists'][0]

['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']

Тут отработало не для всех, т.к. у некоторых артистов есть апостроф в имени и код выше чувствителен к этому.

In [20]:
spotify_df['new_artists_2'] = spotify_df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))

In [21]:
spotify_df['new_artists_3'] = np.where(spotify_df['new_artists'].apply(lambda x: not x), spotify_df['new_artists_2'], spotify_df['new_artists'] )

In [22]:
spotify_df['new_artists_3']

0         [Sergei Rachmaninoff, James Levine, Berliner P...
1                                              [Dennis Day]
2         [KHP Kridhamardawa Karaton Ngayogyakarta Hadin...
3                                            [Frank Parker]
4                                              [Phil Regan]
                                ...                        
170648    [Anuel AA, Daddy Yankee, KAROL G, Ozuna, J Bal...
170649                                           [Ashnikko]
170650                                            [MAMAMOO]
170651                                             [Eminem]
170652                                    [KEVVO, J Balvin]
Name: new_artists_3, Length: 170653, dtype: object

Создадим в основном датасете еще поле артисты + название, далее объясню зачем.

In [23]:
spotify_df['artists_and_song'] = spotify_df.apply(lambda row: row['new_artists_3'][0] + ' ' + row['name'], axis = 1)

In [24]:
spotify_df.sort_values(['artists_and_song', 'release_date'], ascending = True, inplace = True)

In [25]:
spotify_df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,mode,name,popularity,release_date,speechiness,tempo,new_artists,new_artists_2,new_artists_3,artists_and_song
150996,0.273,1996,0.0113,"[""Rappin' 4-Tay"", 'MC Breed', 'Too $hort']",0.897,337973,0.414,1,78859Af0fmA9VTlgnOHTAP,0.00011,...,0,Never Talk Down,35,1996,0.246,96.039,"[ 4-Tay"", , , ]",[Rappin' 4-Tay],"[ 4-Tay"", , , ]","4-Tay"", Never Talk Down"
103581,0.429,1994,0.0249,"[""World Class Wreckin' Cru"", ""Michel 'Le""]",0.715,351040,0.49,0,3hoiinUc5VA9xUEJID7R8V,0.00017,...,0,Turn Off The Lights - Rap,36,1994-04-06,0.0479,129.309,"[ Cru"", ""Michel ]","[World Class Wreckin' Cru, Michel 'Le]","[ Cru"", ""Michel ]","Cru"", ""Michel Turn Off The Lights - Rap"
15465,0.697,1999,0.0516,"[""Ol' Dirty Bastard"", 'Kelis', 'Rich Travali']",0.934,239547,0.459,1,6YYd5MLpu45J0uLrMdivF7,0.0,...,1,Got Your Money (feat. Kelis),66,1999,0.189,103.04,"[ Dirty Bastard"", , , ]",[Ol' Dirty Bastard],"[ Dirty Bastard"", , , ]","Dirty Bastard"", Got Your Money (feat. Kelis)"
16469,0.792,2004,0.0248,"[""Lil' Flip"", 'Lea']",0.814,225173,0.387,1,4s0o8TJHfX9LLHa0umnOzT,0.0,...,1,Sunshine (feat. Lea),62,2004-03-30,0.0945,93.961,"[ Flip"", ]",[Lil' Flip],"[ Flip"", ]","Flip"", Sunshine (feat. Lea)"
105644,0.819,2004,0.0218,"[""Lil' Flip"", 'Lea']",0.845,225187,0.346,0,3FaUH7ZMjW1hv9Jx6MIAIf,0.0,...,1,Sunshine (feat. Lea),47,2004-03-30,0.106,93.989,"[ Flip"", ]",[Lil' Flip],"[ Flip"", ]","Flip"", Sunshine (feat. Lea)"


Есть еще одна проблемка - повторяющиеся песни: те же исполнители, те же названия, может год другой, но все же, это одинаковые песни. Удаляем дубликаты. Чтобы увидеть это, глянем на последний столбец, который я создал ранее (как и обещал).

In [26]:
spotify_df[spotify_df['name']=='Danny Boy']

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,mode,name,popularity,release_date,speechiness,tempo,new_artists,new_artists_2,new_artists_3,artists_and_song
96893,0.141,1961,0.898,['Andy Williams'],0.247,174267,0.173,0,1FDs8eUhsrCZjAQUAbLbOj,0.00197,...,1,Danny Boy,10,1961-12-10,0.0347,80.136,[Andy Williams],[],[Andy Williams],Andy Williams Danny Boy
25421,0.207,1954,0.899,['Ben Webster'],0.345,216640,0.0915,0,0CS1pTW0N61b4yqHYWmsBQ,0.139,...,1,Danny Boy,20,1954-01-01,0.0558,84.931,[Ben Webster],[],[Ben Webster],Ben Webster Danny Boy
8143,0.0999,1962,0.991,"['Bill Evans', 'Shelly Manne']",0.48,220493,0.00705,0,3GimzseMaUjoAWjo6Gs3QH,0.888,...,0,Danny Boy,41,1962-11-01,0.0447,97.133,"[Bill Evans, Shelly Manne]",[],"[Bill Evans, Shelly Manne]",Bill Evans Danny Boy
80949,0.11,1963,0.993,['Bill Evans'],0.373,639027,0.0337,0,598Iu8Oplztg6vfTY6TeMj,0.933,...,1,Danny Boy,19,1963,0.0739,79.561,[Bill Evans],[],[Bill Evans],Bill Evans Danny Boy
121913,0.0935,2005,0.976,['Celtic Woman'],0.25,204467,0.155,0,75sSAymXP6tGOeRvImwzOf,1.2e-05,...,1,Danny Boy,47,2005-01-01,0.0389,100.842,[Celtic Woman],[],[Celtic Woman],Celtic Woman Danny Boy
128631,0.795,1959,0.895,['Conway Twitty'],0.563,167160,0.451,0,2O7KD1tW6eyLLSE7ce0ohD,0.000279,...,1,Danny Boy,11,1959-01-01,0.0697,86.96,[Conway Twitty],[],[Conway Twitty],Conway Twitty Danny Boy
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,...,1,Danny Boy,3,1921,0.0354,100.109,[Frank Parker],[],[Frank Parker],Frank Parker Danny Boy
96340,0.422,1958,0.797,['Jackie Wilson'],0.543,211960,0.214,0,0wPrGTS1SJkpMj4WTi6sbG,0.0,...,1,Danny Boy,15,1958,0.031,98.59,[Jackie Wilson],[],[Jackie Wilson],Jackie Wilson Danny Boy
97721,0.325,1965,0.969,['Johnny Cash'],0.534,308533,0.0955,0,6257QfmLdDbm8MSwu1tpMz,2e-06,...,1,Danny Boy,25,1965-02-15,0.0976,89.068,[Johnny Cash],[],[Johnny Cash],Johnny Cash Danny Boy
121411,0.241,2002,0.983,['Johnny Cash'],0.334,198507,0.0595,0,5o4SqGekEfvdkNuOVx5d3S,8.7e-05,...,1,Danny Boy,48,2002-01-01,0.0416,177.07,[Johnny Cash],[],[Johnny Cash],Johnny Cash Danny Boy


In [27]:
spotify_df.drop_duplicates('artists_and_song', inplace = True)

In [28]:
spotify_df.shape

(156607, 23)

Теперь необходимо получить список артистов, чтобы смержить его с жанрами, но в основном датафрейме лежат списки артистов.

In [29]:
artists_exploded = spotify_df[['new_artists_3','id']].explode('new_artists_3')

In [30]:
artists_exploded

Unnamed: 0,new_artists_3,id
150996,"4-Tay"",",78859Af0fmA9VTlgnOHTAP
150996,",",78859Af0fmA9VTlgnOHTAP
103581,"Cru"", ""Michel",3hoiinUc5VA9xUEJID7R8V
15465,"Dirty Bastard"",",6YYd5MLpu45J0uLrMdivF7
15465,",",6YYd5MLpu45J0uLrMdivF7
...,...,...
144483,黃國隆,5xFXTvnEe03SyvFpo6pEaE
144483,王秋玉,5xFXTvnEe03SyvFpo6pEaE
159177,黃國隆,4prhqrLXYMjHJ6vpRAlasx
150072,黑豹,3KIuCzckjdeeVuswPo20mC


Мержим.

In [31]:
artists_exploded_enriched = artists_exploded.merge(data_w_genres, how = 'left', left_on = 'new_artists_3',right_on = 'artists')

In [32]:
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.str_genres.isnull()]

In [33]:
artists_genres_consolidated = artists_exploded_enriched_nonnull.groupby('id')['str_genres'].apply(list).reset_index()

In [34]:
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['str_genres'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))

In [35]:
artists_genres_consolidated.head()

Unnamed: 0,id,str_genres,consolidates_genre_lists
0,000G1xMMuwxNHmwVsBdtj1,"[[candy_pop, dance_rock, new_wave, new_wave_po...","[candy_pop, rock, new_wave_pop, power_pop, new..."
1,000GyYHG4uWmlXieKLij8u,"[[alternative_hip_hop, conscious_hip_hop, minn...","[alternative_hip_hop, pop_rap, minnesota_hip_h..."
2,000Npgk5e2SgwGaIsN3ztv,"[[classic_bollywood, classic_pakistani_pop, fi...","[filmi, classic_bollywood, sufi, classic_pakis..."
3,000ZxLGm7jDlWCHtcXSeBe,"[[boogie-woogie, piano_blues, ragtime, stride]]","[boogie-woogie, stride, piano_blues, ragtime]"
4,000jBcNljWTnyjB4YO7ojf,[[]],[]


In [36]:
spotify_df = spotify_df.merge(artists_genres_consolidated[['id','consolidates_genre_lists']], on = 'id', how = 'left')

In [37]:
spotify_df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,name,popularity,release_date,speechiness,tempo,new_artists,new_artists_2,new_artists_3,artists_and_song,consolidates_genre_lists
0,0.273,1996,0.0113,"[""Rappin' 4-Tay"", 'MC Breed', 'Too $hort']",0.897,337973,0.414,1,78859Af0fmA9VTlgnOHTAP,0.00011,...,Never Talk Down,35,1996,0.246,96.039,"[ 4-Tay"", , , ]",[Rappin' 4-Tay],"[ 4-Tay"", , , ]","4-Tay"", Never Talk Down",
1,0.429,1994,0.0249,"[""World Class Wreckin' Cru"", ""Michel 'Le""]",0.715,351040,0.49,0,3hoiinUc5VA9xUEJID7R8V,0.00017,...,Turn Off The Lights - Rap,36,1994-04-06,0.0479,129.309,"[ Cru"", ""Michel ]","[World Class Wreckin' Cru, Michel 'Le]","[ Cru"", ""Michel ]","Cru"", ""Michel Turn Off The Lights - Rap",
2,0.697,1999,0.0516,"[""Ol' Dirty Bastard"", 'Kelis', 'Rich Travali']",0.934,239547,0.459,1,6YYd5MLpu45J0uLrMdivF7,0.0,...,Got Your Money (feat. Kelis),66,1999,0.189,103.04,"[ Dirty Bastard"", , , ]",[Ol' Dirty Bastard],"[ Dirty Bastard"", , , ]","Dirty Bastard"", Got Your Money (feat. Kelis)",
3,0.792,2004,0.0248,"[""Lil' Flip"", 'Lea']",0.814,225173,0.387,1,4s0o8TJHfX9LLHa0umnOzT,0.0,...,Sunshine (feat. Lea),62,2004-03-30,0.0945,93.961,"[ Flip"", ]",[Lil' Flip],"[ Flip"", ]","Flip"", Sunshine (feat. Lea)",
4,0.722,2000,0.0404,"[""Lil' Kim"", 'Sisqo']",0.584,233053,0.682,1,5jv3QxL0MFswzM1UJ2qQJM,0.0,...,How Many Licks? (feat. Sisqo),53,2000-06-16,0.333,105.17,"[ Kim"", ]",[Lil' Kim],"[ Kim"", ]","Kim"", How Many Licks? (feat. Sisqo)",


Итак, теперь у нас есть датафрейм, где у исполнителей песенок есть жанры (да, это не жанры песен и исполнители могут экспериментировать, но это уже что-то)

Теперь займемся подготовкой данных: отшкалировать/закодировать/убрать лишнее, все как всегда.

In [38]:
spotify = spotify_df.drop(['new_artists', 'new_artists_2', 'artists_and_song', 'artists'], axis=1)

In [39]:
spotify.head()

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,new_artists_3,consolidates_genre_lists
0,0.273,1996,0.0113,0.897,337973,0.414,1,78859Af0fmA9VTlgnOHTAP,0.00011,4,0.101,-8.45,0,Never Talk Down,35,1996,0.246,96.039,"[ 4-Tay"", , , ]",
1,0.429,1994,0.0249,0.715,351040,0.49,0,3hoiinUc5VA9xUEJID7R8V,0.00017,9,0.139,-9.504,0,Turn Off The Lights - Rap,36,1994-04-06,0.0479,129.309,"[ Cru"", ""Michel ]",
2,0.697,1999,0.0516,0.934,239547,0.459,1,6YYd5MLpu45J0uLrMdivF7,0.0,1,0.222,-7.654,1,Got Your Money (feat. Kelis),66,1999,0.189,103.04,"[ Dirty Bastard"", , , ]",
3,0.792,2004,0.0248,0.814,225173,0.387,1,4s0o8TJHfX9LLHa0umnOzT,0.0,0,0.131,-9.867,1,Sunshine (feat. Lea),62,2004-03-30,0.0945,93.961,"[ Flip"", ]",
4,0.722,2000,0.0404,0.584,233053,0.682,1,5jv3QxL0MFswzM1UJ2qQJM,0.0,10,0.352,-9.597,0,How Many Licks? (feat. Sisqo),53,2000-06-16,0.333,105.17,"[ Kim"", ]",


In [40]:
spotify['consolidates_genre_lists'] = spotify['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

In [41]:
def ohe_prep(df, column, new_name):
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)
    return tf_df

In [42]:
tfidf = TfidfVectorizer()
scaler = StandardScaler()

def create_feature_set(df, float_cols):
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
    genre_df.reset_index(drop = True, inplace=True)

    year_ohe = ohe_prep(df, 'year','year') * 0.5

    floats = df[float_cols].reset_index(drop = True)
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    final = pd.concat([genre_df, floats_scaled, year_ohe], axis = 1)

    final['id']=df['id'].values

    return final

In [43]:
float_cols = spotify.dtypes[(spotify.dtypes == 'float64') | (spotify.dtypes == 'int64')].index.values

In [44]:
float_cols

array(['valence', 'year', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'explicit', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'popularity', 'speechiness', 'tempo'],
      dtype=object)

In [45]:
complete_feature_set = create_feature_set(spotify, float_cols=float_cols).drop('id', axis=1).astype('float16')

In [46]:
complete_feature_set.head()

Unnamed: 0,genre|21st_century_classical,genre|432hz,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_beats,genre|abstract_hip_hop,genre|accordeon,genre|accordion,genre|acid_house,...,year|2011,year|2012,year|2013,year|2014,year|2015,year|2016,year|2017,year|2018,year|2019,year|2020
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
complete_feature_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156607 entries, 0 to 156606
Columns: 3052 entries, genre|21st_century_classical to year|2020
dtypes: float16(3052)
memory usage: 911.6 MB


In [48]:
spotify.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156607 entries, 0 to 156606
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   valence                   156607 non-null  float64
 1   year                      156607 non-null  int64  
 2   acousticness              156607 non-null  float64
 3   danceability              156607 non-null  float64
 4   duration_ms               156607 non-null  int64  
 5   energy                    156607 non-null  float64
 6   explicit                  156607 non-null  int64  
 7   id                        156607 non-null  object 
 8   instrumentalness          156607 non-null  float64
 9   key                       156607 non-null  int64  
 10  liveness                  156607 non-null  float64
 11  loudness                  156607 non-null  float64
 12  mode                      156607 non-null  int64  
 13  name                      156607 non-null  o

In [49]:
spotify.shape

(156607, 20)

In [50]:
complete_feature_set.shape

(156607, 3052)

### А далее пытаемся построить матрицу схожести и... у меня нет 180 гб оперативки.

In [75]:
new_data = data.drop(['artists', 'id', 'name', 'release_date'], axis=1)

In [77]:
from sklearn.metrics.pairwise import cosine_similarity

In [78]:
scaler = StandardScaler()

In [79]:
new_data_scaled = scaler.fit_transform(new_data)

In [88]:
new_data_scaled

array([[-1.7828247 , -2.15247016,  1.27618658, ..., -1.25680847,
        -0.37970638, -1.16930675],
       [ 1.65068832, -2.15247016,  0.61134711, ..., -1.21099271,
         1.94548067, -1.82117959],
       [-1.858821  , -2.15247016,  1.22034007, ..., -1.21099271,
        -0.3962973 , -0.21240379],
       ...,
       [ 0.41194856,  1.66730194, -1.06670771, ...,  2.04192615,
        -0.10749235, -0.81976118],
       [-1.26756976,  1.66730194, -1.30876246, ...,  1.7670316 ,
         1.28798856, -1.36140375],
       [ 0.43094764,  1.66730194, -0.98426761, ...,  1.95029463,
         0.05903135, -0.71220119]])

In [93]:
def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [95]:
def predict(song, df, metric, n):
    metrics = np.zeros(df.shape[0])
    for index, i in enumerate(df):
        metrics[index] = metric(song, i)
    return metrics.argsort()[:n]

In [106]:
data.iloc[150012]

valence                              0.596
year                                  1991
acousticness                      0.000125
artists                      ['Green Day']
danceability                         0.376
duration_ms                         179827
energy                               0.988
explicit                                 0
id                  6Oxjqbh368cWsGJv7cNfUP
instrumentalness                  0.000002
key                                      3
liveness                             0.317
loudness                            -3.667
mode                                     1
name                        The One I Want
popularity                              30
release_date                    1991-07-01
speechiness                         0.0899
tempo                               90.246
Name: 150012, dtype: object

In [107]:
data.iloc[predict(new_data_scaled[150012], new_data_scaled, cos_sim, 5)]

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
26461,0.332,1959,0.897,['The Dave Brubeck Quartet'],0.641,261840,0.175,0,2D8hWnGqfqjnNv4jB12uds,0.446,10,0.123,-20.195,0,Everybody's Jumpin',32,1959,0.0502,150.051
26651,0.472,1960,0.907,['Blossom Dearie'],0.58,256027,0.124,0,3NWbd7KcJgEpj5ixc9Yrad,0.0,9,0.113,-20.579,0,The Gentleman Is A Dope,30,1960-01-01,0.0448,148.996
8100,0.687,1962,0.841,"['Stan Getz', 'Charlie Byrd']",0.637,283627,0.19,0,6y2YxIUeM9uMgAINRnb11K,0.263,9,0.12,-22.466,0,Samba Triste,46,1962-01-01,0.0676,143.111
45913,0.592,1963,0.822,['Oscar Peterson Trio'],0.639,232840,0.202,0,4vsYktePz0JersI03zbxNn,0.237,10,0.0675,-18.24,0,Band Call,32,1963-01-01,0.0401,151.562
8789,0.293,1965,0.956,['Paul Desmond'],0.589,265760,0.0578,0,5mgod2DVJEZI145AS8vQJ2,0.138,5,0.101,-23.976,0,A Taste of Honey,41,1965-01-01,0.0407,127.592
