In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
df = pd.read_json('./data/dataset.json', encoding = 'utf-8', orient='records')

In [5]:
df.head()

Unnamed: 0,artist_name,track_name,album_name,artist_genre,artist_popularity,track_popularity,artist_followers,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,uri,release_date,album_image,id
0,Harry Styles,As It Was,Harry's House,[pop],94,94,21560011,0.52,0.731,6,...,0.00101,0.311,0.662,173.93,167303,4,spotify:track:4Dvkj6JhhA12EX05fT7y2e,2022-05-20,https://i.scdn.co/image/ab67616d0000b2732e8ed7...,4Dvkj6JhhA12EX05fT7y2e
1,Kate Bush,Running Up That Hill (A Deal With God),Hounds Of Love,"[art pop, art rock, baroque pop, new wave pop,...",80,95,1146744,0.629,0.547,10,...,0.00314,0.0604,0.197,108.375,298933,4,spotify:track:75FEaRjZTKLhTrFGsfMUXR,1985,https://i.scdn.co/image/ab67616d0000b27396ab64...,75FEaRjZTKLhTrFGsfMUXR
2,Bad Bunny,Me Porto Bonito,Un Verano Sin Ti,"[latin, reggaeton, trap latino]",100,99,50254603,0.911,0.712,1,...,2.7e-05,0.0933,0.425,92.005,178567,4,spotify:track:6Sq7ltF9Qa7SNFBsV5Cogx,2022-05-06,https://i.scdn.co/image/ab67616d0000b27349d694...,6Sq7ltF9Qa7SNFBsV5Cogx
3,Joji,Glimpse of Us,Glimpse of Us,"[alternative r&b, viral pop]",84,96,6167072,0.44,0.317,8,...,5e-06,0.141,0.268,169.914,233456,3,spotify:track:6xGruZOHLs39ZbVccQTuPZ,2022-06-10,https://i.scdn.co/image/ab67616d0000b273f798d4...,6xGruZOHLs39ZbVccQTuPZ
4,Bad Bunny,Ojitos Lindos,Un Verano Sin Ti,"[latin, reggaeton, trap latino]",100,98,50254603,0.647,0.686,3,...,1e-06,0.528,0.268,79.928,258299,4,spotify:track:3k3NWokhRRkEPhCzPmV8TW,2022-05-06,https://i.scdn.co/image/ab67616d0000b27349d694...,3k3NWokhRRkEPhCzPmV8TW


In [20]:
class ContentTFIDF:
    
    def __init__(self, data):
        self.data = data
     
 
    def cleanText(self, text_data_in_list):
        text_data = ','.join(text_data_in_list)
        text = re.sub('[-=+#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]','', text_data)
        print(text)
        return text

    def preprocess(self):
        genre = []
        for i in self.data['artist_genre']:
            if i == '[]':
                i = 'NA'
                genre.append(i.strip()) #"'[]'"
            else:
                i = self.cleanText(i)
                genre.append(i.strip())
        self.data['genre'] = genre
        self.data = self.data[self.data['genre'] != "NA"]
        self.data = self.data.reset_index()
        self.data['track_popularity'] = self.data['track_popularity'] / 100 


    def calculateTFIDF(self):
        tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,2) ,stop_words=stopwords.words('english'))
        tfidf_content = tfidf.fit_transform(self.data['genre'])
        tfidf_dict = tfidf.get_feature_names()

        return tfidf_dict, tfidf_content

    def saveTFIDF(self, path = "./data"):
        tfidf_dict, tfidf_content = self.calculateTFIDF()
        tfidf_array = tfidf_content.toarray()
        tfidf_matrix = pd.DataFrame(tfidf_array, columns = tfidf_dict)
        tfidf_file = 'tfidf_matrix.csv'
        if path == None:
            tfidf_path = tfidf_file
        else:
            tfidf_path = os.path.join(path, tfidf_file)
        tfidf_matrix.to_csv(tfidf_path, encoding = 'utf-8', index = False)

In [21]:
ctfidf = ContentTFIDF(df)

In [22]:
ctfidf.preprocess()

pop
art pop,art rock,baroque pop,new wave pop,permanent wave,piano rock,singersongwriter
latin,reggaeton,trap latino
alternative rb,viral pop
latin,reggaeton,trap latino
latin,reggaeton,trap latino
kpop,kpop boy group
latin,reggaeton,trap latino
latin,reggaeton,trap latino
latin,reggaeton,reggaeton colombiano
kpop,kpop boy group
dance pop,escape room,minnesota hip hop,pop,trap queen
gauze pop,indietronica,shiver pop
colombian pop,dance pop,latin,latin pop,pop
latin,reggaeton,trap latino
pop
deep underground hip hop,kentucky hip hop,rap
australian hip hop
dance pop,pop,postteen pop,uk pop
dfw rap,melodic rap,rap
latin,reggaeton,trap latino
glam rock,mellow gold,piano rock
latin,reggaeton,trap latino
alt z,pop
latin,latin viral pop,rap latina,reggaeton
mambo chileno,urbano chileno
canadian pop,pop
modern rock,rock
latin,reggaeton,trap latino
kpop,kpop boy group
gen z singersongwriter
dfw rap,melodic rap,rap
lgbtq hip hop,pop
dance pop,edm,electro house,house,pop,pop rap,progressive house

In [19]:
ctfidf.saveTFIDF()

