# Basic Setting

In [4]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import random
import warnings
warnings.filterwarnings(action='ignore')

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
track = pd.read_csv('global_track_spotify.csv')
track = pd.DataFrame(track)
track.head()

Unnamed: 0.1,Unnamed: 0,id,artist_name,track_name,album_name,artist_genre,artist_popularity,track_popularity,artist_followers,danceability,...,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,Harry's House,['pop'],94,93,21396163,167303,...,6,-5.338,0.0557,0.342,0.00101,0.311,0.662,173.93,167303,4
1,1,75FEaRjZTKLhTrFGsfMUXR,Kate Bush,Running Up That Hill (A Deal With God),Hounds Of Love,"['art pop', 'art rock', 'baroque pop', 'new wa...",80,94,1106089,298933,...,10,-13.123,0.055,0.72,0.00314,0.0604,0.197,108.375,298933,4
2,2,6Sq7ltF9Qa7SNFBsV5Cogx,Bad Bunny,Me Porto Bonito,Un Verano Sin Ti,"['latin', 'reggaeton', 'trap latino']",100,98,49880269,178567,...,1,-5.105,0.0817,0.0901,2.7e-05,0.0933,0.425,92.005,178567,4
3,3,6xGruZOHLs39ZbVccQTuPZ,Joji,Glimpse of Us,Glimpse of Us,"['alternative r&b', 'viral pop']",81,89,5992900,233456,...,8,-9.258,0.0531,0.891,5e-06,0.141,0.268,169.914,233456,3
4,4,3k3NWokhRRkEPhCzPmV8TW,Bad Bunny,Ojitos Lindos,Un Verano Sin Ti,"['latin', 'reggaeton', 'trap latino']",100,98,49880269,258299,...,3,-5.745,0.0413,0.08,1e-06,0.528,0.268,79.928,258299,4


# Feature Engineering 

In [7]:
import re
 
def cleanText(readData):
    text = re.sub('[-=+#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]','', readData) # 쉼표(,) 뺌
 
    return text

In [8]:
genre = []
for i in track['artist_genre']:
    if i == '[]':
        i = 'NA'
        genre.append(i.strip()) #"'[]'"
    else:
        i = cleanText(i)
        genre.append(i.strip())
track['genre'] = genre

In [9]:
track['track_popularity'] = track['track_popularity'] / 100 

In [10]:
track.head()

Unnamed: 0.1,Unnamed: 0,id,artist_name,track_name,album_name,artist_genre,artist_popularity,track_popularity,artist_followers,danceability,...,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre
0,0,4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,Harry's House,['pop'],94,0.93,21396163,167303,...,-5.338,0.0557,0.342,0.00101,0.311,0.662,173.93,167303,4,pop
1,1,75FEaRjZTKLhTrFGsfMUXR,Kate Bush,Running Up That Hill (A Deal With God),Hounds Of Love,"['art pop', 'art rock', 'baroque pop', 'new wa...",80,0.94,1106089,298933,...,-13.123,0.055,0.72,0.00314,0.0604,0.197,108.375,298933,4,"art pop, art rock, baroque pop, new wave pop, ..."
2,2,6Sq7ltF9Qa7SNFBsV5Cogx,Bad Bunny,Me Porto Bonito,Un Verano Sin Ti,"['latin', 'reggaeton', 'trap latino']",100,0.98,49880269,178567,...,-5.105,0.0817,0.0901,2.7e-05,0.0933,0.425,92.005,178567,4,"latin, reggaeton, trap latino"
3,3,6xGruZOHLs39ZbVccQTuPZ,Joji,Glimpse of Us,Glimpse of Us,"['alternative r&b', 'viral pop']",81,0.89,5992900,233456,...,-9.258,0.0531,0.891,5e-06,0.141,0.268,169.914,233456,3,"alternative rb, viral pop"
4,4,3k3NWokhRRkEPhCzPmV8TW,Bad Bunny,Ojitos Lindos,Un Verano Sin Ti,"['latin', 'reggaeton', 'trap latino']",100,0.98,49880269,258299,...,-5.745,0.0413,0.08,1e-06,0.528,0.268,79.928,258299,4,"latin, reggaeton, trap latino"


# Music Recommeder System

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class ContentTFIDF:
    
    def __init__(self, data):
        self.data = data

    def calculateTFIDF(self):
        tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,2) ,stop_words=stopwords.words('english'))
        tfidf_content = tfidf.fit_transform(self.data['artist_name'] + ' ' +
                                       self.data['track_name'] + ' ' +
                                       self.data['album_name'] + ' ' +
                                       self.data['genre'])
        return tfidf_content

In [12]:
track.shape

(1154, 22)

In [13]:
ct = ContentTFIDF(track)
ct_tfidf = ct.calculateTFIDF()
ct_tfidf

<1154x7880 sparse matrix of type '<class 'numpy.float64'>'
	with 18739 stored elements in Compressed Sparse Row format>

In [14]:
ct_tfidf.shape

(1154, 7880)

In [21]:
class ContentBasedRecommenderSystem:
    
    def __init__(self, data, tfidf_matrix):
        self.data = data
        self.tfidf_matrix = tfidf_matrix
        self.music = ''
        self.mood = ''
        self.speed = ''
        self.emotion = ''

        
    def user_info(self):
        print("--------------------------------------------------------------------\n\
노래를 추천해드리기 전에 잠시 당신에 대해서 알아보겠습니다 \n\
--------------------------------------------------------------------")
        songs = list(self.data['track_name'].values)
        song = random.sample(songs, 5)

        total_dictionary = {}
        qs=[]
        qs.append("무슨 곡이 가장 좋아요?   1) {}  2) {}  3) {}  4) {}  5) {}".format(song[0],song[1],song[2],song[3],song[4]))
        qs.append("어떤 분위기의 곡을 좋아하세요?   1) 밝은  2) 어두운")
        qs.append("어느정도의 속도를 가직 곡을 좋아하시나요?   1) 빠름  2) 느림")
        qs.append("고객님께서는 현재 감정이 어떤 상태이신가요?   1) 신남  2) 평범  3) 슬픔")
        qs.append("끝")

        for q in qs:
            question = q
            if question == "끝":
                break
            else:
                total_dictionary[question] = ""

        for i in total_dictionary:
            print(i)
            answer = input()
            total_dictionary[i] = answer 

        a = list(total_dictionary.items())
        self.music = a[0][1]
        self.mood = int(a[1][1])
        self.speed = int(a[2][1])
        self.emotion = int(a[3][1])
        
        return [self.music, self.mood, self.speed, self.emotion]

    def get_genre_score(self):
        cosine_sim_score = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix)
        target_genre_index = self.data[self.data['track_name'] == self.music].index.values
        genre_score = cosine_sim_score[target_genre_index, :].reshape(-1, 1)
        return genre_score

    def get_mood_score(self):
        temp = pd.DataFrame(self.data['valence'])
        if self.mood == 1:
            temp['mood_score'] = temp['valence']
        else:
            temp['mood_score'] = temp['valence'].apply(lambda x: 1-x)
        return temp['mood_score']
    
    def get_speed_score(self):
        temp = pd.DataFrame(self.data['tempo'])
        temp['tempo_scaled'] = MinMaxScaler().fit_transform(pd.DataFrame(temp['tempo']))
        if self.speed == 1:
            temp['speed_score'] = temp['tempo_scaled']
        else:
            temp['speed_score'] = temp['tempo_scaled'].apply(lambda x: 1-x)
        return temp['speed_score']

    def get_emotion_score(self):
        temp = self.data[['danceability', 'energy', 'acousticness']]
        temp['danceability_scaled'] = MinMaxScaler().fit_transform((pd.DataFrame(temp['danceability'])))
        temp['acousticness_reverse'] = temp['acousticness'].apply(lambda x: 1-x)
        if self.emotion == 1:
            temp['emotion_score'] = temp.apply(lambda x: 1/3 * (x['danceability_scaled'] + x['energy'] + x['acousticness_reverse']), axis = 1)
        elif self.emotion == 2:
            temp['emotion_score'] = temp.apply(lambda x: 2/3 * (abs(x['danceability_scaled']-0.5) + abs(x['energy']-0.5) + abs(x['acousticness_reverse']-0.5)), axis = 1)
        else:
            temp['emotion_score'] = temp.apply(lambda x: 1/3 * ((1-x['danceability_scaled']) + (1-x['energy']) + (1-x['acousticness_reverse'])), axis = 1)
        return temp['emotion_score']

    def get_total_score(self, top_n = 20):
        result_df = self.data[['artist_name', 'track_name', 'album_name']]
        result_df['mood_score'] = pd.DataFrame(self.get_mood_score())
        result_df['speed_score'] = pd.DataFrame(self.get_speed_score())
        result_df['emotion_score'] = pd.DataFrame(self.get_emotion_score())
        result_df['genre_score'] = pd.DataFrame(self.get_genre_score())
        result_df['total_score'] = result_df.apply(lambda x: 1/6*(x['mood_score'] + x['speed_score'] + x['emotion_score']) + 0.5*x['genre_score'], axis = 1)

        return result_df.sort_values(by = 'total_score', ascending=False)[:top_n]

In [22]:
cbr = ContentBasedRecommenderSystem(track, ct_tfidf)
ui1 = cbr.user_info()
ex1 = cbr.get_total_score()
ex1

--------------------------------------------------------------------
노래를 추천해드리기 전에 잠시 당신에 대해서 알아보겠습니다 
--------------------------------------------------------------------
무슨 곡이 가장 좋아요?   1) Look To The Sky  2) So Fresh  3) START OVER  4) Bambi  5) 무기여 잘 있거라
So Fresh
어떤 분위기의 곡을 좋아하세요?   1) 밝은  2) 어두운
1
어느정도의 속도를 가직 곡을 좋아하시나요?   1) 빠름  2) 느림
1
고객님께서는 현재 감정이 어떤 상태이신가요?   1) 신남  2) 평범  3) 슬픔
1


Unnamed: 0,artist_name,track_name,album_name,mood_score,speed_score,emotion_score,genre_score,total_score
792,MC MONG,So Fresh,So Fresh,0.807,0.548033,0.726323,1.0,0.846893
826,MC MONG,A Letter to you Part.2,The Way I Am,0.705,0.480299,0.567021,0.368955,0.476531
189,AOA,Like a Cat,Like a Cat,0.928,0.962276,0.723262,0.0,0.43559
375,황혜영,일과 이분의 일,투유 프로젝트 - 슈가맨 Pt.33,0.923,0.970206,0.701478,0.0,0.432447
857,BoA,My Sweetie,No.1 - The 2nd Album,0.923,0.932434,0.733024,0.0,0.43141
279,Epik High,Fan,Remapping the Human Soul,0.78,0.797894,0.708519,0.081803,0.42197
70,Mighty Mouth,TOK TOK (ORIGINAL VER.) (feat.Kim Bum Soo),Mighty Fresh,0.875,0.624904,0.713554,0.106086,0.421953
819,The Jadu,김밥,The Jadu 3,0.886,0.894518,0.728663,0.0,0.418197
80,Secret,Love Is Move,MOVING IN SECRET,0.963,0.77343,0.743597,0.0,0.413338
204,Girl's Day,Ring My Bell,Girl's Day Love Second Album,0.811,0.952116,0.701644,0.0,0.410793


In [23]:
ui2 = cbr.user_info()
ex2 = cbr.get_total_score()
ex2

--------------------------------------------------------------------
노래를 추천해드리기 전에 잠시 당신에 대해서 알아보겠습니다 
--------------------------------------------------------------------
무슨 곡이 가장 좋아요?   1) Lonely  2) Yeosu Night Sea  3) 미인아 Bonamana  4) Um Oh Ah Yeh  5) Sherlock (Clue + Note)
Yeosu Night Sea
어떤 분위기의 곡을 좋아하세요?   1) 밝은  2) 어두운
2
어느정도의 속도를 가직 곡을 좋아하시나요?   1) 빠름  2) 느림
2
고객님께서는 현재 감정이 어떤 상태이신가요?   1) 신남  2) 평범  3) 슬픔
3


Unnamed: 0,artist_name,track_name,album_name,mood_score,speed_score,emotion_score,genre_score,total_score
112,Busker Busker,Yeosu Night Sea,Busker Busker 1st,0.773,0.042641,0.474897,1.0,0.71509
107,Busker Busker,Cherry Blossom Ending,Busker Busker 1st,0.337,0.42348,0.55816,0.747612,0.593579
111,Busker Busker,First Love,Busker Busker 1st,0.314,0.423648,0.368484,0.793068,0.580889
133,Busker Busker,Bloom,Busker Busker 1st,0.152,0.423471,0.361034,0.810571,0.56137
337,2NE1,Fire,1st Mini Album,1.0,1.0,0.666667,0.031669,0.460279
338,2NE1,I Don't Care,1st Mini Album,1.0,1.0,0.666667,0.031631,0.46026
336,2NE1,Lollipop (Bonus Track) [BIGBANG & 2NE1],1st Mini Album,1.0,1.0,0.666667,0.022656,0.455773
594,JAURIM,영원히 영원히,자우림,0.855,0.634805,0.628619,0.106949,0.406545
1106,Ann kim,Home,Home,0.892,0.661297,0.826095,0.0,0.396565
1093,92914,Starlight,Starlight,0.922,0.649714,0.780598,0.008262,0.396183


In [24]:
ui3 = cbr.user_info()
ex3 = cbr.get_total_score()
ex3

--------------------------------------------------------------------
노래를 추천해드리기 전에 잠시 당신에 대해서 알아보겠습니다 
--------------------------------------------------------------------
무슨 곡이 가장 좋아요?   1) Fighting! 응원해  2) Breath  3) You’re the Best  4) DORADORA  5) Yo No Soy Celoso
Breath
어떤 분위기의 곡을 좋아하세요?   1) 밝은  2) 어두운
1
어느정도의 속도를 가직 곡을 좋아하시나요?   1) 빠름  2) 느림
1
고객님께서는 현재 감정이 어떤 상태이신가요?   1) 신남  2) 평범  3) 슬픔
2


Unnamed: 0,artist_name,track_name,album_name,mood_score,speed_score,emotion_score,genre_score,total_score
490,Beast,Breath,So Beast,0.479,0.615234,0.662823,1.0,0.792843
493,Beast,Beautiful,So Beast,0.916,0.658336,0.702992,0.597603,0.678356
497,Beast,Mystery,Beast Is The B2ST,0.948,0.624923,0.691019,0.399088,0.576868
495,Beast,Bad Girl,Beast Is The B2ST,0.902,0.601051,0.642679,0.389286,0.552265
488,Beast,Beautiful Night,Midnight Sun,0.685,0.591308,0.710551,0.261909,0.462098
479,2PM,우리집 My House,No.5,0.781,0.845995,0.636607,0.150518,0.452526
491,Beast,Shadow,"Hard to love, How to love",0.752,0.552306,0.60969,0.264885,0.451442
189,AOA,Like a Cat,Like a Cat,0.928,0.962276,0.661476,0.044638,0.447611
760,EXO,3.6.5,The 1st Album 'XOXO' (Repackage),0.93,0.629816,0.655035,0.145728,0.442006
290,BIGBANG,Dirty Cash,BIGBANG Vol. 1,0.947,0.624409,0.706758,0.118684,0.439036
