In [12]:
import os
import re
import logging
import argparse
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings(action='ignore')

In [13]:
class ContentBasedRecommenderSystem:
    """Content-based track recommender built on audio features and a genre TF-IDF matrix.

    Intended call order (each step relies on state left by the previous one):
    ``preprocess()`` -> ``get_feature_genre_intersection()`` -> ``get_total_score()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Track table. The methods read the columns ``artist_genre``,
        ``track_popularity``, ``track_name``, ``id``, ``artist_name``,
        ``album_name``, ``danceability``, ``energy``, ``valence``, ``tempo``
        and ``acousticness``.
    tfidf : array-like or pandas.DataFrame
        Genre TF-IDF matrix. It must have one row per row of ``data`` *after*
        ``preprocess()`` has filtered it, in the same order, because the
        similarities are written back to ``data`` positionally.
    music : str
        ``track_name`` of the seed track to recommend from.
    mood : int
        ``1`` rewards high ``valence``; any other value rewards low ``valence``.
    speed : int
        ``1`` rewards high ``tempo``; any other value rewards low ``tempo``.
    emotion : int
        ``1`` rewards high danceability/energy and low acousticness, ``2``
        rewards values far from the 0.5 midpoint, any other value rewards the
        opposite of ``1``.
    """

    # Characters stripped by cleanText. Raw string: the original non-raw literal
    # relied on invalid escape sequences such as "\?" and "\(".
    _PUNCTUATION = r'''[-=+#/\?:^$.@*"※~&%ㆍ!』\‘|\(\)\[\]\<\>`'…》]'''
    # Columns identifying a track in both candidate lists.
    _KEY_COLUMNS = ['id', 'artist_name', 'track_name']
    # Audio features used for the Euclidean-distance candidate list.
    _AUDIO_COLUMNS = ['danceability', 'energy', 'valence', 'tempo', 'acousticness']

    def __init__(self, data, tfidf, music, mood, speed, emotion):
        self.data = data
        self.tfidf = tfidf
        self.music = music
        self.mood = mood
        self.speed = speed
        self.emotion = emotion
        # Filled by get_feature_genre_intersection(); read by the *_score methods.
        self.result = pd.DataFrame()

    def cleanText(self, text_data):
        """Return ``str(text_data)`` with the punctuation in ``_PUNCTUATION`` removed."""
        return re.sub(self._PUNCTUATION, '', str(text_data))

    def preprocess(self):
        """Derive a cleaned ``genre`` column, drop genre-less tracks, rescale popularity.

        Works on a copy so the DataFrame passed to the constructor is left
        untouched. Rows whose ``artist_genre`` equals the string ``'[]'`` are
        removed. The previous row labels are kept in an ``index`` column
        (``reset_index()`` without ``drop``). ``track_popularity`` is divided
        by 100.
        """
        data = self.data.copy()
        data['genre'] = [
            'NA' if raw == '[]' else self.cleanText(raw).strip()
            for raw in data['artist_genre']
        ]
        data = data[data['genre'] != "NA"].reset_index()
        data['track_popularity'] = data['track_popularity'] / 100
        self.data = data

    def euclidean_distance(self, x, y):
        """Return the Euclidean distance between two equally shaped arrays."""
        return np.sqrt(np.sum((x - y) ** 2))

    def _seed_position(self):
        """Return the positional row of the seed track in ``self.data``.

        The first match is used when several rows share the title.

        Raises
        ------
        ValueError
            If no row has ``track_name == self.music``.
        """
        matches = np.flatnonzero((self.data['track_name'] == self.music).to_numpy())
        if matches.size == 0:
            raise ValueError(f"track {self.music!r} was not found in 'track_name'")
        return int(matches[0])

    def recommend_features(self, top=200):
        """Return the ``top`` tracks closest to the seed in min-max scaled audio features.

        Side effect: writes an ``euclidean_distance`` column to ``self.data``.

        Raises
        ------
        ValueError
            If the seed track is not present in ``self.data``.
        """
        # Look the seed up first so a missing title fails loudly instead of
        # producing an all-zero distance column.
        seed = self._seed_position()
        scaled = MinMaxScaler().fit_transform(self.data[self._AUDIO_COLUMNS])
        self.data['euclidean_distance'] = np.sqrt(
            np.sum((scaled - scaled[seed]) ** 2, axis=1)
        )
        nearest = self.data.sort_values(by='euclidean_distance', ascending=True)[:top]
        return nearest[['id', 'artist_name', 'track_name', 'euclidean_distance']]

    def recommend_genre(self, top=200):
        """Return the ``top`` tracks whose genre TF-IDF row is most similar to the seed's.

        Side effect: writes a ``cos_similarity`` column to ``self.data``.

        Raises
        ------
        ValueError
            If the seed track is missing, or if ``tfidf`` does not have exactly
            one row per row of ``self.data``.
        """
        seed = self._seed_position()
        tfidf = self.tfidf if hasattr(self.tfidf, 'shape') else np.asarray(self.tfidf)
        if tfidf.shape[0] != len(self.data):
            raise ValueError(
                f"tfidf has {tfidf.shape[0]} rows but data has {len(self.data)}; "
                "they must be row-aligned"
            )

        # Cosine similarity: only the seed's row is needed, so compare that one
        # row against the matrix instead of building the full N x N result.
        seed_row = tfidf.iloc[[seed]] if hasattr(tfidf, 'iloc') else tfidf[seed:seed + 1]
        # Attach the seed track's similarity to every row of the track table.
        self.data["cos_similarity"] = cosine_similarity(seed_row, tfidf).ravel()

        closest = self.data.sort_values(by="cos_similarity", ascending=False)[:top]
        return closest[['id', 'artist_name', 'track_name', 'cos_similarity']]

    def get_feature_genre_intersection(self):
        """Keep tracks present in both candidate lists and rank them by ``ratio``.

        ``ratio = euclidean_scaled + (1 - cosine_scaled)`` where both terms are
        min-max scaled over the intersection, so smaller means closer to the
        seed. The ranked frame (all ``self.data`` columns plus
        ``euclidean_scaled``, ``cosine_scaled`` and ``ratio``) is stored in
        ``self.result`` and returned.
        """
        by_feature = self.recommend_features()
        by_genre = self.recommend_genre()
        intersection = pd.merge(by_feature, by_genre, how='inner', on=self._KEY_COLUMNS)

        scaled = MinMaxScaler().fit_transform(
            intersection[['euclidean_distance', 'cos_similarity']]
        )
        intersection['euclidean_scaled'] = scaled[:, 0]
        intersection['cosine_scaled'] = scaled[:, 1]
        intersection['ratio'] = (
            intersection['euclidean_scaled'] + (1 - intersection['cosine_scaled'])
        )

        # Join back onto the full track table. The two similarity columns exist
        # on both sides (the recommend_* calls wrote them to self.data), so they
        # are part of the join key, as in an implicit common-column merge.
        join_columns = self._KEY_COLUMNS + ['euclidean_distance', 'cos_similarity']
        self.result = pd.merge(
            self.data, intersection, how='inner', on=join_columns
        ).sort_values(by='ratio')

        return self.result

    def get_genre_score(self):
        """Return each result row's cosine similarity to the seed as an ``(n, 1)`` array.

        Rows are in ``self.result`` order. The values come from the
        ``cos_similarity`` column that ``recommend_genre`` already computed, so
        they refer to the same tracks as the other score columns. Re-deriving
        them from the TF-IDF matrix with ``self.result``'s own row labels would
        pick unrelated matrix rows.
        """
        return self.result['cos_similarity'].to_numpy().reshape(-1, 1)

    def get_mood_score(self):
        """Return ``valence`` (``mood == 1``) or ``1 - valence`` as ``mood_score``."""
        valence = self.result['valence']
        score = valence if self.mood == 1 else 1 - valence
        return score.rename('mood_score')

    def get_speed_score(self):
        """Return min-max scaled ``tempo`` (``speed == 1``) or its complement as ``speed_score``."""
        tempo_scaled = pd.Series(
            MinMaxScaler().fit_transform(self.result[['tempo']]).ravel(),
            index=self.result.index,
            name='speed_score',
        )
        return tempo_scaled if self.speed == 1 else 1 - tempo_scaled

    def get_emotion_score(self):
        """Return ``emotion_score`` from danceability, energy and reversed acousticness.

        Danceability is min-max scaled over ``self.result``; acousticness is
        reversed as ``1 - acousticness``. ``emotion == 1`` averages the three,
        ``emotion == 2`` uses ``2/3`` of their summed absolute distances from
        0.5, and any other value averages their complements.
        """
        dance = pd.Series(
            MinMaxScaler().fit_transform(self.result[['danceability']]).ravel(),
            index=self.result.index,
        )
        energy = self.result['energy']
        acoustic_rev = 1 - self.result['acousticness']

        if self.emotion == 1:
            score = 1/3 * (dance + energy + acoustic_rev)
        elif self.emotion == 2:
            score = 2/3 * (
                (dance - 0.5).abs() + (energy - 0.5).abs() + (acoustic_rev - 0.5).abs()
            )
        else:
            score = 1/3 * ((1 - dance) + (1 - energy) + (1 - acoustic_rev))
        return score.rename('emotion_score')

    def get_total_score(self, top_n = 20):
        """Return the ``top_n`` recommendations ranked by ``total_score``.

        ``total_score = 1/6 * (mood + speed + emotion) + 0.5 * genre``.
        Requires ``get_feature_genre_intersection()`` to have populated
        ``self.result``. The seed track is excluded by name, not by assuming it
        is the first row.
        """
        scores = self.result[['artist_name', 'track_name', 'album_name']].copy()
        scores['mood_score'] = self.get_mood_score()
        scores['speed_score'] = self.get_speed_score()
        scores['emotion_score'] = self.get_emotion_score()
        # Positional assignment: get_genre_score() is in self.result row order.
        scores['genre_score'] = self.get_genre_score().ravel()
        scores['total_score'] = (
            1/6 * (scores['mood_score'] + scores['speed_score'] + scores['emotion_score'])
            + 0.5 * scores['genre_score']
        )

        candidates = scores[scores['track_name'] != self.music]
        return candidates.sort_values(by = 'total_score', ascending=False)[:top_n]


    


In [14]:
    # Load the track table (JSON records) and the TF-IDF matrix (CSV) from the
    # project's relative data directory.
    track = pd.read_json(
        "data/track/track_dataset.json",
        orient='records',
        encoding='utf-8',
    )
    ct_tfidf = pd.read_csv(
        "data/tfidf/tfidf_matrix.csv",
        encoding='utf-8',
    )

In [15]:
    # Build the recommender for the seed track, then run the pipeline in order:
    # preprocess -> candidate intersection -> final scoring (displayed as the
    # cell's output).
    cbr = ContentBasedRecommenderSystem(
        data=track,
        tfidf=ct_tfidf,
        music="SMILEY(Feat. BIBI)",
        mood=2,
        speed=2,
        emotion=2,
    )
    cbr.preprocess()
    cbr.get_feature_genre_intersection()
    cbr.get_total_score()

Unnamed: 0,artist_name,track_name,album_name,mood_score,speed_score,emotion_score,genre_score,total_score
44,BLACKPINK,How You Like That,THE ALBUM,0.656,0.494663,0.672716,1.0,0.803897
7,IVE,ELEVEN,ELEVEN,0.413,0.590474,0.663254,1.0,0.777788
4,(G)I-DLE,TOMBOY,I NEVER DIE,0.355,0.551841,0.696152,1.0,0.767166
5,PSY,That That (prod. & feat. SUGA of BTS),PSY 9th,0.094,0.495264,0.937937,1.0,0.754534
22,AKMU,"Hey kid, Close your eyes (with Lee Sun Hee)",NEXT EPISODE,0.463,0.495483,0.486797,1.0,0.74088
45,BLACKPINK,Kill This Love,KILL THIS LOVE,0.42,0.4761,0.462775,1.0,0.726479
28,Couch Peach,Cliché,Cliché,0.044,0.514275,0.620667,1.0,0.69649
2,BTS,Born Singer,Proof,0.316,0.228739,0.612002,1.0,0.69279
8,MeloMance,"Love, Maybe","Love, Maybe (A Business Proposal OST Special T...",0.369,0.098593,0.492667,1.0,0.660043
40,2NE1,IF I WERE YOU,CRUSH,0.376,0.0,0.439783,1.0,0.635964
