In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference
from fairlearn.postprocessing import ThresholdOptimizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Song table. genres: 'pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'
dataset = pd.read_csv('tcc_ceds_music.csv')
dataset = dataset.assign(artist_name=dataset['artist_name'].str.lower())

# Artist table. NAME is lower-cased and renamed so that both tables share
# the `artist_name` join key.
gender = pd.read_csv('gender.csv')
gender = (
    gender
    .assign(NAME=gender['NAME'].str.lower())
    .rename(columns={'NAME': 'artist_name'})
)

# Inner join: keeps only songs whose artist appears in the artist table.
combined = dataset.merge(gender, on='artist_name', how='inner')

In [3]:
# Prepare datasets and combine them

def remove_stuff(df, column):
    """Strip list-literal punctuation and whitespace from a string column.

    Removes every occurrence of ``[``, ``]``, newline, carriage return,
    double quote and space from ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``column`` cleaned.
        Pass ``df.copy()`` if the original must stay untouched.
    """
    to_remove = ['[', ']', '\n', '\r', '"', ' ']
    cleaned = df[column]
    for token in to_remove:
        # regex=False: tokens such as '[' are regex metacharacters, and the
        # default value of `regex` differs between pandas versions, so be
        # explicit that these are literal substrings.
        cleaned = cleaned.str.replace(token, '', regex=False)
    df[column] = cleaned
    return df

# Clean a copy: remove_stuff() edits the frame it receives, and `gender` has
# to keep its raw string GENRE column so that this cell can be re-run
# without first re-running the loading cell.
gender_new = remove_stuff(gender.copy(), 'GENRE')

# "a,b,c" -> ['a', 'b', 'c']
gender_new['GENRE'] = gender_new['GENRE'].str.split(',')
print(gender_new.head(2))

combined1 = dataset.merge(right=gender_new, how='inner', on='artist_name')


def _genre_matches(song_genre, artist_genres):
    """Return True if the song genre equals, ignoring case, one of the artist's genres."""
    # Missing GENRE values are not lists after the split; treat them as "no match".
    if not isinstance(artist_genres, list):
        return False
    target = song_genre.lower()
    return any(target == tag.lower() for tag in artist_genres)


# Keep only songs whose own genre is among the genres listed for the artist.
# Built as an index-aligned boolean Series so it also works on an empty frame.
keep_mask = pd.Series(
    [_genre_matches(sg, ag) for sg, ag in zip(combined1['genre'], combined1['GENRE'])],
    index=combined1.index,
    dtype=bool,
)
filtered_df = combined1[keep_mask]

# Drop every row that has a missing value in any column.
filtered_df = filtered_df.dropna()

   CHARTMETRIC_ID artist_name ARTIST_COUNTRY    PRONOUN GENDER IS_BAND  \
0        10862574    kay yang            NaN     he/him   male   False   
1        11051639   2002 mobb            NaN  they/them   male    True   

                                  GENRE  
0                              [Others]  
1  [dance, hip-hop/rap, alternativerap]  


In [4]:
# 3971 -- presumably the row count of filtered_df at this point (TODO confirm).
# Gather every distinct artist-level genre tag found in the GENRE lists.
unique_genres = set()
for genre_tags in filtered_df['GENRE']:
    unique_genres.update(genre_tags)
print(unique_genres)

{'caribbeanfolk', 'k-pop', 'classical', 'soundtrack', 'doowop', 'mena', 'r&b', 'singer/songwriter', 'dubstep', 'metal', 'j-pop', 'opm', 'hip-hop/rap', 'countrypop', 'edm', 'jawaiian', 'emo', 'protometal', 'Folk', 'Dancehall', 'Hip-Hop/Rap', 'reggaeton', 'Classical', 'softrock', 'Electronic', 'pop', 'Reggae', 'Country', 'indiepop', 'karneval', 'cafres', 'artpop', 'Alternative', 'latinrock', 'r&b/soul', 'bachata', 'latin', 'northamerican', 'country', 'anime', 'Blues', 'alternative', 'indie', 'poppunk', 'electronic', "Children'sMusic", 'reddirt', 'ambient', 'bossanova', 'lofi', 'house', 'hardrock', 'IndieRock', 'basshouse', 'punk', 'Latin', 'techhouse', 'softjazz', 'christmas', 'chill', 'statetrance', 'countryhip-hop', 'classicrock', 'synthpop', 'phonk', 'blues', 'alternativerock', 'LatinDance', 'Rock', 'Jazz', 'latinpop', 'newwave', 'newage', 'folk', 'holiday', 'reggae', 'cajun', "children'smusic", 'honkytonk', 'dance', 'nightcore', 'Dance', 'dancepop', 'R&B/Soul', 'LatinPop', 'westcoast

In [5]:
# filtered_df.columns
# Index(['Unnamed: 0', 'artist_name', 'track_name', 'release_date', 'genre',
#        'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time',
#        'shake the audience', 'family/gospel', 'romantic', 'communication',
#        'obscene', 'music', 'movement/places', 'light/visual perceptions',
#        'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
#        'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
#        'topic', 'age', 'CHARTMETRIC_ID', 'ARTIST_COUNTRY', 'PRONOUN', 'GENDER',
#        'IS_BAND', 'GENRE'],
#       dtype='object')

In [6]:
# Per-genre row counts; the first counted column stands in for the group size.
per_genre_counts = filtered_df.groupby('genre').count()
per_genre_counts.iloc[:, 0]

genre
blues       231
country     736
jazz         87
pop        1692
reggae      161
rock       1064
Name: Unnamed: 0, dtype: int64

In [7]:
# Numeric columns used as the feature space for the nearest-neighbour search.
# NOTE(review): 'sadness' is listed among filtered_df's columns but is not
# included here -- confirm whether that omission is intentional.
num_features = ['len', 'dating', 'violence', 'world/life', 'night/time',
       'shake the audience', 'family/gospel', 'romantic', 'communication',
       'obscene', 'music', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'like/girls','feelings', 'danceability',
       'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
        'age']

# Non-numeric columns; not used by the model in this cell.
categor = ['artist_name', 'track_name', 'genre', 'GENDER', 'topic','ARTIST_COUNTRY']

# Select with [] rather than .filter(): .filter(items=...) silently skips
# labels that do not exist, so a typo in num_features would quietly shrink
# the feature matrix. [] raises KeyError instead.
filtered = filtered_df[num_features]

# NOTE(review): the features are fed to the model unscaled. If 'len' or 'age'
# live on a much larger scale than the other columns they will dominate the
# cosine distance -- verify, and consider standardising first.
model = NearestNeighbors(n_neighbors=6, metric='cosine')
model.fit(filtered)

In [74]:
import random

# Side-channel record of the most recent recommend_songs_by_genre() call.
# The three lists are parallel: one entry per recommended song, in rank order.
distances_list = []
names_list = []
artist_list = []


def recommend_songs_by_genre(user_genre, model, filtered, df, num_features,
                             n_recommendations=50, random_state=None):
    """Recommend the nearest neighbours of a random song from ``user_genre``.

    A seed song is drawn at random from the rows of ``df`` whose ``genre``
    equals ``user_genre`` (case-insensitive). Its feature row is looked up in
    ``filtered`` and passed to ``model.kneighbors``. The neighbour search runs
    over everything the model was fitted on, so the returned songs are NOT
    restricted to ``user_genre``; only the seed is.

    Parameters
    ----------
    user_genre : str
        Genre used to pick the seed song.
    model : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)`` that
        returns ``(distances, positions)``. Positions are interpreted as row
        positions of both ``filtered`` and ``df``.
    filtered : pandas.DataFrame
        Feature matrix, sharing index labels and row order with ``df``.
    df : pandas.DataFrame
        Song metadata with at least the columns ``artist_name``,
        ``track_name``, ``genre`` and ``GENDER``.
    num_features : list
        Unused; kept so existing callers keep working.
    n_recommendations : int, default 50
        Maximum number of songs to return.
    random_state : int or None, default None
        Seed for the seed-song draw. ``None`` uses the global ``random``
        state, as before.

    Returns
    -------
    pandas.DataFrame or None
        Recommended songs (``artist_name``, ``track_name``, ``genre``,
        ``GENDER``) ordered as returned by the model, or ``None`` when no
        song of ``user_genre`` exists.

    Side effects
    ------------
    ``distances_list``, ``names_list`` and ``artist_list`` are emptied and
    refilled so they always describe this call only.
    """
    # Clear in place rather than rebinding, so references held elsewhere stay
    # valid, and so repeated calls do not pile up stale entries.
    distances_list.clear()
    names_list.clear()
    artist_list.clear()

    # Filter
    genre_rows = df[df['genre'].str.lower() == user_genre.lower()]
    if genre_rows.empty:
        print(f"No songs found for the genre '{user_genre}'. Please try a different genre.")
        return None

    # Random song. Draw from a plain list: random.choice is only specified
    # for sequences, not for pandas Index objects.
    picker = random if random_state is None else random.Random(random_state)
    random_index = picker.choice(genre_rows.index.tolist())

    # Get features
    input_features = filtered.loc[[random_index]]
    # Row position(s) of the seed, used below to drop it from its own results.
    seed_positions = set(np.flatnonzero(filtered.index == random_index).tolist())

    # Find near neighbours. Ask for one extra so the seed can be dropped, but
    # never for more neighbours than there are fitted samples.
    n_fitted = getattr(model, 'n_samples_fit_', len(filtered))
    n_query = min(n_recommendations + 1, n_fitted)
    distances, indices = model.kneighbors(input_features, n_neighbors=n_query)

    # Exclude the seed by position instead of assuming it is the first hit:
    # with tied distances (e.g. duplicate feature rows) it may rank elsewhere.
    ranked = [
        (int(pos), float(dist))
        for pos, dist in zip(np.ravel(indices), np.ravel(distances))
        if int(pos) not in seed_positions
    ][:n_recommendations]

    recommended_positions = [pos for pos, _ in ranked]
    recommended = df.iloc[recommended_positions]

    distances_list.extend(dist for _, dist in ranked)
    names_list.extend(recommended['track_name'].tolist())
    artist_list.extend(recommended['artist_name'].tolist())

    return recommended[['artist_name', 'track_name', 'genre', 'GENDER']]


In [75]:
# Ask for a genre, then fetch recommendations seeded from a random song in it.
user_genre = input("Enter a genre: ").strip()
recommendations = recommend_songs_by_genre(user_genre, model, filtered, filtered_df, num_features)

if recommendations is None:
    print("No recommendations found.")
else:
    print("\nRecommended Songs in the genre:", user_genre)
    print(recommendations)



Recommended Songs in the genre: rock
                    artist_name  \
28998                billy joel   
11191              kenny rogers   
1240                       cher   
11795              kenny rogers   
29029                  bee gees   
28997               rod stewart   
29357                billy joel   
30141              stevie nicks   
4124            tears for fears   
10502              dolly parton   
21931           aretha franklin   
10506              dolly parton   
2090             the beach boys   
10928              dolly parton   
3055                  bob dylan   
29219             fleetwood mac   
29137             elvis presley   
29965                air supply   
29677                     queen   
6090                   coldplay   
12161               johnny cash   
29263                billy joel   
2384   electric light orchestra   
29313             jimmy buffett   
11450               johnny cash   
12332              dolly parton   
1982             

In [76]:
# Distance table for the recommended songs, one row per entry of the parallel
# lists filled by recommend_songs_by_genre(). Built directly instead of via a
# variable called `dict`, which would shadow the builtin for the rest of the
# kernel session.
name_list_df = pd.DataFrame({
    'track_name': names_list,
    'dist': distances_list,
    'artist_name': artist_list,
})

GENDER \
male             2991 \
female            774 \
mixed             174 \
non-binary         13 \
androgynous         7 \
genderfluid         7 \
not specified       5 \
Name: count, dtype: int64

In [None]:
# Enforce demographic parity by taking an equal number of male and female
# recommendations, then compare their average distance to the seed song with
# that of the unconstrained top of the list.
num_female = 5
num_male = 5

male_recommendations = recommendations[recommendations['GENDER'] == 'male'].head(num_male)
female_recommendations = recommendations[recommendations['GENDER'] == 'female'].head(num_female)

# Join on artist AND track: track titles alone are not unique, so a
# title-only merge can attach another song's distance and duplicate rows.
# De-duplicate the distance table first so each song contributes one distance.
merge_keys = ['artist_name', 'track_name']
song_distances = name_list_df.drop_duplicates(subset=merge_keys, keep='last')

male_merge = male_recommendations.merge(right=song_distances, how='inner', on=merge_keys)
female_merge = female_recommendations.merge(right=song_distances, how='inner', on=merge_keys)
# Baseline: the same total number of songs, taken straight from the top.
overall_merge = recommendations.head(num_male + num_female).merge(
    right=song_distances, how='inner', on=merge_keys
)

# .mean() of an empty selection is NaN, e.g. when a gender has no recommendations.
male_dist = male_merge['dist'].mean()
female_dist = female_merge['dist'].mean()
overall_dist = overall_merge['dist'].mean()

print(f"Male average distance: {male_dist:.6f}\nFemale average distance: {female_dist:.6f}\nOverall average distance: {overall_dist:.6f}")


Male average distance: 0.000026
Female average distance: 0.000035
Overall average distance: 0.000032
