In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference
from fairlearn.postprocessing import ThresholdOptimizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Song-level table. genres: 'pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'
dataset = pd.read_csv('tcc_ceds_music.csv')
dataset['artist_name'] = dataset['artist_name'].str.lower()

# Artist-level table; its NAME column becomes the lower-cased join key
# so it lines up with dataset['artist_name'].
gender = pd.read_csv('gender.csv').rename(columns={'NAME': 'artist_name'})
gender['artist_name'] = gender['artist_name'].str.lower()

# Inner join: keep only songs whose artist appears in both tables.
combined = pd.merge(dataset, gender, how='inner', on='artist_name')

In [3]:
# Prepare datasets and combine them

def remove_stuff(df, column):
    """Strip list-literal punctuation and whitespace from a string column.

    Every occurrence of ``[``, ``]``, newline, carriage return, double quote
    and space is removed from the values of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    to_remove = ['[', ']', '\n', '\r', '"', ' ']
    cleaned = df[column]
    for item in to_remove:
        # '[' and ']' are regex metacharacters and the default of `regex`
        # differs between pandas versions, so request literal matching.
        cleaned = cleaned.str.replace(item, '', regex=False)
    # Single write-back instead of reassigning the column once per character.
    df[column] = cleaned
    return df

# Clean the raw GENRE strings, then split each one into a list of genre tags.
# remove_stuff works on `gender` itself, so gender_new is the same object.
gender_new = remove_stuff(gender, 'GENRE')
gender_new['GENRE'] = gender_new['GENRE'].str.split(',')
print(gender_new.head(2))

combined1 = pd.merge(dataset, gender_new, how='inner', on='artist_name')

# A song is kept only when its own genre matches (case-insensitively) one of
# the artist's genre tags; rows whose GENRE is not a list never match.
genre_matches = pd.Series(
    [
        isinstance(tags, list)
        and any(song_genre.lower() == tag.lower() for tag in tags)
        for song_genre, tags in zip(combined1['genre'], combined1['GENRE'])
    ],
    index=combined1.index,
    dtype=bool,
)

# Finally discard every row that still has a missing value in any column.
filtered_df = combined1[genre_matches].dropna()

   CHARTMETRIC_ID artist_name ARTIST_COUNTRY    PRONOUN GENDER IS_BAND  \
0        10862574    kay yang            NaN     he/him   male   False   
1        11051639   2002 mobb            NaN  they/them   male    True   

                                  GENRE  
0                              [Others]  
1  [dance, hip-hop/rap, alternativerap]  


In [4]:
#3971
# Collect every distinct artist-level genre tag that survived the filtering.
unique_genres = set()
for tags in filtered_df['GENRE']:
    unique_genres.update(tags)
print(unique_genres)

{'anime', 'doowop', 'j-pop', 'Blues', 'dance', 'soundtrack', 'edm', 'reggaeton', "Children'sMusic", 'countryhip-hop', 'blues', 'synthpop', 'hip-hop/rap', 'nightcore', 'ambient', 'Dance', 'Electronic', 'christmas', 'Classical', 'newwave', 'artpop', 'LatinDance', 'LatinPop', 'mena', 'northamerican', 'pop', 'indiepop', 'HardRock', 'rockabilly', 'punk', 'techhouse', 'cajun', 'Folk', 'Reggae', 'Rock', 'electronic', 'Jazz', 'chill', 'house', 'hardrock', 'honkytonk', 'classicrock', 'cafres', 'karneval', 'Pop', 'westcoasthip-hop', 'ppop', 'indie', 'holiday', 'funk', 'chansons', 'Country', 'statetrance', 'poppunk', 'reggae', 'Holiday', 'softjazz', 'jawaiian', 'emo', 'alternativerock', 'IndieRock', 'bachata', 'opm', 'R&B/Soul', 'metal', 'oldies', 'countrypop', 'outlawcountry', 'Hip-Hop/Rap', 'dancepop', 'softrock', 'protometal', 'country', 'latin', 'newage', 'singer/songwriter', 'jazz', 'k-pop', 'latinrock', 'Latin', 'folk', 'basshouse', 'newcountry', 'latinpop', 'reddirt', 'phonk', 'alternative

In [5]:
# filtered_df.columns
# Index(['Unnamed: 0', 'artist_name', 'track_name', 'release_date', 'genre',
#        'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time',
#        'shake the audience', 'family/gospel', 'romantic', 'communication',
#        'obscene', 'music', 'movement/places', 'light/visual perceptions',
#        'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
#        'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
#        'topic', 'age', 'CHARTMETRIC_ID', 'ARTIST_COUNTRY', 'PRONOUN', 'GENDER',
#        'IS_BAND', 'GENRE'],
#       dtype='object')

In [6]:
# Songs per genre: non-null count of the first column ('Unnamed: 0') in each group.
filtered_df.groupby('genre')['Unnamed: 0'].count()

genre
blues       231
country     736
jazz         87
pop        1692
reggae      161
rock       1064
Name: Unnamed: 0, dtype: int64

In [7]:
# Numeric columns used as the feature space for the neighbour search.
# NOTE(review): 'sadness' appears in filtered_df.columns but is not listed
# here — confirm the omission is intentional.
num_features = [
    'len', 'dating', 'violence', 'world/life', 'night/time',
    'shake the audience', 'family/gospel', 'romantic', 'communication',
    'obscene', 'music', 'movement/places', 'light/visual perceptions',
    'family/spiritual', 'like/girls', 'feelings', 'danceability',
    'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
    'age',
]

# Descriptive (non-numeric) columns; not used in the feature matrix below.
categor = [
    'artist_name', 'track_name', 'genre',
    'GENDER', 'topic', 'ARTIST_COUNTRY',
]

# Feature matrix: same rows and row order as filtered_df, numeric columns only.
filtered = filtered_df.filter(items=num_features)

# Cosine-distance nearest-neighbour index over all songs (default k = 6).
model = NearestNeighbors(metric='cosine', n_neighbors=6)
model.fit(filtered)

In [18]:
import random

# Module-level logs filled by recommend_songs_by_genre: one entry per
# recommended song (its neighbour distance and its artist name). They grow
# with every call; the function never clears them.
distances_list = []
names_list = []


def recommend_songs_by_genre(user_genre, model, filtered, df, num_features,
                             n_recommendations=50, seed=None):
    """Recommend the nearest neighbours of a random song from ``user_genre``.

    A random song of the requested genre is used as the query; its nearest
    neighbours are looked up in ``model`` across *all* fitted songs, so the
    result can contain songs from other genres.

    Parameters
    ----------
    user_genre : str
        Genre to draw the query song from (matched case-insensitively
        against ``df['genre']``).
    model : object
        Fitted neighbour index exposing ``kneighbors(X, n_neighbors=...)``
        that returns ``(distances, indices)`` with positional indices.
    filtered : pandas.DataFrame
        Feature matrix the model was fitted on. Must have the same rows in
        the same order as ``df``.
    df : pandas.DataFrame
        Song metadata with at least the columns ``artist_name``,
        ``track_name``, ``genre`` and ``GENDER``.
    num_features : list
        Unused; kept so existing calls keep working.
    n_recommendations : int, default 50
        Maximum number of songs to return.
    seed : int or None, default None
        When given, the query song is drawn from ``random.Random(seed)`` so
        the call is reproducible; otherwise the global ``random`` state is
        used.

    Returns
    -------
    pandas.DataFrame or None
        The recommended rows of ``df`` (columns ``artist_name``,
        ``track_name``, ``genre``, ``GENDER``), nearest first, or ``None``
        when no song of ``user_genre`` exists.

    Side effects
    ------------
    Appends the distance and artist name of every returned song to the
    module-level ``distances_list`` and ``names_list``.
    """
    # Work with row positions throughout: kneighbors reports positions in the
    # fitted matrix, so mixing them with index labels is error-prone.
    in_genre = (df['genre'].str.lower() == user_genre.lower()).fillna(False)
    genre_positions = np.flatnonzero(in_genre.to_numpy(dtype=bool)).tolist()

    if not genre_positions:
        print(f"No songs found for the genre '{user_genre}'. Please try a different genre.")
        return None

    # Pick the query song; a plain list keeps random.choice independent of
    # how pandas/numpy containers evaluate truthiness.
    rng = random if seed is None else random.Random(seed)
    query_position = rng.choice(genre_positions)
    input_features = filtered.iloc[[query_position]]

    # Ask for one extra neighbour (the query itself is normally returned),
    # but never more than the number of available rows.
    n_neighbors = min(n_recommendations + 1, len(filtered))
    distances, indices = model.kneighbors(input_features, n_neighbors=n_neighbors)
    distances = distances.ravel()
    indices = indices.ravel()

    # Drop the query song by position rather than assuming it is the first
    # hit: songs tied at distance 0 may be ranked ahead of it.
    is_other_song = indices != query_position
    recommended_positions = indices[is_other_song][:n_recommendations]
    recommended_distances = distances[is_other_song][:n_recommendations]

    # Log distance and artist for each recommendation.
    distances_list.extend(recommended_distances)
    names_list.extend(df['artist_name'].iloc[recommended_positions])

    # Metadata of the recommended songs, nearest first.
    return df.iloc[recommended_positions][['artist_name', 'track_name', 'genre', 'GENDER']]


In [19]:
# Pair each logged artist name with its neighbour distance.
# The two lists are filled by recommend_songs_by_genre, so this frame is
# empty until that function has been called — re-run this cell afterwards.
# Built without an intermediate variable named `dict`, which would shadow
# the builtin for the rest of the kernel session.
name_list_df = pd.DataFrame({'name': names_list, 'dist': distances_list})

In [20]:
# Ask for a genre, then fetch the neighbours of a random song from it.
user_genre = input("Enter a genre: ").strip()
recommendations = recommend_songs_by_genre(
    user_genre, model, filtered, filtered_df, num_features
)

# The recommender returns None when the genre has no songs.
if recommendations is None:
    print("No recommendations found.")
else:
    print("\nRecommended Songs in the genre:", user_genre)
    print(recommendations)



Recommended Songs in the genre: rock
                    artist_name                               track_name  \
2684   electric light orchestra                                 eldorado   
29815                pink floyd                            paranoid eyes   
2297               neil diamond                           if you go away   
30350                     queen                  you take my breath away   
5524               cyndi lauper                              true colors   
6199              tracy chapman                                  for you   
31234                        u2                   running to stand still   
11477               johnny cash                           here was a man   
22341           louis armstrong            you can't lose a broken heart   
2277                carole king  (you make me feel like) a natural woman   
30159               rod stewart                 the way you look tonight   
1322                  bob dylan               let 

GENDER \
male             2991 \
female            774 \
mixed             174 \
non-binary         13 \
androgynous         7 \
genderfluid         7 \
not specified       5 \
Name: count, dtype: int64

In [None]:
# Re-rank the recommendations so male and female artists get equal quotas.
if recommendations is None:
    raise ValueError(
        "No recommendations available; run the recommendation cell with a valid genre first."
    )

# Alternative (disabled): quotas proportional to each gender's share of the
# 50 neighbours, scaled to 10 slots.
# num_female = int(round(len(recommendations[recommendations['GENDER'] == 'female']) / 50 * 10))
# num_male = int(round(len(recommendations[recommendations['GENDER'] == 'male']) / 50 * 10))

#enforcing demographic parity, equal
num_female = 5
num_male = 5

print(f"Female num: {num_female}\nMale num:{num_male}")

# Match the genre case-insensitively, as the recommender does; a raw
# comparison would drop everything when the user typed e.g. "Rock".
in_user_genre = recommendations['genre'].str.lower() == user_genre.lower()
male_recommendations = recommendations[
    (recommendations['GENDER'] == 'male') & in_user_genre
].head(num_male)
female_recommendations = recommendations[
    (recommendations['GENDER'] == 'female') & in_user_genre
].head(num_female)

print(recommendations.head(5))
print(female_recommendations)


def calc_dist(df, name_list):
    """Placeholder for a distance calculation over ``df`` and ``name_list``.

    The original definition was an unfinished ``for`` loop with no body,
    which made the whole cell fail to parse. The intended logic is not
    shown, so calling this raises ``NotImplementedError`` until it is
    written.
    """
    # TODO: implement; presumably looks up a distance for each row of df — confirm.
    raise NotImplementedError("calc_dist has not been implemented yet")

Female num: 5
Male num:5
                    artist_name               track_name genre  GENDER
2684   electric light orchestra                 eldorado   pop    male
29815                pink floyd            paranoid eyes  rock    male
2297               neil diamond           if you go away   pop    male
30350                     queen  you take my breath away  rock    male
5524               cyndi lauper              true colors   pop  female
        artist_name        track_name genre  GENDER
31388  stevie nicks  leather and lace  rock  female
Empty DataFrame
Columns: [name, dist]
Index: []
