In [52]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference
from fairlearn.postprocessing import ThresholdOptimizer

In [3]:
dataset = pd.read_csv('tcc_ceds_music.csv')
dataset['artist_name'] = dataset['artist_name'].str.lower()

In [4]:
dataset['genre'].unique()

array(['pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'],
      dtype=object)

In [5]:
gender = pd.read_csv('gender.csv')
gender['NAME'] = gender['NAME'].str.lower()
gender.rename(columns={'NAME': 'artist_name'}, inplace=True)

In [7]:
combined = dataset.merge(right = gender, how = 'inner', on = 'artist_name')

In [10]:
bruh = pd.read_csv('gender.csv')
bruh[bruh['NAME'].str.lower() == 'drake']

Unnamed: 0,CHARTMETRIC_ID,NAME,ARTIST_COUNTRY,PRONOUN,GENDER,IS_BAND,GENRE
276558,627070,Drake,CL,they/them,,True,"[\r\n ""metal"",\r\n ""rock"",\r\n ""hard rock""\..."
545887,3380,Drake,CA,he/him,male,False,"[\r\n ""hip-hop/rap"",\r\n ""Hip-Hop/Rap"",\r\n ..."


In [11]:
#now, make sure genres are similar

In [12]:
def remove_stuff(df, column):
    to_remove = ['[',']','\n','\r',"\""," "]
    for item in to_remove:
        df[column] = df[column].str.replace(item, '')
    return df

gender_new = remove_stuff(gender, 'GENRE')

#Works!

In [13]:
#Split the GENRE into a list , separated by commas
gender_new['GENRE'] = gender_new['GENRE'].str.split(',')

In [15]:
combined1 = dataset.merge(right = gender_new, how = 'inner', on = 'artist_name')
combined1
filtered_df = combined1[combined1.apply(
        lambda row: any(row['genre'].lower() == g.lower() for g in row['GENRE']) if isinstance(row['GENRE'], list) else False, 
        axis=1
    )]
# Still missing the hip hop, since it is all hip-hop/rap

In [None]:
# filtered_df['genre'].unique()

array(['pop', 'country', 'blues', 'jazz', 'reggae', 'rock'], dtype=object)

In [18]:
#Okay, it's not great, but we have our dataset now. Fantastic! Come back and clean it up later.

In [44]:
filtered_df.loc[filtered_df['PRONOUN'] == 'he/him', 'GENDER'] = 'male'
filtered_df.loc[filtered_df['PRONOUN'] == 'she/her', 'GENDER'] = 'female'

In [47]:
filtered_df = filtered_df.dropna(subset=['GENDER'])

In [50]:
filtered_df.groupby('GENDER').count()

Unnamed: 0_level_0,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,instrumentalness,valence,energy,topic,age,CHARTMETRIC_ID,ARTIST_COUNTRY,PRONOUN,IS_BAND,GENRE
GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
female,2689,2689,2689,2689,2689,2689,2689,2689,2689,2689,...,2689,2689,2689,2689,2689,2689,2630,2689,2673,2689
male,10806,10806,10806,10806,10806,10806,10806,10806,10806,10806,...,10806,10806,10806,10806,10806,10806,10778,10806,10729,10806
mixed,174,174,174,174,174,174,174,174,174,174,...,174,174,174,174,174,174,174,174,174,174
non-binary,13,13,13,13,13,13,13,13,13,13,...,13,13,13,13,13,13,13,13,13,13
not specified,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


In [53]:
# list all of the features of the data set
filtered_df.columns

Index(['Unnamed: 0', 'artist_name', 'track_name', 'release_date', 'genre',
       'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time',
       'shake the audience', 'family/gospel', 'romantic', 'communication',
       'obscene', 'music', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
       'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
       'topic', 'age', 'CHARTMETRIC_ID', 'ARTIST_COUNTRY', 'PRONOUN', 'GENDER',
       'IS_BAND', 'GENRE'],
      dtype='object')

In [55]:
filtered_df.to_csv('filtered_df.csv', index=True)

In [56]:
# # Step 1: Load the Data
# data = filtered_df

# # Step 2: Preprocess Data
# # Select relevant features based on columns in your dataset, took out GENRE
# features = [
#     'artist_name', 'release_date', 'genre',
#        'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time',
#        'shake the audience', 'family/gospel', 'romantic', 'communication',
#        'obscene', 'music', 'movement/places', 'light/visual perceptions',
#        'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
#        'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
#        'topic', 'age', 'CHARTMETRIC_ID', 'ARTIST_COUNTRY', 'PRONOUN', 'GENDER',
#        'IS_BAND'
# ]
# target = 'track_name'  # "track_name" is the target variable in this dataset

# # Encode the target variable (topic)
# data[target] = LabelEncoder().fit_transform(data[target])

# # Step 3: Split Data
# # Using 'age' as a proxy for the sensitive attribute here
# X_train, X_test, y_train, y_test, sensitive_train, sensitive_test = train_test_split(
#     data[features], data[target], data['age'], test_size=0.2, random_state=42
# )

# # Step 4: Define and Train the Model
# model = KNeighborsClassifier(n_neighbors=5)
# model.fit(X_train, y_train)

# # Step 5: Fairness Evaluation
# y_pred = model.predict(X_test)

# # Using Fairlearn to assess fairness
# metric_frame = MetricFrame(
#     metrics={"accuracy": lambda y_true, y_pred: (y_true == y_pred).mean(),
#              "selection_rate": selection_rate},
#     y_true=y_test,
#     y_pred=y_pred,
#     sensitive_features=sensitive_test
# )

# # Display metrics for each group
# print("Accuracy per group (based on age):", metric_frame.by_group["accuracy"])
# print("Selection rate per group (based on age):", metric_frame.by_group["selection_rate"])
# print("Demographic Parity Difference (based on age):", demographic_parity_difference(y_true=y_test, y_pred=y_pred, sensitive_features=sensitive_test))


In [10]:
import pandas as pd
import numpy as np
import random
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# Step 1: Adjust parameters
file_path = "filtered_df.csv"  # Adjust this path to where your CSV file is located
sample_fraction = 0.05  # Fraction of rows to sample (adjust as needed)

# Step 2: Define the columns to load and the numerical features to use
selected_features = [
    "danceability", "energy", "valence", "acousticness", "instrumentalness",
    "loudness", "age"
]
essential_columns = ['artist_name', 'track_name', 'genre', 'GENDER', 'ARTIST_COUNTRY'] + selected_features

# Load the dataset
def load_sample(file_path, sample_fraction, columns):
    print("Loading and sampling data...")
    return pd.read_csv(
        file_path,
        usecols=columns,
        skiprows=lambda i: i > 0 and random.random() > sample_fraction
    )

df_sample = load_sample(file_path, sample_fraction, essential_columns)

# Check if data is loaded
if df_sample.empty:
    print("No data loaded. Please check the sample fraction or file path.")
    exit()

# Get user input for genre
user_genre = input("Enter a genre: ").strip().lower()
df_genre_filtered = df_sample[df_sample['genre'].str.lower() == user_genre].copy()

if df_genre_filtered.empty:
    print(f"No songs found for the genre '{user_genre}'. Please try a different genre.")
    exit()

print(f"Found {len(df_genre_filtered)} songs in the '{user_genre}' genre.")

# Reset the index
df_genre_filtered.reset_index(drop=True, inplace=True)

# Normalize the features
scaler = MinMaxScaler()
df_genre_filtered.loc[:, selected_features] = scaler.fit_transform(df_genre_filtered[selected_features])

# Apply PCA
pca = PCA(n_components=5)
reduced_features = pca.fit_transform(df_genre_filtered[selected_features])

# Train Nearest Neighbors model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=6)
model_knn.fit(reduced_features)

# Function to recommend songs
def recommend_songs(track_id, model, reduced_features, df, top_n=20):
    track_index = df.index[df['track_name'] == track_id].tolist()[0]
    distances, indices = model.kneighbors([reduced_features[track_index]], n_neighbors=top_n + 1)
    recommended_indices = indices.flatten()[1:]
    return df.iloc[recommended_indices][['artist_name', 'track_name', 'genre', 'GENDER', 'ARTIST_COUNTRY']]

# Function to calculate fairness metrics
def calculate_fairness(df, recommendations):
    # Calculate Demographic Parity Ratio (DPR) and Mean Difference (MD) for GENDER
    gender_counts = recommendations['GENDER'].value_counts(normalize=True)
    if len(gender_counts) > 1:
        dpr_gender = gender_counts.min() / gender_counts.max()
        md_gender = abs(gender_counts.diff().dropna().iloc[0])
    else:
        dpr_gender, md_gender = 1, 0

    # Calculate Demographic Parity Ratio (DPR) and Mean Difference (MD) for ARTIST_COUNTRY
    country_counts = recommendations['ARTIST_COUNTRY'].value_counts(normalize=True)
    if len(country_counts) > 1:
        dpr_country = country_counts.min() / country_counts.max()
        md_country = abs(country_counts.diff().dropna().iloc[0])
    else:
        dpr_country, md_country = 1, 0

    return {
        'DPR (Gender)': dpr_gender,
        'MD (Gender)': md_gender,
        'DPR (Country)': dpr_country,
        'MD (Country)': md_country
    }

# Generate recommendations
if len(df_genre_filtered) > 0:
    sample_track_id = df_genre_filtered['track_name'].iloc[0]
    recommendations = recommend_songs(sample_track_id, model_knn, reduced_features, df_genre_filtered)
    
    # Display recommendations
    print("\nRecommended Songs:")
    print(recommendations)

    # Calculate and display fairness metrics
    fairness_metrics = calculate_fairness(df_genre_filtered, recommendations)
    print("\nFairness Metrics:")
    for metric, value in fairness_metrics.items():
        print(f"{metric}: {value:.3f}")
else:
    print("No data available after filtering by genre.")


Loading and sampling data...
Found 69 songs in the 'rock' genre.

Recommended Songs:
          artist_name  track_name genre  GENDER ARTIST_COUNTRY
15        the beatles        4795  rock    male             GB
4       elvis presley        4969  rock    male             US
11          the kinks        3766  rock    male             GB
18         pink floyd         613  rock    male             GB
3       elvis presley        6158  rock    male             US
5    barbra streisand        5899  rock  female             US
26       james taylor        4354  rock    male             US
25          meat loaf        2615  rock    male             US
47      lionel richie         720  rock    male             US
61      mark knopfler        2908  rock    male             GB
7       elvis presley        2706  rock    male             US
2       elvis presley        8646  rock    male             US
20     paul mccartney        7099  rock    male             GB
14         pink floyd        9056

In [12]:
import pandas as pd
import numpy as np

start = 0
stop = 0

def simulate_fairness_metrics(n, df, model, reduced_features):
    """
    Run n simulations of the recommendation system and calculate average fairness metrics.
    
    Parameters:
    n (int): Number of simulations to run
    df (DataFrame): The filtered DataFrame used for recommendations
    model (NearestNeighbors): Trained Nearest Neighbors model
    reduced_features (ndarray): PCA-transformed feature set
    
    Returns:
    dict: Averages of fairness metrics over n simulations
    """
    
    def recommend_songs(track_id, model, reduced_features, df, top_n=5):
        # Get the index of the given track
        track_index = df.index[df['track_name'] == track_id].tolist()[0]
        distances, indices = model.kneighbors([reduced_features[track_index]], n_neighbors=top_n + 1)
        recommended_indices = indices.flatten()[1:]
        return df.iloc[recommended_indices][['artist_name', 'track_name', 'genre', 'GENDER', 'ARTIST_COUNTRY']]
    
    def calculate_fairness(recommendations):
        # Calculate Demographic Parity Ratio (DPR) and Mean Difference (MD) for GENDER
        gender_counts = recommendations['GENDER'].value_counts(normalize=True)
        if len(gender_counts) > 1:
            dpr_gender = gender_counts.min() / gender_counts.max()
            md_gender = abs(gender_counts.diff().dropna().iloc[0])
        else:
            dpr_gender, md_gender = 1, 0

        # Calculate Demographic Parity Ratio (DPR) and Mean Difference (MD) for ARTIST_COUNTRY
        country_counts = recommendations['ARTIST_COUNTRY'].value_counts(normalize=True)
        if len(country_counts) > 1:
            dpr_country = country_counts.min() / country_counts.max()
            md_country = abs(country_counts.diff().dropna().iloc[0])
        else:
            dpr_country, md_country = 1, 0

        return {
            'DPR (Gender)': dpr_gender,
            'MD (Gender)': md_gender,
            'DPR (Country)': dpr_country,
            'MD (Country)': md_country
        }
    
    # Initialize sums of metrics
    total_metrics = {
        'DPR (Gender)': 0,
        'MD (Gender)': 0,
        'DPR (Country)': 0,
        'MD (Country)': 0
    }
    
    # Run n simulations
    for _ in range(n):
        # Randomly select a track to generate recommendations
        sample_track_id = df['track_name'].sample(1).iloc[0]
        recommendations = recommend_songs(sample_track_id, model, reduced_features, df)
        
        # Calculate fairness metrics for this simulation
        metrics = calculate_fairness(recommendations)
        
        # Accumulate the metrics
        for key in total_metrics:
            total_metrics[key] += metrics[key]
    
    # Calculate averages
    avg_metrics = {key: value / n for key, value in total_metrics.items()}
    
    return avg_metrics

# Example usage:
n_simulations = 10000
start = time.time()
avg_fairness_metrics = simulate_fairness_metrics(n_simulations, df_genre_filtered, model_knn, reduced_features)
stop = time.time()
print(f"\nAverage Fairness Metrics over {n_simulations} simulations:")
for metric, value in avg_fairness_metrics.items():
    print(f"{metric}: {value:.3f}")

print(f"Time taken: {stop-start} seconds")



Average Fairness Metrics over 10000 simulations:
DPR (Gender): 0.805
MD (Gender): 0.155
DPR (Country): 0.547
MD (Country): 0.281
Time taken: 17.239089965820312 seconds


# Results
Average Fairness Metrics over 1000 simulations: \
DPR (Gender): 0.630 \
MD (Gender): 0.259 \
DPR (Country): 0.549 \
MD (Country): 0.289 \
 \
Average Fairness Metrics over 10000 simulations: \
DPR (Gender): 0.642 \
MD (Gender): 0.250 \
DPR (Country): 0.536 \
MD (Country): 0.296 