In [17]:
import pandas as pd
from itertools import combinations

# Load the survey export into `df`; later cells add one 0/1 column per genre
# and sub-genre to this same frame.
df = pd.read_csv('musicPrefs.csv')

# Data Preprocessing

In [18]:
# Genre taxonomy: each main genre maps to the sub-genres looked up in its
# follow-up survey column.
main_genres = {
    'Latin Music': ['Reggae', 'Bachata', 'Salsa', 'Flamenco', 'Tango', 'Samba'],
    'Pop': ['Alternative Pop', 'Indie Pop', 'Disco Pop', 'K-pop', 'J-pop'],
    'Rock': ['Classic Rock', 'Indie Rock', 'Alternative Rock', 'Grunge Rock', 'Metal', 'Hard Rock', 'Heavy Metal'],
    'Jazz': ['Smooth Jazz', 'Swing', 'Blues', 'Ragtime', 'Bepop', 'Bossa Nova'],
    'Film': ['Musicals', 'Soundtracks', 'Bollywood', 'Tollywood', 'Show Musics'],
    'EDM': ['Techno', 'House', 'Dubstep'],
    'Classical': ['Instrumental', 'Opera', 'Orchestrals', 'Organ Music']
}

# Column holding the ', '-separated list of main genres a respondent selected.
genre_question = 'What are the music genres you listen to?'
columns_to_remove = [genre_question]

for main_genre, sub_genres in main_genres.items():
    sub_question = f'If you selected {main_genre}, which of these sub-genres do you listen to?'
    columns_to_remove.append(sub_question)

    # Indicator for the main genre itself (exact match against the split answer).
    listens_to_main = df[genre_question].apply(lambda answer: main_genre in str(answer).split(', '))
    df[main_genre] = listens_to_main.astype('int64')

    # Split every follow-up answer once and reuse it for each sub-genre.
    sub_choices = df[sub_question].apply(lambda answer: str(answer).split(', '))
    for sub_genre in sub_genres:
        listens_to_sub = sub_choices.apply(lambda choices: sub_genre in choices)
        # A sub-genre only counts when its main genre was selected as well.
        df[sub_genre] = (listens_to_main & listens_to_sub).astype('int64')

# The raw free-text answer columns are no longer needed once encoded.
df = df.drop(columns=columns_to_remove)
df

Unnamed: 0,Timestamp,Latin Music,Reggae,Bachata,Salsa,Flamenco,Tango,Samba,Pop,Alternative Pop,...,Show Musics,EDM,Techno,House,Dubstep,Classical,Instrumental,Opera,Orchestrals,Organ Music
0,11/17/2023 21:28:43,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,11/17/2023 21:51:01,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,11/17/2023 22:48:36,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,11/17/2023 23:41:55,0,0,0,0,0,0,0,1,0,...,0,1,1,1,0,0,0,0,0,0
4,11/18/2023 0:51:32,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,12/2/2023 11:57:31,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
141,12/2/2023 12:01:23,0,0,0,0,0,0,0,1,0,...,0,1,1,1,1,0,0,0,0,0
142,12/2/2023 12:24:21,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
143,12/2/2023 12:27:18,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


# Apriori - Frequent Itemsets

In [19]:
# FrequentItemsets: List of sets

def candidateGen(FrequentItemsets, k):
    """Generate candidate (k+1)-itemsets from frequent k-itemsets (Apriori join + prune).

    Parameters
    ----------
    FrequentItemsets : list of set
        Frequent itemsets found in the previous pass.
    k : int
        Size of the itemsets to join; itemsets of any other size are ignored.

    Returns
    -------
    list of set
        Unique candidates of size ``k + 1``, in the order they are first
        generated, whose every k-item subset is present in ``FrequentItemsets``.
    """
    # Hashable view of the frequent k-itemsets so subset checks are O(1).
    known = {frozenset(f) for f in FrequentItemsets if len(f) == k}

    candidates = []
    seen = set()
    for f1, f2 in combinations(FrequentItemsets, 2):
        if not (len(f1) == len(f2) == k):
            continue
        merged = f1 | f2
        # Join step: the two itemsets must differ in exactly one item.
        if len(merged) != k + 1:
            continue
        key = frozenset(merged)
        if key in seen:
            continue
        seen.add(key)
        # Prune step: a candidate can only be frequent if all of its k-item
        # subsets are frequent. The previous check compared a single item
        # against the list of sets and therefore never pruned anything.
        if all(key - {item} in known for item in key):
            candidates.append(merged)
    return candidates
def getSupport(itemset, T):
    """Return the fraction of transactions in ``T`` that contain every item of ``itemset``.

    Parameters
    ----------
    itemset : collection
        Items that must all be present in a transaction for it to count.
    T : list
        Transactions; each one must support the ``in`` operator.

    Returns
    -------
    float
        Matching transactions divided by ``len(T)``; ``0.0`` when ``T`` is empty.
    """
    total = len(T)
    if total == 0:
        # No transactions at all: report zero support instead of dividing by zero.
        return 0.0
    matches = sum(1 for basket in T if all(item in basket for item in itemset))
    return matches / total

# Module-level log filled by Apriori(): one row per frequent itemset of size >= 2,
# holding the itemset, convertFunction(itemset) and its support.
freq_Itemsets_Support = pd.DataFrame(columns=['itemset', 'skyline_items', 'support'])

def Apriori(T, I, minSup, convertFunction):
    """Level-wise frequent-itemset search that keeps only the "skyline" itemsets.

    Parameters
    ----------
    T : list
        Transactions, passed through to ``getSupport``.
    I : iterable
        Candidate single items.
    minSup : float
        Minimum support an itemset needs to be kept.
    convertFunction : callable
        Applied to each frequent itemset of size >= 2 to build the
        'skyline_items' value logged next to it.

    Returns
    -------
    list of set
        Frequent itemsets, minus every itemset that is one item smaller than
        (and contained in) a frequent itemset found on the next level.

    Side effects
    ------------
    Appends one row per frequent itemset of size >= 2 to the module-level
    ``freq_Itemsets_Support`` frame. Single-item sets are not logged there.
    """
    frequent = {1: [{item} for item in I if getSupport([item], T) >= minSup]}
    candidates = {}
    res = list(frequent[1])
    k = 2
    while frequent[k - 1] != []:
        candidates[k] = candidateGen(frequent[k - 1], k - 1)
        supports = [getSupport(candidate, T) for candidate in candidates[k]]

        frequent[k] = []
        for candidate, support in zip(candidates[k], supports):
            if support >= minSup:
                frequent[k].append(candidate)
                next_row = len(freq_Itemsets_Support.index)
                freq_Itemsets_Support.loc[next_row] = [candidate, convertFunction(candidate), support]

        res.extend(frequent[k])

        # Skyline step: every (k-1)-subset of a frequent k-itemset is dominated
        # by it, so drop those subsets from the result.
        for itemset in frequent[k]:
            members = list(itemset)
            for dropped in members:
                subset = set(members)
                subset.remove(dropped)
                if subset in res:
                    res.remove(subset)
        k += 1
    return res

In [20]:
# One basket per row of `df`: the names of all columns whose value equals 1.
T = [list(df.columns[row == 1]) for _, row in df.iterrows()]
# Every column name is offered to Apriori as a candidate item.
I = df.columns.tolist()

def convertGenre(ids):
    """Join the given item names into a single label separated by ' and '."""
    separator = ' and '
    return separator.join(ids)

# min_sup_values = [0.05, 0.10, 0.15, 0.20]
# for minSup in min_sup_values:
#     print(f"Running Apriori with minSup: {minSup}")
#     res = Apriori(T, I, minSup, convertGenre)
#     print(f"Number of Frequent Itemsets: {len(res)}")


In [21]:
# Mine itemsets at a minimum support of 0.1. Besides returning `res`, Apriori()
# appends rows to the module-level freq_Itemsets_Support frame.
res = Apriori(T, I, 0.1, convertGenre)

For a balanced approach, where we get a reasonable number of itemsets (genre listening patterns) without being overwhelmed, 0.1 seems like a good choice. This will likely give us a mix of common and somewhat less common (but interesting) patterns.

# Association Rules

In [22]:
# Keep only the logged itemsets that Apriori returned in `res`, then renumber the rows.
in_result = freq_Itemsets_Support['itemset'].isin(res)
freq_Itemsets_Support = freq_Itemsets_Support.loc[in_result].reset_index(drop=True)
freq_Itemsets_Support

Unnamed: 0,itemset,skyline_items,support
0,"{Bachata, Latin Music}",Bachata and Latin Music,0.103448
1,"{Jazz, Latin Music}",Jazz and Latin Music,0.103448
2,"{Swing, Jazz}",Swing and Jazz,0.124138
3,"{Bossa Nova, Jazz}",Bossa Nova and Jazz,0.117241
4,"{EDM, Jazz}",EDM and Jazz,0.103448
...,...,...,...
89,"{Instrumental, Indie Pop, Film, Pop, Alternati...",Instrumental and Indie Pop and Film and Pop an...,0.103448
90,"{Alternative Rock, Instrumental, Indie Pop, Po...",Alternative Rock and Instrumental and Indie Po...,0.110345
91,"{Alternative Rock, Indie Pop, Film, Pop, Sound...",Alternative Rock and Indie Pop and Film and Po...,0.110345
92,"{Instrumental, Indie Pop, Pop, Classic Rock, A...",Instrumental and Indie Pop and Pop and Classic...,0.103448


In [23]:
def confidence(f, s, T):
    """Confidence of a rule: support of the full itemset over support of its left side.

    For a rule ``ab -> c`` this is how often ``abc`` is seen relative to how
    often ``ab`` is seen.

    Parameters
    ----------
    f : iterable
        The full itemset (left side plus right side of the rule).
    s : collection
        The left side (antecedent) of the rule.
    T : list
        Transactions, passed through to ``getSupport``.

    Returns
    -------
    float or int
        ``getSupport(f, T) / getSupport(s, T)``, or ``0`` when that division
        is by zero (the left side never occurs, or ``T`` is empty).
    """
    try:
        return getSupport(list(f), T) / getSupport(s, T)
    except ZeroDivisionError:
        # Only the zero-support case is expected here; any other error is a
        # real problem and is allowed to propagate instead of being hidden.
        return 0

In [24]:
# Empty frame with one column per rule field.
# NOTE(review): no other visible cell reads or fills this frame — confirm it is still needed.
associationRulesConf = pd.DataFrame(columns=['itemset', 'leftSide', 'rightSide', 'conf'])

In [28]:
# Rule generation over a flat list of itemsets (such as the list Apriori returns).
def genRules(frequentItemsets, minConf, convertFunction, T):
    """Build single-consequent association rules from the given itemsets.

    For every itemset with at least two items, each item in turn is used as
    the right side and the remaining items as the left side.

    Parameters
    ----------
    frequentItemsets : iterable of set
        Itemsets to derive rules from.
    minConf : float
        Minimum confidence a rule needs to be kept.
    convertFunction : callable
        Accepted for signature compatibility; not used by this function.
    T : list
        Transactions, passed through to ``confidence``.

    Returns
    -------
    list of tuple
        ``(itemset, left_side_list, [right_item], confidence)`` per kept rule.
    """
    rules = []
    for itemset in frequentItemsets:
        if len(itemset) < 2:
            # A rule needs at least one item on each side.
            continue
        for consequent in itemset:
            antecedent = list(itemset.copy())
            antecedent.remove(consequent)
            rule_conf = confidence(itemset, antecedent, T)
            if rule_conf >= minConf:
                rules.append((itemset, antecedent, [consequent], rule_conf))
    return rules


In [30]:
# Sweep several confidence thresholds and report how many rules survive each one.
min_conf_values = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]  # Example values
for minConf in min_conf_values:
    print(f"Running Association Rules Generation with minConf: {minConf}")
    # Rules are regenerated from `res` for every threshold; only their count is printed.
    association_rules = genRules(res, minConf, convertGenre, T)
    print(f"Number of Association Rules: {len(association_rules)}")

Running Association Rules Generation with minConf: 0.5
Number of Association Rules: 408
Running Association Rules Generation with minConf: 0.6
Number of Association Rules: 393
Running Association Rules Generation with minConf: 0.7
Number of Association Rules: 364
Running Association Rules Generation with minConf: 0.8
Number of Association Rules: 317
Running Association Rules Generation with minConf: 0.9
Number of Association Rules: 257
Running Association Rules Generation with minConf: 0.95
Number of Association Rules: 195
Running Association Rules Generation with minConf: 0.99
Number of Association Rules: 193


To keep confidence high while still extracting interesting patterns, we choose a minConf of 0.9. The number of generated rules also changes the most around this value (257 rules at 0.9 versus 195 at 0.95), which makes it a natural place to set the cutoff.

In [36]:
# Chosen confidence threshold; regenerate the rule list used by the cells that follow.
minConf = 0.9
association_rules = genRules(res, minConf, convertGenre, T)

In [37]:
# Lookup table: every main genre and every sub-genre -> the main genre it belongs to
# (a main genre maps to itself).
item_to_main_genre = {
    item: parent_genre
    for parent_genre, children in main_genres.items()
    for item in (parent_genre, *children)
}

# Predicate: does a rule link items from entirely different main genres?
def is_strict_cross_genre_rule(rule):
    """Return True when no main genre appears on both sides of ``rule``.

    ``rule[1]`` holds the left-side items and ``rule[2]`` the right-side
    items; each item is looked up in ``item_to_main_genre``.
    """
    antecedent_genres = {item_to_main_genre[item] for item in rule[1]}
    consequent_genres = {item_to_main_genre[item] for item in rule[2]}

    # An empty intersection means the two sides share no main genre.
    shared_genres = antecedent_genres & consequent_genres
    return not shared_genres

# Keep only the rules whose two sides share no main genre
strict_cross_genre_rules = list(filter(is_strict_cross_genre_rule, association_rules))

# Render each surviving rule as one display line
formatted_strict_cross_genre_rules = []
for rule in strict_cross_genre_rules:
    left_side, right_side, conf = convertGenre(rule[1]), convertGenre(rule[2]), rule[3]
    formatted_strict_cross_genre_rules.append(
        f"Rule: {left_side} --> {right_side} | Confidence: {conf:.2f}"
    )

# Show the rendered rules, one per line
for fr in formatted_strict_cross_genre_rules:
    print(fr)

Rule: Rock and EDM and Techno --> Pop | Confidence: 1.00
Rule: EDM and Soundtracks and Film --> Pop | Confidence: 1.00
Rule: Techno and EDM and House --> Pop | Confidence: 1.00
Rule: Instrumental and Classical and EDM --> Pop | Confidence: 0.94
Rule: Latin Music and Classic Rock and Rock and Indie Rock --> Pop | Confidence: 1.00
Rule: Classic Rock and Rock and EDM and Indie Rock --> Pop | Confidence: 0.94
Rule: Film and Classic Rock and Rock and Musicals --> Pop | Confidence: 1.00
Rule: Soundtracks and Film and Classic Rock and Rock --> Pop | Confidence: 0.94
Rule: Classic Rock and Rock and EDM and House --> Pop | Confidence: 0.94
Rule: Indie Rock and Rock and EDM and House --> Pop | Confidence: 0.94
Rule: Soundtracks and Film and Classical and Musicals --> Pop | Confidence: 1.00
Rule: Instrumental and Film and Classical and Musicals --> Pop | Confidence: 1.00
Rule: Alternative Rock and Classic Rock and Rock and Classical and Indie Rock --> Pop | Confidence: 0.94
Rule: Instrumental and