# Last.fm homework
https://www.kaggle.com/ravichaubey1506/lastfm <br>
1. Выбрать данные по странам своей группы (совместно): <br>
    3530203_70101: Germany, Netherlands <br>
    3530203_70102: Belarus, Ukraine, Poland, Russian Federation<br>
    3530903_70301: Sweden, Finland, Norway, Denmark, Iceland<br>
    3530903_70302: Spain, Portugal, France, Italy, Belgium<br>
    
2. Попытаться найти полезные с точки зрения продвижения групп (или еще чего-нибудь) и нетривиальные правила, используя алгоритмы Apriori, FPGrowth, FPMax и всевозможные метрики. Хотя бы 5 правил.
3. Вывести эти правила в отдельных ячейках. 
4. Подумать, как можно было бы использовать полученные правила.

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

In [2]:
# Load the Last.fm export; the path is relative to the working directory.
lastfm_csv = "lastfm.csv"
data = pd.read_csv(lastfm_csv)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


In [3]:
# Countries assigned to this group in the task description.
countries = ['Sweden', 'Finland', 'Norway', 'Denmark', 'Iceland']
countries

['Sweden', 'Finland', 'Norway', 'Denmark', 'Iceland']

In [4]:
# Keep only the rows whose country is one of the selected countries.
in_selected_country = data['country'].isin(countries)
lastfm_df = data.loc[in_selected_country]
# Number of rows that survive the filter.
lastfm_df.shape[0]

30268

In [5]:
# One row per user (first occurrence kept) with that user's country and sex,
# indexed by the user id.
user_columns = ['user', 'country', 'sex']
user_info = (
    lastfm_df[user_columns]
    .drop_duplicates(subset='user')
    .set_index('user')
)
user_info.head()

Unnamed: 0_level_0,country,sex
user,Unnamed: 1_level_1,Unnamed: 2_level_1
5,Finland,m
7,Finland,m
19,Sweden,f
28,Sweden,m
48,Norway,m


In [6]:
# Dummy (one-hot) encoding of artists: one row per user, one column per artist,
# 1 if the user has that artist in the data and 0 otherwise.
# pd.crosstab counts the (user, artist) rows explicitly. It is used instead of
# pivot_table(aggfunc=any) because this frame has no value column to aggregate,
# which makes the pivot_table result depend on pandas internals.
# Counts are capped at 1 so the table stays a 0/1 indicator matrix.
pivoted_artist = pd.crosstab(lastfm_df['user'], lastfm_df['artist']).clip(upper=1)
pivoted_artist.sum()  # number of non-zero entries (users) per artist

artist
...and you will know us by the trail of dead    17
2pac                                            49
3 doors down                                    58
30 seconds to mars                              57
311                                              3
                                                ..
yeah yeah yeahs                                 14
yellowcard                                      15
yo la tengo                                      8
zero 7                                          23
Édith piaf                                      14
Length: 995, dtype: int64

In [7]:
# Dummy (one-hot) encoding of the country: one row per user, one column per
# country, 1 if the user's rows carry that country and 0 otherwise.
# pd.crosstab counts the (user, country) rows explicitly. It is used instead of
# pivot_table(aggfunc=any) because this frame has no value column to aggregate,
# which makes the pivot_table result depend on pandas internals.
# A user appears on several rows, so counts are capped at 1.
pivoted_country = pd.crosstab(lastfm_df['user'], lastfm_df['country']).clip(upper=1)
pivoted_country.sum()  # number of users per country

country
Denmark    119
Finland    510
Iceland     16
Norway     214
Sweden     575
dtype: int64

In [8]:
# Dummy (one-hot) encoding of the user's sex: one row per user, one column per
# sex value, 1 if the user's rows carry that value and 0 otherwise.
# pd.crosstab counts the (user, sex) rows explicitly. It is used instead of
# pivot_table(aggfunc=any) because this frame has no value column to aggregate,
# which makes the pivot_table result depend on pandas internals.
# A user appears on several rows, so counts are capped at 1.
pivoted_sex = pd.crosstab(lastfm_df['user'], lastfm_df['sex']).clip(upper=1)
pivoted_sex.sum()  # number of users per sex value

sex
f     332
m    1102
dtype: int64

In [9]:
# Put the artist, country and sex indicator tables side by side; pd.concat
# aligns the rows on the shared user index.
indicator_frames = [pivoted_artist, pivoted_country, pivoted_sex]
pivoted_all = pd.concat(indicator_frames, axis='columns')
pivoted_all.head()

Unnamed: 0_level_0,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent,65daysofstatic,[unknown],...,yo la tengo,zero 7,Édith piaf,Denmark,Finland,Iceland,Norway,Sweden,f,m
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
19,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
48,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [10]:
# Show the last 13 columns of the combined table (the tail of the artist
# columns followed by the country and sex indicators).
trailing_columns = pivoted_all.columns[-13:]
pivoted_all.loc[:, trailing_columns]

Unnamed: 0_level_0,yann tiersen,yeah yeah yeahs,yellowcard,yo la tengo,zero 7,Édith piaf,Denmark,Finland,Iceland,Norway,Sweden,f,m
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5,0,0,0,0,0,0,0,1,0,0,0,0,1
7,0,0,0,0,0,0,0,1,0,0,0,0,1
19,0,0,0,1,0,0,0,0,0,0,1,1,0
28,0,0,0,0,0,0,0,0,0,0,1,0,1
48,0,0,0,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19582,0,0,0,0,0,0,0,0,0,1,0,0,1
19586,0,0,0,0,0,0,0,0,0,0,1,0,1
19587,0,0,0,0,0,0,1,0,0,0,0,0,1
19605,0,0,0,0,0,0,0,0,0,0,1,1,0


# Apriori

In [11]:
# Frequent artist itemsets with a support of at least 0.03 (share of users).
apriori_artist = apriori(
    pivoted_artist,
    min_support=0.03,
    use_colnames=True,  # report artist names instead of column indices
)
apriori_artist

Unnamed: 0,support,itemsets
0,0.034170,(2pac)
1,0.040446,(3 doors down)
2,0.039749,(30 seconds to mars)
3,0.038354,(50 cent)
4,0.042538,(a perfect circle)
...,...,...
257,0.036960,"(queen, the beatles)"
258,0.032775,"(radiohead, the beatles)"
259,0.036262,"(rammstein, system of a down)"
260,0.033473,"(system of a down, red hot chili peppers)"


In [12]:
# Artist-to-artist rules with confidence of at least 0.4; show the ten most
# confident ones.
apriori_artist_rules = association_rules(
    apriori_artist,
    metric="confidence",
    min_threshold=0.4,
)
artist_rules_by_confidence = apriori_artist_rules.sort_values(by='confidence', ascending=False)
artist_rules_by_confidence.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,(soilwork),(in flames),0.060669,0.154812,0.039052,0.643678,4.157813,0.029659,2.37198
15,(sonata arctica),(nightwish),0.076011,0.133194,0.040446,0.53211,3.995005,0.030322,1.852586
2,(children of bodom),(in flames),0.078103,0.154812,0.039052,0.5,3.22973,0.02696,1.690377
13,(koЯn),(system of a down),0.072524,0.137378,0.036262,0.5,3.639594,0.026299,1.725244
8,(koЯn),(in flames),0.072524,0.154812,0.03417,0.471154,3.043399,0.022943,1.598174
12,(iron maiden),(metallica),0.09205,0.164575,0.043236,0.469697,2.854006,0.028087,1.575374
4,(children of bodom),(nightwish),0.078103,0.133194,0.036262,0.464286,3.485789,0.025859,1.618038
0,(ac/dc),(metallica),0.087866,0.164575,0.040446,0.460317,2.797014,0.025986,1.547994
14,(led zeppelin),(pink floyd),0.073222,0.110181,0.033473,0.457143,4.149005,0.025405,1.63914
6,(the killers),(coldplay),0.105997,0.172245,0.04742,0.447368,2.597273,0.029162,1.497842


In [13]:
# Frequent itemsets over artists, countries and sex together, with a support
# of at least 0.03.
apriori_all = apriori(
    pivoted_all,
    min_support=0.03,
    use_colnames=True,  # report column names instead of column indices
)
apriori_all

Unnamed: 0,support,itemsets
0,0.034170,(2pac)
1,0.040446,(3 doors down)
2,0.039749,(30 seconds to mars)
3,0.038354,(50 cent)
4,0.042538,(a perfect circle)
...,...,...
545,0.054393,"(m, Finland, system of a down)"
546,0.042538,"(m, system of a down, Sweden)"
547,0.048815,"(m, Sweden, the beatles)"
548,0.038354,"(m, Sweden, the killers)"


In [14]:
# Rules over the combined itemsets; the low confidence threshold (0.1) keeps
# most candidate rules so they can be filtered later.
apriori_all_rules = association_rules(
    apriori_all,
    metric="confidence",
    min_threshold=0.1,
)
apriori_all_rules.head()  # first five rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(3 doors down),(m),0.040446,0.76848,0.032078,0.793103,1.032042,0.000996,1.119014
1,(50 cent),(m),0.038354,0.76848,0.032078,0.836364,1.088335,0.002604,1.414846
2,(a perfect circle),(m),0.042538,0.76848,0.036262,0.852459,1.10928,0.003572,1.569193
3,(ac/dc),(metallica),0.087866,0.164575,0.040446,0.460317,2.797014,0.025986,1.547994
4,(metallica),(ac/dc),0.164575,0.087866,0.040446,0.245763,2.797014,0.025986,1.209346


In [15]:
# Rules whose consequent holds more than one item, five most confident first.
has_multi_consequent = apriori_all_rules['consequents'].map(len) > 1
multi_consequent_rules = apriori_all_rules[has_multi_consequent]
multi_consequent_rules.sort_values(by='confidence', ascending=False).head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
396,(amorphis),"(m, Finland)",0.046722,0.273361,0.032775,0.701493,2.566174,0.020003,2.43424
520,(lamb of god),"(m, Finland)",0.052301,0.273361,0.032078,0.613333,2.243673,0.017781,1.879238
475,(soilwork),"(m, in flames)",0.060669,0.132497,0.034868,0.574713,4.337568,0.026829,2.039805
426,(children of bodom),"(m, Finland)",0.078103,0.273361,0.041841,0.535714,1.95973,0.020491,1.565068
607,(soilwork),"(m, Finland)",0.060669,0.273361,0.030683,0.505747,1.850106,0.014099,1.470176


In [16]:
# Rules whose antecedent holds more than one item, five most confident first.
has_multi_antecedent = apriori_all_rules['antecedents'].map(len) > 1
multi_antecedent_rules = apriori_all_rules[has_multi_antecedent]
multi_antecedent_rules.sort_values(by='confidence', ascending=False).head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
504,"(iron maiden, Finland)",(m),0.042538,0.76848,0.040446,0.95082,1.237274,0.007756,4.707578
499,"(iron maiden, nightwish)",(m),0.034868,0.76848,0.032775,0.94,1.223194,0.00598,3.85867
532,"(metallica, nightwish)",(m),0.043933,0.76848,0.041144,0.936508,1.21865,0.007382,3.646444
494,"(iron maiden, metallica)",(m),0.043236,0.76848,0.040446,0.935484,1.217317,0.007221,3.588563
519,"(Finland, lamb of god)",(m),0.034868,0.76848,0.032078,0.92,1.197169,0.005283,2.894003


# FP-growth

In [17]:
# Mine frequent artist itemsets with FP-growth at a support threshold of
# 0.005, lower than the 0.03 used for the Apriori runs.
# The resulting table is not displayed in this cell.
fpgrowth_artist = fpgrowth(
    pivoted_artist,
    min_support=0.005,
    use_colnames=True,  # report artist names instead of column indices
)

In [18]:
# Artist rules with confidence of at least 0.8; show the ten with the highest
# support.
fpgrowth_artist_rules = association_rules(
    fpgrowth_artist,
    metric="confidence",
    min_threshold=0.8,
)
artist_rules_by_support = fpgrowth_artist_rules.sort_values(by='support', ascending=False)
artist_rules_by_support.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
294,"(nas, jay-z)",(kanye west),0.016736,0.08159,0.014644,0.875,10.724359,0.013279,7.34728
295,"(nas, snoop dogg)",(kanye west),0.016039,0.08159,0.01325,0.826087,10.124861,0.011941,5.280858
354,"(ac/dc, nirvana)",(metallica),0.015342,0.164575,0.012552,0.818182,4.971495,0.010027,4.59484
568,"(dark tranquillity, iron maiden)",(in flames),0.013947,0.154812,0.011855,0.85,5.490541,0.009696,5.634589
110,"(jay-z, notorious b.i.g.)",(kanye west),0.013947,0.08159,0.011855,0.85,10.417949,0.010717,6.122734
620,"(children of bodom, cradle of filth)",(in flames),0.014644,0.154812,0.011855,0.809524,5.229086,0.009588,4.437238
64,"(jay-z, eminem)",(kanye west),0.014644,0.08159,0.011855,0.809524,9.921856,0.01066,4.821653
297,"(nas, snoop dogg, kanye west)",(jay-z),0.01325,0.043236,0.011158,0.842105,19.47708,0.010585,6.059507
78,"(killswitch engage, soilwork)",(in flames),0.01325,0.154812,0.011158,0.842105,5.439545,0.009106,5.352859
296,"(nas, jay-z, snoop dogg)",(kanye west),0.011855,0.08159,0.011158,0.941176,11.535445,0.01019,15.612971
