# Real Data: [LastFM](https://www.last.fm/es/)

In [16]:
from fim import * #fim=frequent itemset mining
import pandas as pd

In [5]:
# Download the LastFM listening dataset and report its (rows, columns) dimensions.
LASTFM_CSV_URL = "https://www.biz.uiowa.edu/faculty/jledolter/DataMining/lastfm.csv"
lastfm = pd.read_csv(LASTFM_CSV_URL)
lastfm.shape

(289955, 4)

In [6]:
# Preview up to the first 20 rows of the raw data.
lastfm.head(20)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
5,1,schandmaul,f,Germany
6,1,edguy,f,Germany
7,1,jack johnson,f,Germany
8,1,eluveitie,f,Germany
9,1,the killers,f,Germany


## Data Pre-processing

In [7]:
# Drop the unused columns: only the user id and the artist name are kept.
lastfm = lastfm.loc[:, ['user', 'artist']]

In [8]:
# Remove duplicated rows, then check the resulting (rows, columns) dimensions.
lastfm = lastfm.drop_duplicates()
lastfm.shape

(289953, 2)

In [9]:
# Preview the first rows after column selection and de-duplication.
lastfm.head()

Unnamed: 0,user,artist
0,1,red hot chili peppers
1,1,the black dahlia murder
2,1,goldfrapp
3,1,dropkick murphys
4,1,le tigre


In [10]:
# Count the distinct users, then show their ids.
unique_users = lastfm['user'].unique()
print(len(unique_users))
print(unique_users)

15000
[    1     3     4 ... 19715 19717 19718]


In [11]:
# Count the distinct artists, then show their names.
unique_artists = lastfm['artist'].unique()
print(len(unique_artists))
print(unique_artists)

1004
['red hot chili peppers' 'the black dahlia murder' 'goldfrapp' ...
 'immortal technique' 'delerium' 'finch']


## Transforming the data into a transactional dataset

In [12]:
# Build the transactional dataset: one transaction (list of artists) per user.
# A single groupby pass replaces the per-user boolean filter, which rescanned
# the whole frame once for every distinct user.
# sort=False keeps users in order of first appearance (the same order as
# lastfm['user'].unique()), and each list keeps the artists in row order.
trans = [
    artists.tolist()
    for _, artists in lastfm.groupby('user', sort=False)['artist']
]
print(len(trans))

15000


In [15]:
# Show only the first few transactions; displaying the whole list would
# write one entry per user into the cell output.
trans[:3]

[['red hot chili peppers',
  'the black dahlia murder',
  'goldfrapp',
  'dropkick murphys',
  'le tigre',
  'schandmaul',
  'edguy',
  'jack johnson',
  'eluveitie',
  'the killers',
  'judas priest',
  'rob zombie',
  'john mayer',
  'the who',
  'guano apes',
  'the rolling stones'],
 ['devendra banhart',
  'boards of canada',
  'cocorosie',
  'aphex twin',
  'animal collective',
  'atmosphere',
  'joanna newsom',
  'air',
  'portishead',
  'massive attack',
  'broken social scene',
  'arcade fire',
  'plaid',
  'prefuse 73',
  'm83',
  'the flashbulb',
  'pavement',
  'goldfrapp',
  'amon tobin',
  'sage francis',
  'four tet',
  'max richter',
  'autechre',
  'radiohead',
  'neutral milk hotel',
  'beastie boys',
  'aesop rock',
  'mf doom',
  'the books'],
 ['tv on the radio',
  'tool',
  'kyuss',
  'dj shadow',
  'air',
  'a tribe called quest',
  'the cinematic orchestra',
  'beck',
  'bon iver',
  'röyksopp',
  'bonobo',
  'the decemberists',
  'snow patrol',
  'battles',
  't

In [13]:
# Print a handful of sample transactions.
# The number printed is the position inside `trans`, not the original user id.
for position in (0, 10, 100, 1000, 5000, 10000):
    print('El usuario {} escucha a {}'.format(position, trans[position]))

El usuario 0 escucha a ['red hot chili peppers', 'the black dahlia murder', 'goldfrapp', 'dropkick murphys', 'le tigre', 'schandmaul', 'edguy', 'jack johnson', 'eluveitie', 'the killers', 'judas priest', 'rob zombie', 'john mayer', 'the who', 'guano apes', 'the rolling stones']
El usuario 10 escucha a ['sonata arctica', 'infected mushroom', 'iron maiden', 'creedence clearwater revival', 'nightwish', 'mindless self indulgence', 'electric light orchestra', 'john williams', 'michael jackson', 'fleetwood mac', 'aphex twin']
El usuario 100 escucha a ['queens of the stone age', 'pantera', 'nine inch nails', 'no doubt', 'down', 'guns n roses', 'the smashing pumpkins', 'iron maiden', 'guano apes', 'the offspring', 'metallica', 'fatboy slim', 'pearl jam', 'pink', 'alice in chains', 'marilyn manson', 'slipknot', 'eric clapton', 'iced earth', 'black sabbath', 'red hot chili peppers', 'u2', 'as i lay dying', 'godsmack', 'avril lavigne', 'led zeppelin', 'john williams', 'in flames', 'isis', 'stone 

## Mining itemsets

In [17]:
# Extract all frequent itemsets with a minimum support of 1% (supp=1)
# and at least 2 items per itemset (zmin=2).
fpgrowth(trans, supp=1, zmin=2)

[(('the beatles', 'radiohead'), 873),
 (('coldplay', 'radiohead'), 819),
 (('coldplay', 'the beatles', 'radiohead'), 293),
 (('coldplay', 'the beatles'), 665),
 (('red hot chili peppers', 'radiohead'), 474),
 (('red hot chili peppers', 'the beatles', 'radiohead'), 187),
 (('red hot chili peppers', 'the beatles'), 508),
 (('red hot chili peppers', 'coldplay', 'radiohead'), 222),
 (('red hot chili peppers', 'coldplay', 'the beatles'), 201),
 (('red hot chili peppers', 'coldplay'), 579),
 (('muse', 'radiohead'), 645),
 (('muse', 'the beatles', 'radiohead'), 207),
 (('muse', 'the beatles'), 408),
 (('muse', 'coldplay', 'radiohead'), 275),
 (('muse', 'coldplay', 'the beatles'), 184),
 (('muse', 'coldplay'), 582),
 (('muse', 'red hot chili peppers', 'radiohead'), 168),
 (('muse', 'red hot chili peppers', 'coldplay'), 181),
 (('muse', 'red hot chili peppers'), 406),
 (('metallica', 'radiohead'), 263),
 (('metallica', 'the beatles'), 336),
 (('metallica', 'coldplay'), 261),
 (('metallica', 're

## Mining Association Rules

### support 1% and confidence 40%

In [18]:
# Mine association rules (target='r') with supp=1 and conf=40, i.e. the 1% support
# and 40% confidence named in the section heading.
# Each result tuple is (consequent, antecedent, <report values>); with report='aSC'
# the displayed values are the absolute support, the support in percent and the
# confidence in percent.
# NOTE(review): the output contains rules whose own support is below 1%
# (e.g. 92 users = 0.61%). Presumably pyfim applies the minimum support to the
# antecedent only unless mode='o' is passed -- confirm against the pyfim docs.
ar = fpgrowth(trans, target='r', supp=1, conf=40, report='aSC')
ar
# Worked example for the rule ('coldplay', 'the beatles') -> 'radiohead':
#conf(('coldplay', 'the beatles') -> 'radiohead') = freq(('coldplay', 'the beatles') -> 'radiohead') / freq(('coldplay', 'the beatles'))
#conf(('coldplay', 'the beatles') -> 'radiohead') = 293 / 665 = 0.4406

[('radiohead',
  ('coldplay', 'the beatles'),
  293,
  1.9533333333333334,
  44.06015037593985),
 ('coldplay',
  ('red hot chili peppers', 'radiohead'),
  222,
  1.48,
  46.835443037974684),
 ('radiohead',
  ('red hot chili peppers', 'coldplay', 'the beatles'),
  92,
  0.6133333333333334,
  45.77114427860697),
 ('the beatles',
  ('red hot chili peppers', 'coldplay', 'radiohead'),
  92,
  0.6133333333333334,
  41.44144144144144),
 ('coldplay',
  ('red hot chili peppers', 'the beatles', 'radiohead'),
  92,
  0.6133333333333334,
  49.19786096256685),
 ('radiohead', ('muse', 'the beatles'), 207, 1.38, 50.73529411764706),
 ('radiohead',
  ('muse', 'coldplay'),
  275,
  1.8333333333333333,
  47.250859106529205),
 ('coldplay',
  ('muse', 'radiohead'),
  275,
  1.8333333333333333,
  42.63565891472868),
 ('radiohead',
  ('muse', 'coldplay', 'the beatles'),
  96,
  0.64,
  52.17391304347826),
 ('coldplay',
  ('muse', 'the beatles', 'radiohead'),
  96,
  0.64,
  46.3768115942029),
 ('coldplay',
 

In [19]:
# Tabulate the mined rules and rank them by confidence (highest first).
# Column names go to the constructor instead of being assigned afterwards, so an
# empty rule list yields an empty, correctly labelled frame instead of a
# length-mismatch error.
# 'Antecendente' is kept as spelled: it is a column key other code may rely on.
df_ar = pd.DataFrame(
    ar,
    columns=['Consecuente', 'Antecendente', 'Freq', 'Freq(%)', 'Conf'],
)
df_ar.sort_values(by='Conf', ascending=False)

Unnamed: 0,Consecuente,Antecendente,Freq,Freq(%),Conf
1556,coldplay,"(keane, snow patrol)",118,0.786667,75.641026
440,radiohead,"(sigur rós, the cure)",106,0.706667,69.736842
1085,coldplay,"(snow patrol, oasis)",114,0.760000,69.512195
1550,coldplay,"(keane, muse)",115,0.766667,68.862275
1552,coldplay,"(keane, the killers)",143,0.953333,67.772512
...,...,...,...,...,...
2088,death cab for cutie,"(say anything,)",68,0.453333,40.000000
57,radiohead,"(nirvana, pink floyd)",122,0.813333,40.000000
1265,belle and sebastian,"(sufjan stevens, the shins)",82,0.546667,40.000000
2096,the beatles,"(cat stevens,)",70,0.466667,40.000000


### support 0.05% and confidence 40%

In [20]:
# Re-mine the rules with a much lower minimum support (0.05%) and the same 40%
# minimum confidence. report='aC' yields absolute support and confidence in percent.
# This overwrites the `ar` produced with supp=1.
ar = fpgrowth(
    trans,
    target='r',
    supp=0.05,
    conf=40,
    report='aC',
)
# Report how many rules were generated.
print('Hay {} reglas'.format(len(ar)))

Hay 12688286 reglas


In [21]:
# Show only the first rules: the full list holds millions of entries
# (see the count printed by the previous cell) and must not be rendered whole.
ar[:20]

[('radiohead', ('coldplay', 'the beatles'), 293, 44.06015037593985),
 ('coldplay', ('red hot chili peppers', 'radiohead'), 222, 46.835443037974684),
 ('radiohead',
  ('red hot chili peppers', 'coldplay', 'the beatles'),
  92,
  45.77114427860697),
 ('the beatles',
  ('red hot chili peppers', 'coldplay', 'radiohead'),
  92,
  41.44144144144144),
 ('coldplay',
  ('red hot chili peppers', 'the beatles', 'radiohead'),
  92,
  49.19786096256685),
 ('radiohead', ('muse', 'the beatles'), 207, 50.73529411764706),
 ('radiohead', ('muse', 'coldplay'), 275, 47.250859106529205),
 ('coldplay', ('muse', 'radiohead'), 275, 42.63565891472868),
 ('radiohead', ('muse', 'coldplay', 'the beatles'), 96, 52.17391304347826),
 ('coldplay', ('muse', 'the beatles', 'radiohead'), 96, 46.3768115942029),
 ('coldplay', ('muse', 'the beatles'), 184, 45.09803921568628),
 ('radiohead', ('muse', 'red hot chili peppers'), 168, 41.37931034482759),
 ('radiohead',
  ('muse', 'red hot chili peppers', 'the beatles'),
  60,
 