In [1]:
import pickle
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from functions import get_movies_scores, get_infos, get_X
from variable_selection_metrics import user_questions, random_questions, loop_simulation, metrics_simulations
from metrics import random_forest, forward_feature_selection, backward_feature_elimination, random_variable_choice, variable_mean_choice


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Load data

artists = pd.read_csv("data/hetrec2011-lastfm-2k/artists.dat", sep="\t", usecols=['id', 'name', 'url', 'pictureURL'])

with open("data/hetrec2011-lastfm-2k/tags.dat") as f:
    lines = f.readlines()
    ids = []
    values = []
    for i, line in enumerate(lines[1:]):
        ids.append(line.strip().split("\t")[0])
        values.append(line.strip().split("\t")[1])
    tags = pd.DataFrame({'tagID': ids, 'tagValue': values})

user_artists = pd.read_csv("data/hetrec2011-lastfm-2k/user_artists.dat", sep="\t", usecols=['userID', 'artistID', 'weight'])

user_friends = pd.read_csv("data/hetrec2011-lastfm-2k/user_friends.dat", sep="\t", usecols=['userID', 'friendID'])

user_taggedartists = pd.read_csv("data/hetrec2011-lastfm-2k/user_taggedartists.dat", sep="\t", usecols=['userID', 'artistID', 'tagID', 'day', 'month', 'year'])

In [5]:
tags.tagID = tags.tagID.transform(int)

In [6]:
tags.size

23892

In [7]:
user_taggedartists.tagID.unique().size

9749

In [10]:
usefull_tags = user_taggedartists.tagID.unique().copy()

counts = user_taggedartists.tagID.value_counts()

selected = counts[counts > 500].index

selected_tags = tags[[(i in selected) for i in tags.tagID]]

In [12]:
def create_experiment_df(user_artists, tags, user_taggedartists):
    weights = user_artists.weight
    maxw = np.max(weights)
    threshold = np.exp(0.5 * np.log(maxw))
    ratings = np.zeros(weights.size, dtype=int)
    ratings[weights > threshold] = 5
    
    experiment_dict = {
        "item":user_artists.artistID,
        "user":user_artists.userID,
        "rating":ratings,
    }
    
    l = user_artists.artistID.size
    for tag in tags.tagID:
        experiment_dict[int(tag)] = np.zeros(l, dtype=int)
    
    for i in range(user_taggedartists.userID.size):
        experiment_dict[user_taggedartists.tagID.iloc[i]][user_artists.artistID == user_taggedartists.artistID.iloc[i]] += 1
        
    experiment_df = pd.DataFrame(experiment_dict)
    
    return experiment_df

In [9]:
selected_user_taggedartists = user_taggedartists[[(i in selected) for i in user_taggedartists.tagID]]

In [11]:
selected_user_taggedartists.tagID.unique().size

73

In [25]:
sel_exp_df = create_experiment_df(user_artists, selected_tags, selected_user_taggedartists)

In [None]:
for c in get_X(sel_exp_df):
    

In [26]:
sel_exp_df.head()

Unnamed: 0,item,user,rating,1,5,13,14,15,16,18,...,386,387,389,432,508,625,735,824,829,850
0,51,2,5,0,0,1,1,0,38,6,...,0,0,3,0,2,0,0,2,0,3
1,52,2,5,0,0,10,0,7,0,6,...,0,0,0,0,0,0,0,0,0,0
2,53,2,5,0,0,16,16,7,0,31,...,0,0,0,0,1,0,0,3,0,0
3,54,2,5,0,0,8,1,4,0,8,...,0,0,0,0,0,0,0,1,0,0
4,55,2,5,0,0,1,0,0,1,27,...,0,0,0,0,2,2,0,1,0,0


In [16]:
correlations = get_X(sel_exp_df).corr()

In [17]:
correlations

Unnamed: 0,1,5,13,14,15,16,18,21,24,25,...,386,387,389,432,508,625,735,824,829,850
1,1.000000,0.125888,-0.047747,-0.031603,-0.027343,-0.028024,-0.052959,-0.021685,-0.075232,0.049065,...,0.709577,-0.007501,-0.036783,-0.020187,-0.063300,-0.042157,-0.018891,-0.059347,-0.034242,0.053188
5,0.125888,1.000000,-0.026969,-0.014463,-0.012849,-0.017000,-0.032844,-0.010921,-0.037750,-0.024180,...,0.023879,-0.006162,-0.019614,0.021107,-0.042852,-0.018644,-0.020135,-0.037892,-0.016724,-0.024434
13,-0.047747,-0.026969,1.000000,0.510599,0.722213,0.097098,0.423893,0.689510,0.008826,0.093598,...,-0.041835,0.049156,0.147076,-0.035975,0.053110,0.046771,-0.021910,0.458472,0.007708,0.030446
14,-0.031603,-0.014463,0.510599,1.000000,0.339218,0.065259,0.221801,0.197054,-0.046907,0.027438,...,-0.025093,0.015199,0.009625,-0.021308,-0.010199,-0.020229,-0.015167,0.191548,-0.019642,-0.038957
15,-0.027343,-0.012849,0.722213,0.339218,1.000000,-0.017917,0.248044,0.836352,-0.044548,-0.031137,...,-0.020577,-0.016892,-0.022655,-0.016171,-0.040117,-0.017897,-0.027119,0.160936,-0.022201,-0.044428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,-0.042157,-0.018644,0.046771,-0.020229,-0.017897,-0.027635,0.005509,-0.013044,0.293286,0.063551,...,-0.029842,-0.030374,-0.036039,-0.023550,0.253688,1.000000,0.011117,0.121354,0.159134,0.098704
735,-0.018891,-0.020135,-0.021910,-0.015167,-0.027119,0.031214,-0.052552,-0.023143,-0.012479,0.292455,...,0.072608,0.415762,-0.028693,-0.022744,0.017837,0.011117,1.000000,0.005692,0.316203,0.082505
824,-0.059347,-0.037892,0.458472,0.191548,0.160936,0.016016,0.256167,0.187209,0.165563,0.100030,...,-0.058623,0.195231,0.357937,-0.026123,0.268973,0.121354,0.005692,1.000000,0.057223,0.148039
829,-0.034242,-0.016724,0.007708,-0.019642,-0.022201,-0.024025,-0.046170,-0.018150,0.001463,-0.006133,...,-0.014729,0.072808,-0.012325,-0.019751,0.014574,0.159134,0.316203,0.057223,1.000000,0.068940


In [39]:
sed = sel_exp_df.copy()

In [40]:
for c in get_X(sel_exp_df):
    sed[c][sed[c] >= 1] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [41]:
sed

Unnamed: 0,item,user,rating,1,5,13,14,15,16,18,...,386,387,389,432,508,625,735,824,829,850
0,51,2,5,0,0,1,1,0,1,1,...,0,0,1,0,1,0,0,1,0,1
1,52,2,5,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,53,2,5,0,0,1,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0
3,54,2,5,0,0,1,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
4,55,2,5,0,0,1,0,0,1,1,...,0,0,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92829,18726,2100,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92830,18727,2100,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92831,18728,2100,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92832,18729,2100,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
correlations = get_X(sed).corr()

In [43]:
correlations

Unnamed: 0,1,5,13,14,15,16,18,21,24,25,...,386,387,389,432,508,625,735,824,829,850
1,1.000000,0.267245,-0.022812,0.020336,-0.042534,0.030565,-0.031381,0.005266,-0.136111,0.111398,...,0.540296,0.163386,-0.048399,0.016214,0.007768,-0.080949,0.004852,-0.005258,-0.047436,0.108542
5,0.267245,1.000000,-0.069716,-0.028387,-0.026522,-0.055746,-0.077717,-0.030725,-0.119257,-0.030904,...,0.184331,0.000272,-0.048175,-0.011394,-0.091325,-0.049131,-0.043753,-0.071326,-0.036124,-0.035685
13,-0.022812,-0.069716,1.000000,0.402235,0.313836,0.296994,0.288839,0.340909,0.271724,0.100933,...,-0.068988,0.237313,0.306683,-0.089667,0.301461,0.145898,0.014856,0.498617,0.119411,0.168195
14,0.020336,-0.028387,0.402235,1.000000,0.429322,0.193884,0.275532,0.449015,-0.003347,0.121063,...,-0.068904,0.272700,0.142091,-0.057548,0.023957,-0.027034,0.064376,0.251835,0.054133,-0.000098
15,-0.042534,-0.026522,0.313836,0.429322,1.000000,0.019751,0.228595,0.540881,-0.041960,-0.012142,...,-0.053366,0.044689,-0.004237,-0.027226,-0.044646,0.000833,-0.054130,0.199712,-0.039672,-0.067561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,-0.080949,-0.049131,0.145898,-0.027034,0.000833,-0.004929,0.131652,0.040133,0.314688,0.015469,...,-0.097406,-0.086357,-0.027540,0.032253,0.244774,1.000000,0.041317,0.217353,0.180526,0.161423
735,0.004852,-0.043753,0.014856,0.064376,-0.054130,0.137372,-0.094219,-0.068266,0.027489,0.468440,...,0.159877,0.228166,0.031323,-0.022811,0.000327,0.041317,1.000000,0.016052,0.316960,0.160648
824,-0.005258,-0.071326,0.498617,0.251835,0.199712,0.218908,0.213959,0.202782,0.307893,0.119584,...,-0.115699,0.233305,0.246283,-0.025336,0.351457,0.217353,0.016052,1.000000,0.121707,0.236402
829,-0.047436,-0.036124,0.119411,0.054133,-0.039672,0.063988,-0.091926,-0.032670,0.042978,0.115637,...,0.010617,0.126622,0.060928,-0.022593,0.102302,0.180526,0.316960,0.121707,1.000000,0.166959


In [48]:
to_remove = set()
X = get_X(sel_exp_df)
for i in correlations.columns:
    for j in correlations.columns:
        if i > j and correlations.loc[i][j] > 0.6:
            if X.loc[i].sum() > X.loc[j].sum():
                to_remove.add(j)
            else :
                to_remove.add(i)

In [49]:
to_remove

{78, 84, 102, 130, 735}

In [None]:


selected_tags_v2 = selected_tags[[(i not in to_remove) for i in selected_tags.tagID]]

selected_user_taggedartists_v2 = selected_user_taggedartists[[(i not in to_remove) for i in selected_user_taggedartists.tagID]]

selected_user_taggedartists_v2.tagID.unique().size

sel_exp_df_v2 = create_experiment_df(user_artists, selected_tags_v2, selected_user_taggedartists_v2)

sel_exp_df_v2.head()

get_X(sel_exp_df)

correlations.loc[24].sort_values()

X.loc[24].values

X.loc[39].values

tags[[(i in to_remove) for i in tags.tagID]]