In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from scipy.sparse import lil_matrix

In [2]:
# Load the processed play counts; the null check below shows the columns
# User_ID, Artist and Plays.
df = pd.read_csv('./datasources/processed/user_artist_plays.csv')

In [3]:
# Count missing values per column.
df.isna().sum()

User_ID     0
Artist     35
Plays       0
dtype: int64

In [4]:
# Drop every row that has a missing value (in the recorded run above these are
# the 35 rows without an Artist).
df = df.dropna()

In [5]:
# Number of distinct users in the data.
user_cnt = df['User_ID'].nunique()
print('No. Users: {}'.format(user_cnt))

# Per-user profile (rows with an artist, total plays), summarised across users.
(
    df.groupby('User_ID')
      .agg({'Artist': ['count'], 'Plays': ['sum']})
      .describe()
)

No. Users: 66949


Unnamed: 0_level_0,Artist,Plays
Unnamed: 0_level_1,count,sum
count,66949.0,66949.0
mean,49.832514,12733.677262
std,7.654471,17679.939268
min,1.0,1.0
25%,47.0,3306.0
50%,49.0,7716.0
75%,52.0,15604.0
max,166.0,420950.0


In [6]:
def createLookup(values):
    """Build forward and reverse lookups between values and their positions.

    Parameters
    ----------
    values : iterable
        Items to index, in the order that defines their ids. Any iterable is
        accepted (list, pandas Series with any index, iterator, ...); it is
        materialised once so one-shot iterators work too.

    Returns
    -------
    tuple(dict, dict)
        ``(value_to_idx, idx_to_value)``. ``idx_to_value`` maps each position
        ``0..n-1`` to the item at that position. ``value_to_idx`` maps each
        item to its position; if an item occurs more than once, its last
        position wins.
    """
    # Enumerate positionally rather than indexing with values[i]: on a pandas
    # Series, values[i] is a label lookup and only coincides with the position
    # when the index happens to be 0..n-1.
    items = list(values)
    value_to_idx = {value: idx for idx, value in enumerate(items)}
    idx_to_value = dict(enumerate(items))

    return value_to_idx, idx_to_value

## Create User ID lookup values: users are ranked by total plays, so index 0
## is the user with the most plays (the top-user filter is applied in the
## next cell) ##
users = (
    df.groupby('User_ID')['Plays']
      .sum()
      .sort_values(ascending=False)
      .reset_index()['User_ID']
)

user_to_idx, idx_to_user = createLookup(users)

In [7]:
## Keep only the TOP_N_USERS users with the most total plays ##
TOP_N_USERS = 10000

# user_to_idx ranks users by total plays (0 = most plays), so the rank doubles
# as the filter key. map() does the dictionary lookup without a Python-level
# lambda, and assign() returns a new frame instead of writing a column into
# the frame returned by dropna(), which can raise SettingWithCopyWarning on
# some pandas versions.
df = df.assign(U_ID=df.User_ID.map(user_to_idx))
# map() yields NaN for an id that is not in the lookup; fail loudly instead of
# letting the comparison below silently drop those rows.
assert df['U_ID'].notna().all(), 'User_ID missing from user_to_idx'
df = df[df['U_ID'] < TOP_N_USERS]
user_cnt = df.User_ID.nunique()

print ('No. Users: {}'.format(user_cnt))

No. Users: 10000


In [8]:
# Number of distinct artists in the data.
artist_cnt = df['Artist'].nunique()
print('No. Artists: {}'.format(artist_cnt))

# NOTE(review): ptiles is built here but not used anywhere in this cell, so
# describe() below reports its default percentiles. Pass
# percentiles=ptiles to describe() if the upper tail was the intent.
ptiles = [x/100 for x in range(90,100)]
(
    df.groupby('Artist')
      .agg({'User_ID': ['count'], 'Plays': ['sum']})
      .describe()
)

No. Artists: 38553


Unnamed: 0_level_0,User_ID,Plays
Unnamed: 0_level_1,count,sum
count,38553.0,38553.0
mean,13.243561,11040.39
std,67.029329,67581.96
min,1.0,1.0
25%,1.0,400.0
50%,1.0,964.0
75%,4.0,3151.0
max,3485.0,5000745.0


In [9]:
## Create Artist lookups: artists are ranked by total plays, so index 0 is
## the most-played artist ##
artists = (
    df.groupby('Artist')['Plays']
      .sum()
      .sort_values(ascending=False)
      .reset_index()['Artist']
)
artist_to_idx, idx_to_artist = createLookup(artists)

In [10]:
## Filter Data to Top 1000 artists ##
TOP_N_ARTISTS = 1000

# df is the result of a boolean filter at this point, so writing a column into
# it with df['A_ID'] = ... triggers pandas' SettingWithCopyWarning. assign()
# returns a new frame instead, and map() does the dictionary lookup without a
# Python-level lambda.
df = df.assign(A_ID=df.Artist.map(artist_to_idx))
# map() yields NaN for an artist that is not in the lookup; fail loudly
# instead of letting the comparison below silently drop those rows.
assert df['A_ID'].notna().all(), 'Artist missing from artist_to_idx'
df = df[df['A_ID'] < TOP_N_ARTISTS]

artist_cnt = df.Artist.nunique()
print ('No. Artists: {}'.format(artist_cnt))

No. Artists: 1000


In [11]:
## Pivot User/Artist data into a dense artists x users table (it is converted
## to a sparse matrix when saved) ##
# Sum plays per (user, artist) pair first: pivot() needs each index/column
# pair to be unique.
df = df.groupby(['U_ID', 'A_ID'], as_index=False)['Plays'].sum()
pivoted_df = (
    df.pivot(index='A_ID', columns='U_ID', values='Plays')
      .fillna(0)  # a user with no plays for an artist becomes 0
)

In [12]:
from sklearn.preprocessing import MinMaxScaler

def minmax(df):
    """Min-max scale ``df`` with scikit-learn's ``MinMaxScaler``.

    Parameters
    ----------
    df : array-like of shape (n_rows, n_columns)
        Data to scale, e.g. the pivoted artists x users play-count table.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``df``. With MinMaxScaler's default
        settings each column is rescaled independently to the range [0, 1].
    """
    # Operate on the argument rather than on the notebook-level pivoted_df,
    # so the function scales whatever the caller passes in.
    scaler = MinMaxScaler()
    scaled_df = scaler.fit_transform(df)
    return scaled_df

# Scaled artists x users play counts; this is what gets saved as the sparse
# artist/user matrix below.
artist_by_user = minmax(pivoted_df)

In [13]:
## Persist the lookup tables and the scaled artist x user matrix ##
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# make scipy.sparse available on every SciPy version.
import scipy.sparse

# (destination path, lookup dict) pairs, pickled one after another.
lookup_files = (
    ('./datasources/artist_to_idx.pkl', artist_to_idx),
    ('./datasources/user_to_idx.pkl', user_to_idx),
    ('./datasources/idx_to_artist.pkl', idx_to_artist),
    ('./datasources/idx_to_user.pkl', idx_to_user),
)
for path, lookup in lookup_files:
    with open(path, 'wb') as f:
        pickle.dump(lookup, f)

# Store the scaled matrix in compressed sparse column format.
A_U_sparse = scipy.sparse.csc_matrix(artist_by_user)

# NOTE(review): the matrix is written under ./data/ while the lookups go to
# ./datasources/ - confirm that the different directory is intended.
scipy.sparse.save_npz('./data/artist_user_mtrx.npz', A_U_sparse)

In [None]:
## Clean up (optional): uncomment the command below to delete the processed data directory ##
# ! rm -rf /home/ec2-user/SageMaker/artist_recommendation_KNN/datasources/processed