In [None]:
import pandas as pd
import pylab as plt
import seaborn
from sklearn.linear_model import LinearRegression
import numpy.random as nprnd
import random
import json
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
from scipy.sparse import coo_matrix
from sklearn.decomposition import LatentDirichletAllocation
import lda

# Let pandas display up to 500 columns before it truncates a DataFrame.
pd.set_option('display.max_columns', 500)
# IPython/Jupyter magic (not plain Python): draw matplotlib figures inline.
%matplotlib inline

# Data Gathering and Preparation

## Data preprocessing and data integrity checks

### Load the raw tab-separated play log; the file carries no header row
df = pd.read_csv("music.tsv", sep='\t', encoding='utf-8', header=None)
### Name the four positional columns
df.columns = ['userID', 'playerID', 'playerName', 'playNum']
### Discard every row that contains a missing value
df.dropna(inplace=True)
### Swap the raw user/artist identifiers for dense integer codes: the inverse
### indices returned by np.unique number the sorted distinct values from 0
_, user_id = np.unique(df['userID'], return_inverse=True)
_, player_id = np.unique(df['playerID'], return_inverse=True)
df['userID'] = user_id.astype(np.int32)
df['playerID'] = player_id.astype(np.int32)
df['playNum'] = df['playNum'].astype(np.int32)

### Lookup table playerID -> playerName (first name seen per ID), sorted by ID;
### once it exists the name column is no longer needed in the main frame
s_player = (
    df[['playerID', 'playerName']]
    .drop_duplicates(subset='playerID')
    .set_index('playerID')
    .sort_index()
)
del df['playerName']
df.head()

### Integrity check: print the maximum of every column, then the minimum
for stat in ('max', 'min'):
    for column in ('userID', 'playerID', 'playNum'):
        print(getattr(df[column], stat)())

## Using SQL

from sqlalchemy import create_engine
import sqlite3

### Copy the play log into a throwaway in-memory SQLite database.
engine = create_engine('sqlite:///:memory:')
df.to_sql('data', engine)
### Every query below runs through `engine`. The extra
### sqlite3.connect("data.db") that used to sit here opened a second,
### unrelated on-disk database that was never queried or closed, so it only
### left an empty data.db file and a leaked handle behind; it has been removed.

### Number of rows recorded for each user. COUNT(*) counts rows, it does not
### add up playNum (presumably one row per user/artist pair -- verify).
pd.read_sql_query('SELECT COUNT(*) FROM data GROUP BY userID', engine)
### Number of rows recorded for each artist, largest first.
pd.read_sql_query('SELECT playerID,COUNT(*) AS number FROM data GROUP BY playerID ORDER BY number DESC', engine)

## Feature Engineering
# Pivot the play log into a sparse user x artist matrix: one row per user,
# one column per artist, and the stored values are the play counts.
user_item_sp_mat = coo_matrix(
    (df['playNum'], (df['userID'], df['playerID'])),
    shape=(df['userID'].max() + 1, df['playerID'].max() + 1),
    dtype=np.int32,
)


# Model (LDA)
## This is an unsupervised problem. Latent Dirichlet Allocation (LDA) is used
## for dimensionality reduction, which here acts as a soft clustering.
## The artists (about 160k according to the original notes) are grouped into
## 20 topics, with the users' taste as the latent variable.

### Fit the LDA model
# NOTE: this rebinds the name `lda`, hiding the `lda` module imported at the
# top of the file.
lda = LatentDirichletAllocation(n_components=20, max_iter=10, random_state=0)
lda.fit(user_item_sp_mat)

### Print the number of entries in each topic's weight vector.
for i in range(lda.n_components):
    print(lda.components_[[i]].size)
### Print the variance of each topic's weights, divided by 100000000 to keep
### the numbers readable. The loop bound comes from the model so that every
### topic is covered (the previous range(0, 19) skipped the last one).
for i in range(lda.n_components):
    print(np.cov(lda.components_[[i]])/100000000)

## Interpretations for the clusters
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic of a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors (one weight per feature).
        feature_names: Indexable mapping a feature index to its label.
        n_top_words: How many labels to list per topic.

    One line ``Topic #<k>: <label>, <label>, ...`` is printed per topic,
    labels ordered from largest to smallest weight, followed by a single
    blank line after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so walk it backwards from the end.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        labels = [feature_names[idx] for idx in ranked]
        print("Topic #%d: " % topic_no + ", ".join(labels))
    print()
print_top_words(lda, s_player.playerName, 5)

## Investigate properties of those clusters
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; use the standalone joblib package and fall back to the
# vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
### Persist the fitted model to disk
joblib.dump(lda, 'lda.pkl')
### Shape of the topic/artist weight matrix
lda.components_.shape
### Per-user topic distribution obtained from the fitted model
user_components = lda.transform(user_item_sp_mat)
user_components.shape

## How to use the model: the input is an artist's name and the output is the artist(s) in the same style.
def getName(name, n_candidates=9):
    """Print artists that share the dominant topic of the artist *name*.

    Reads the module-level ``s_player`` (artist names, where row position
    equals the column index of ``lda.components_``) and the fitted ``lda``
    model.

    For every row of ``s_player`` whose ``playerName`` equals *name*, the
    topic with the largest weight for that artist is selected and the
    ``n_candidates`` highest-weighted artists of that topic are printed,
    strongest first, leaving out any whose name equals *name*.

    Args:
        name: Artist name to look up (exact match).
        n_candidates: How many of the topic's top artists to consider.

    Returns:
        None. Results are printed; nothing is printed when *name* is unknown.
    """
    for i, known_name in enumerate(s_player.playerName):
        if known_name != name:
            continue
        topic = lda.components_[:, i].argmax()
        # Rank once per match, strongest first. Indexing the ascending
        # argsort with -j for j = 0 used to pick element 0, i.e. the
        # weakest artist of the topic, as the first "similar" artist.
        ranked = lda.components_[topic].argsort()[::-1][:n_candidates]
        for artist_idx in ranked:
            candidate = s_player.iloc[artist_idx].playerName
            if candidate != name:
                print(candidate)
## Example: print the artists that share a topic with 'bad religion'
getName('bad religion')

## Cluster stability
### Work on the first 100000 rows only, because the original dataset is too large
sample = df.iloc[0:100000]
### Pivot the sample into a sparse matrix: one row per user, one column per
### artist, and the stored values are the play counts.
user_item_sp_mat_1 = coo_matrix((sample.playNum, (sample.userID, sample.playerID)), (sample.userID.max()+1, sample.playerID.max()+1), dtype=np.int32)

### Fit five models with identical settings on the same sample. No
### random_state is passed, so the runs are not seeded (presumably on
### purpose, to see how much the result varies between runs).
### The transform() calls that used to follow each fit were dropped: their
### results were discarded, and the sections that need the projections
### compute them themselves.

### Run 1 (also print its top artists per topic)
lda_2 = LatentDirichletAllocation(n_components=20, max_iter=10).fit(user_item_sp_mat_1)
print_top_words(lda_2, s_player.playerName, 5)

### Runs 2 to 5, fitted in this order
lda_1, lda_3, lda_4, lda_5 = (
    LatentDirichletAllocation(n_components=20, max_iter=10).fit(user_item_sp_mat_1)
    for _ in range(4)
)

### Choose topic number
# NOTE(review): this section read `lda_11`, `lda_12` and `lda_13`, none of
# which is defined anywhere in this file, so it raised NameError. They are
# fitted here so the section runs on its own. The topic counts 10, 30 and 40
# are an ASSUMPTION chosen to bracket the 20 topics of `lda_2`; replace them
# with the values that were actually intended.
lda_11 = LatentDirichletAllocation(n_components=10, max_iter=10).fit(user_item_sp_mat_1)
lda_12 = LatentDirichletAllocation(n_components=30, max_iter=10).fit(user_item_sp_mat_1)
lda_13 = LatentDirichletAllocation(n_components=40, max_iter=10).fit(user_item_sp_mat_1)

# Score of each model: the sum of all entries of the covariance matrix of its
# projection of the sample.
y11 = np.sum(sum(np.cov(lda_11.transform(user_item_sp_mat_1))))
y12 = np.sum(sum(np.cov(lda_2.transform(user_item_sp_mat_1))))
y13 = np.sum(sum(np.cov(lda_12.transform(user_item_sp_mat_1))))
y14 = np.sum(sum(np.cov(lda_13.transform(user_item_sp_mat_1))))
y1_sample = [y11, y12, y13, y14]
# The x axis is the position of the model in the list, not its topic count.
x1_sample = [1, 2, 3, 4]
plt.plot(x1_sample, y1_sample)
plt.show()

### Plot the stability
# Project the sample through each fitted model exactly once. The pairwise
# scores below previously re-ran transform() for every use (14 inference
# passes in total for 5 models).
topics_1 = lda_1.transform(user_item_sp_mat_1)
topics_2 = lda_2.transform(user_item_sp_mat_1)
topics_3 = lda_3.transform(user_item_sp_mat_1)
topics_4 = lda_4.transform(user_item_sp_mat_1)
topics_5 = lda_5.transform(user_item_sp_mat_1)

# Pairwise score: the sum of all entries of the joint covariance matrix of
# two projections. (There is no y4; the numbering is kept as it was.)
y1 = np.sum(sum(np.cov(topics_5, topics_3)))
y2 = np.sum(sum(np.cov(topics_5, topics_2)))
y3 = np.sum(sum(np.cov(topics_5, topics_1)))
y5 = np.sum(sum(np.cov(topics_2, topics_1)))
y6 = np.sum(sum(np.cov(topics_3, topics_1)))
y7 = np.sum(sum(np.cov(topics_3, topics_2)))
y8 = np.sum(sum(np.cov(topics_4, topics_2)))
y = [y1, y2, y3, y5, y6, y7, y8]
x = [1, 2, 3, 4, 5, 6, 7]
plt.plot(x, y)
plt.show()

# Extra interesting idea (using a neural network for non-linear dimensionality reduction)
## Feed a user's per-artist features into a neural network and take its n-dimensional output as latent features that represent the style
## of the user. Do the same for each artist to get an output that represents the style of the artist. The inner product of these latent features models the play count.
## This process is similar to LDA, but it is non-linear. We hope this kind of non-linear model can uncover deeper relationships between users and artists.