In [1]:
#Fitting exposure matrix using PCA
from sklearn.decomposition import PCA
from sklearn import preprocessing






#Row represent userIds, Columns represent songIds
#dim are number of latent factors
#exposureMatrix is numpy matrix, with row represents every user and column representing every feature
def GetFactorsForRowsAndColumns(dim, exposureMatrix):
    Xscaler = preprocessing.StandardScaler(with_std=False).fit(exposureMatrix)
    Xscaled = Xscaler.fit_transform(exposureMatrix)
    pca = PCA(n_components=dim)
    pca.fit(Xscaled)
    rowFactors = pca.fit_transform(Xscaled)
    colFactors = pca.components_.T

    #Shape of row factors will be (num of rows) x (latent dim)
    #Shape of column factors will be (num of cols) x (latent dim)
    
    return rowFactors, colFactors



  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
import os
import sys
import pickle
import warnings
import numpy as np 
import pandas as pd 
from ast import literal_eval
import matplotlib.pyplot as plt
from scipy import sparse, stats
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.model_selection import train_test_split

randseed = 29266137

In [73]:
def PreprocessData():
    df = pd.read_csv('/home/uk2051/ML_Project/songsDataset.csv')
    df = df.rename({"\'userID\'": "userId", "\'songID\'": "movieId", "\'rating\'": "rating"}, axis='columns')
    
    df = df.drop(labels = range(10000,2000000), axis = 0)
    songIntCode, songUniques = pd.factorize(df['movieId'], sort=True) #Reindexing songs ids
    df['movieId'] = songIntCode
    return df

In [74]:
df = PreprocessData()

In [75]:
df.head()

Unnamed: 0,userId,movieId,rating
0,0,382,5
1,0,448,4
2,0,1185,4
3,0,1891,5
4,0,4466,5


In [78]:
def exposure_data(df, train_size=0.75):
    user_to_row = {}
    movie_to_column = {}
    
    uniq_users = np.unique(df['userId'])
    uniq_movies = np.unique(df['movieId'])

    for i, user_id in enumerate(uniq_users):
        user_to_row[user_id] = i

    for j, movie_id in enumerate(uniq_movies):
        movie_to_column[movie_id] = j
    
    data = []
    n_users = len(uniq_users)
    n_movies = len(uniq_movies)
    for row in df.iterrows():
      user, movie = row[1][0], row[1][1]
      data.append((user_to_row[user], movie_to_column[movie], 1))

    exposure_df = pd.DataFrame(data, columns =['userid', 'movieid', 'a'])
    #Train-test split
    df_copy = exposure_df.copy()
    train_set = df_copy.sample(frac=train_size, random_state=0)
    test_set = df_copy.drop(train_set.index)
    
    return exposure_df, train_set, test_set, n_users, n_movies, user_to_row, movie_to_column


exposure_df, train, test, n_users, n_movies, user_to_row, movie_to_column = exposure_data(df)

In [79]:
exposure_df.head()

Unnamed: 0,userid,movieid,a
0,0,382,1
1,0,448,1
2,0,1185,1
3,0,1891,1
4,0,4466,1


In [81]:
a_matrix = sparse.coo_matrix((exposure_df['a'],(exposure_df['userid'],exposure_df['movieid'])),shape=(n_users,n_movies))

In [82]:
a_matrix.shape
print(type(a_matrix))
a_matrix = a_matrix.todense()
print(type(a_matrix))
a_matrix = a_matrix.tolist()
print(type(a_matrix))

<class 'scipy.sparse.coo.coo_matrix'>
<class 'numpy.matrix'>
<class 'list'>


In [127]:
userFactors, songsFactor = GetFactorsForRowsAndColumns(20, a_matrix)

In [128]:
print(userFactors.shape)

(1000, 20)


In [129]:
def get_ratings_matrix(df, train_size=0.75):
    user_to_row = {}
    movie_to_column = {}
    df_values = df.values
    n_dims = 10
    parameters = {}
    
    uniq_users = np.unique(df_values[:, 0])
    uniq_movies = np.unique(df_values[:, 1])

    for i, user_id in enumerate(uniq_users):
        user_to_row[user_id] = i

    for j, movie_id in enumerate(uniq_movies):
        movie_to_column[movie_id] = j
    
    n_users = len(uniq_users)
    n_movies = len(uniq_movies)
    
    R = np.zeros((n_users, n_movies))
    
    df_copy = df.copy()
    train_set = df_copy.sample(frac=train_size, random_state=0)
    test_set = df_copy.drop(train_set.index)
    
    for index, row in train_set.iterrows():
        i = user_to_row[row.userId]
        j = movie_to_column[row.movieId]
        R[i, j] = row.rating

    return R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column

In [130]:
R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column = get_ratings_matrix(df, 0.8)

In [154]:
def matrix_X(R):
  X = []
  for i in range(len(R)):
    row = [1 if val == 1 else 0 for val in R[i]]
    X.append(row)
  return X

X = matrix_X(R)
y = R #Rating matrix
y_scaler = preprocessing.StandardScaler().fit(y)
y_scaled = y_scaler.fit_transform(y)

X_scaler = preprocessing.StandardScaler().fit(X)
X_scaled = X_scaler.fit_transform(X)

pmfU_scaler = preprocessing.StandardScaler().fit(userFactors)
pmfU_scaled = pmfU_scaler.fit_transform(userFactors)

In [155]:
X_train, X_test = train_test_split(X_scaled, test_size=0.20, random_state=randseed)
y_train, y_test = train_test_split(y_scaled, test_size=0.20, random_state=randseed)
pmfU_train, pmfU_test = train_test_split(pmfU_scaled, test_size=0.20, random_state=randseed)
n_users, n_items = X_train.shape

In [156]:
print(len(X))
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(pmfU_train.shape)
print(pmfU_test.shape)
print(n_items)

1000
(800, 7389)
(800, 7389)
(200, 7389)
(200, 7389)
(800, 20)
(200, 20)
7389


In [159]:
import warnings
warnings.filterwarnings('ignore')
reg = linear_model.Ridge(normalize=True)
for i in range(n_items):
    reg.fit(np.column_stack([X_train[:,i], pmfU_train]), y_train[:,i])

In [160]:
test_items = X_test.shape[1]
prediction = []

for i in range(test_items):
    res = reg.predict(np.column_stack([X_test[:,i], pmfU_test]))
    prediction.append(res)

In [161]:
y_test = np.transpose(y_test)
rmse = mean_squared_error(y_test, prediction, squared=False)
print(rmse)

0.8880783998490532
