# NMF (Non-Negative Matrix Factorization)

---

## 0) Imports and Loading Data

In [1]:
import pandas as pd
import numpy as np

# NMF
from sklearn.decomposition import NMF

In [9]:
# Load the user x movie ratings table.
# The first line of the file is skipped, CSV columns 1-11 are kept, the first
# of those (the user name) becomes the index, and only 13 data rows are read.
R = pd.read_csv(
    'movie_ratings.csv',
    usecols=list(range(1, 12)),
    index_col=0,
    skiprows=1,
    nrows=13,
)
R

Unnamed: 0_level_0,Night on Earth,Der Clou / The Sting,Me and You and Everyone We Know,La vita è bella,Before Sunset,The Last King of Scotland,The Mauritanian,One Flew Over the Cuckoo's Nest,Lullaby for Pi,twelve monkeys
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Stefan,5.0,,,,,,,5.0,,
Thomas,,5.0,,4.0,2.0,4.0,,3.0,,
Filipe,,,5.0,5.0,4.0,,,4.0,,5.0
Marija,5.0,,5.0,3.0,5.0,3.0,,4.0,,2.0
Andreas,,,,,,5.0,,,,
Daniel,5.0,2.0,4.0,3.0,4.0,4.0,5.0,4.0,,5.0
Matthias,4.0,,,4.0,,3.0,,4.0,,4.0
Jana,,,,,,,,5.0,,4.0
Homa,,,,5.0,5.0,,,5.0,,
Saskia,,,,,,,,,5.0,


In [10]:
# Dimensions of the ratings matrix: (number of users, number of movies)
R.shape

(13, 10)

---

## 1) Train NMF
* Small n_components = ...
* High n_components = ...

In [11]:
# Instantiate the nmf with 4 latent components.
# random_state is fixed so that re-running the notebook reproduces the same
# factorisation (scikit-learn uses it for the initialisation and the solver).
nmf = NMF(n_components=4, random_state=42)

In [12]:
# As usual with scikit-learn classes, we fit the nmf.
# R still contains NaN at this point, so scikit-learn raises a ValueError.
# The error is the point of this cell, so it is caught and displayed instead
# of aborting a "Restart Kernel -> Run All".
try:
    nmf.fit(R)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## 2) Deal with Missing Values

- The scikit-learn implementation of NMF cannot deal with missing values, even though handling a sparse ratings matrix was one of our main reasons for using NMF.

**What can we do about it?**

- Fill in the missing values
    - Arbitrary value (0 or 3)
    - Average over DataFrame
    - Average per Column
    - Average per Row
    - Median

- Use a different package, e.g. `surprise`
- Use my nmf_extension code

In [14]:
# Replace every missing rating with 0 so that scikit-learn's NMF accepts the matrix.
# This is not the best solution; try out different solutions
# (0 is treated by the model as a real, very low rating).
# NOTE: this overwrites R - the version with NaN is only available again
# after re-reading the CSV.
R = R.fillna(0)

In [15]:
# Check the result: every former NaN should now be 0.0
R

Unnamed: 0_level_0,Night on Earth,Der Clou / The Sting,Me and You and Everyone We Know,La vita è bella,Before Sunset,The Last King of Scotland,The Mauritanian,One Flew Over the Cuckoo's Nest,Lullaby for Pi,twelve monkeys
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Stefan,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
Thomas,0.0,5.0,0.0,4.0,2.0,4.0,0.0,3.0,0.0,0.0
Filipe,0.0,0.0,5.0,5.0,4.0,0.0,0.0,4.0,0.0,5.0
Marija,5.0,0.0,5.0,3.0,5.0,3.0,0.0,4.0,0.0,2.0
Andreas,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
Daniel,5.0,2.0,4.0,3.0,4.0,4.0,5.0,4.0,0.0,5.0
Matthias,4.0,0.0,0.0,4.0,0.0,3.0,0.0,4.0,0.0,4.0
Jana,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0
Homa,0.0,0.0,0.0,5.0,5.0,0.0,0.0,5.0,0.0,0.0
Saskia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [16]:
# Fit again, now that R no longer contains NaN values
nmf.fit(R)



NMF(n_components=4)

## 3) Inspect the sub-matrices

In [18]:
# Recall the shape of R: the product of the two sub-matrices must have these dimensions
R.shape

(13, 10)

In [17]:
# Extract Q, the component matrix learned by the model (one row per component, one column per movie)
Q = nmf.components_
Q # shape is (n_components, n_movies) = (4, 10)

array([[0.        , 0.        , 1.04507656, 2.70406048, 2.44746186,
        0.        , 0.        , 0.        , 0.        , 1.77846238],
       [2.66275728, 0.        , 1.24888253, 0.        , 0.48978615,
        0.70723996, 0.94825145, 1.85082355, 0.        , 0.69962965],
       [0.13502739, 0.        , 0.        , 0.45001356, 0.        ,
        0.        , 0.        , 2.79954125, 0.        , 0.6575245 ],
       [0.        , 1.28723721, 0.        , 0.94728594, 0.        ,
        2.42821088, 0.13595237, 0.        , 0.        , 0.09436887]])

In [19]:
# Wrap the component matrix in a DataFrame so every column is labelled with
# the movie title it belongs to (row labels are the component numbers 0-3).
Q = pd.DataFrame(data=nmf.components_, columns=R.columns)
Q

Unnamed: 0,Night on Earth,Der Clou / The Sting,Me and You and Everyone We Know,La vita è bella,Before Sunset,The Last King of Scotland,The Mauritanian,One Flew Over the Cuckoo's Nest,Lullaby for Pi,twelve monkeys
0,0.0,0.0,1.045077,2.70406,2.447462,0.0,0.0,0.0,0.0,1.778462
1,2.662757,0.0,1.248883,0.0,0.489786,0.70724,0.948251,1.850824,0.0,0.69963
2,0.135027,0.0,0.0,0.450014,0.0,0.0,0.0,2.799541,0.0,0.657525
3,0.0,1.287237,0.0,0.947286,0.0,2.428211,0.135952,0.0,0.0,0.094369


In [20]:
# R has 13 rows, so P (one row per row of R) must have 13 rows as well
R.shape

(13, 10)

In [21]:
# Calculate P by transforming R with the fitted model (one row per row of R, one column per component)
P = nmf.transform(R)
P # We expect a shape of (n_users, n_components) = 13x4

array([[0.00000000e+00, 1.22435773e+00, 8.62779427e-01, 0.00000000e+00],
       [4.29329937e-01, 0.00000000e+00, 9.56375656e-01, 2.15846918e+00],
       [1.89660577e+00, 3.37954842e-01, 1.19922192e+00, 0.00000000e+00],
       [1.18064519e+00, 1.95356107e+00, 0.00000000e+00, 3.95981457e-01],
       [0.00000000e+00, 5.79166177e-02, 0.00000000e+00, 1.41901463e+00],
       [1.11220072e+00, 2.29317922e+00, 0.00000000e+00, 1.04506959e+00],
       [5.42491709e-01, 9.03520626e-01, 1.06081968e+00, 9.11164543e-01],
       [1.44436228e-01, 3.69583550e-04, 1.91754297e+00, 0.00000000e+00],
       [1.24445461e+00, 0.00000000e+00, 1.55996442e+00, 6.81867635e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.82759351e-01, 0.00000000e+00, 1.60939980e+00, 1.11774697e+00],
       [1.71333327e+00, 0.00000000e+00, 9.05799962e-02, 4.64010982e-02],
       [0.00000000e+00, 0.00000000e+00, 1.64862879e+00, 0.00000000e+00]])

In [22]:
# Wrap the transformed matrix in a DataFrame so every row is labelled with
# the user it belongs to (column labels are the component numbers 0-3).
P = pd.DataFrame(data=nmf.transform(R), index=R.index)
P

Unnamed: 0_level_0,0,1,2,3
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Stefan,0.0,1.224358,0.862779,0.0
Thomas,0.42933,0.0,0.956376,2.158469
Filipe,1.896606,0.337955,1.199222,0.0
Marija,1.180645,1.953561,0.0,0.395981
Andreas,0.0,0.057917,0.0,1.419015
Daniel,1.112201,2.293179,0.0,1.04507
Matthias,0.542492,0.903521,1.06082,0.911165
Jana,0.144436,0.00037,1.917543,0.0
Homa,1.244455,0.0,1.559964,0.068187
Saskia,0.0,0.0,0.0,0.0


In [23]:
# Look at the reconstruction error.
# scikit-learn documents this attribute as the Frobenius norm (for the default
# beta_loss) of the difference between the training matrix and its
# reconstruction from the fitted model; lower means a closer fit.
nmf.reconstruction_err_

11.858954706043287

## 3b) Reconstruct the original matrix

In [None]:
# Look at the matrix R the model was fitted on
# (note: missing ratings have already been replaced by 0 at this point)
R.head()

In [25]:
# Q: (n_components, n_movies)
Q.shape

(4, 10)

In [26]:
# P: (n_users, n_components) - its column count matches Q's row count, so P and Q can be multiplied
P.shape

(13, 4)

In [31]:
# Calculate R_hat: the matrix product of P (users x components) and
# Q (components x movies), labelled with the users and movies of R.
R_hat = pd.DataFrame(
    np.asarray(P) @ np.asarray(Q),
    index=R.index,
    columns=R.columns,
)
R_hat.round(2)

Unnamed: 0_level_0,Night on Earth,Der Clou / The Sting,Me and You and Everyone We Know,La vita è bella,Before Sunset,The Last King of Scotland,The Mauritanian,One Flew Over the Cuckoo's Nest,Lullaby for Pi,twelve monkeys
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Stefan,3.38,0.0,1.53,0.39,0.6,0.87,1.16,4.68,0.0,1.42
Thomas,0.13,2.78,0.45,3.64,1.05,5.24,0.29,2.68,0.0,1.6
Filipe,1.06,0.0,2.4,5.67,4.81,0.24,0.32,3.98,0.0,4.4
Marija,5.2,0.51,3.67,3.57,3.85,2.34,1.91,3.62,0.0,3.5
Andreas,0.15,1.83,0.07,1.34,0.03,3.49,0.25,0.11,0.0,0.17
Daniel,6.11,1.35,4.03,4.0,3.85,4.16,2.32,4.24,0.0,3.68
Matthias,2.55,1.17,1.7,2.81,1.77,2.85,0.98,4.64,0.0,2.38
Jana,0.26,0.0,0.15,1.25,0.35,0.0,0.0,5.37,0.0,1.52
Homa,0.21,0.09,1.3,4.13,3.05,0.17,0.01,4.37,0.0,3.25
Saskia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

## 4) Make a prediction based on new user input

In [32]:
# Create a dictionary for a new user (movie title -> rating).
# A dict is used because you will receive your data from the web app as JSON.
new_user_input = {
    'Before Sunset': 2,
    'twelve monkeys': 4,
}

In [34]:
# Display the dict to double-check the new user's ratings
new_user_input

{'Before Sunset': 2, 'twelve monkeys': 4}

In [38]:
# Turn the ratings dict into a one-row DataFrame for the user 'Paul'.
# Passing columns=R.columns lays the row out with the same movie columns as
# the training matrix; movies that are not in the dict become NaN.
new_user = pd.DataFrame(
    data=new_user_input,
    columns=R.columns,
    index=['Paul'],
)
new_user

Unnamed: 0,Night on Earth,Der Clou / The Sting,Me and You and Everyone We Know,La vita è bella,Before Sunset,The Last King of Scotland,The Mauritanian,One Flew Over the Cuckoo's Nest,Lullaby for Pi,twelve monkeys
Paul,,,,,2,,,,,4


In [40]:
# Prediction step 1 - generate user_P (the new user's weights for the 4 components).
# Unrated movies are filled with 0, the same imputation that was applied to R before fitting.
user_P = nmf.transform(new_user.fillna(0))
user_P

array([[0.66054021, 0.05595301, 0.08457801, 0.        ]])

In [43]:
# Prediction step 2 - reconstruct R, but for this new user only:
# user_P (1 x components) times Q (components x movies).
user_R = pd.DataFrame(
    np.asarray(user_P) @ np.asarray(Q),
    index=['Paul'],
    columns=R.columns,
)
user_R

Unnamed: 0,Night on Earth,Der Clou / The Sting,Me and You and Everyone We Know,La vita è bella,Before Sunset,The Last King of Scotland,The Mauritanian,One Flew Over the Cuckoo's Nest,Lullaby for Pi,twelve monkeys
Paul,0.16041,0.0,0.760194,1.824202,1.644052,0.039572,0.053058,0.340339,0.0,1.269504


In [55]:
# Preview: predicted ratings without the movies the new user has already rated.
# drop() keeps the remaining columns in their original order; the former
# set-difference produced an arbitrary column order that could change between runs.
# errors='ignore' keeps the old tolerance for keys that are not columns of user_R.
user_R.drop(columns=list(new_user_input), errors='ignore')

Unnamed: 0,One Flew Over the Cuckoo's Nest,The Mauritanian,Lullaby for Pi,Der Clou / The Sting,Me and You and Everyone We Know,Night on Earth,La vita è bella,The Last King of Scotland
Paul,0.340339,0.053058,0.0,0.0,0.760194,0.16041,1.824202,0.039572


In [56]:
# Remove movies that have already been seen (the keys of new_user_input).
# drop() keeps the remaining columns in their original order; the former
# set-difference produced an arbitrary column order that could change between runs.
# errors='ignore' keeps the old tolerance for keys that are not columns of user_R.
recommendations = user_R.drop(columns=list(new_user_input), errors='ignore')
recommendations

Unnamed: 0,One Flew Over the Cuckoo's Nest,The Mauritanian,Lullaby for Pi,Der Clou / The Sting,Me and You and Everyone We Know,Night on Earth,La vita è bella,The Last King of Scotland
Paul,0.340339,0.053058,0.0,0.0,0.760194,0.16041,1.824202,0.039572


In [58]:
# Sort the recommendations: axis=1 reorders the columns (movies) by their
# value in the row 'Paul', highest predicted rating first.
# The sorted frame is only displayed, not stored.
recommendations.sort_values(axis=1, by='Paul', ascending=False)

Unnamed: 0,La vita è bella,Me and You and Everyone We Know,One Flew Over the Cuckoo's Nest,Night on Earth,The Mauritanian,The Last King of Scotland,Lullaby for Pi,Der Clou / The Sting
Paul,1.824202,0.760194,0.340339,0.16041,0.053058,0.039572,0.0,0.0


---

## NMF_extension

In [59]:
pip install git+https://github.com/stefanfroth/nmf_extension

Collecting git+https://github.com/stefanfroth/nmf_extension
  Cloning https://github.com/stefanfroth/nmf_extension to /private/var/folders/5k/g90jxpln6hd_1v980jl2q12w0000gn/T/pip-req-build-4w2uzy7f
  Running command git clone -q https://github.com/stefanfroth/nmf_extension /private/var/folders/5k/g90jxpln6hd_1v980jl2q12w0000gn/T/pip-req-build-4w2uzy7f
Note: you may need to restart the kernel to use updated packages.


In [61]:
from nmf_extension.nmf import CustomNMF

In [60]:
# Reload the ratings from disk: R was overwritten with a zero-filled copy
# earlier, and this section needs the version that still contains NaN.
R = pd.read_csv(
    'movie_ratings.csv',
    usecols=list(range(1, 12)),
    index_col=0,
    skiprows=1,
    nrows=13,
)

In [62]:
# Same number of latent components as the scikit-learn model, to keep the two comparable
custom_nmf = CustomNMF(n_components=4)

In [64]:
# Fit directly on the reloaded R, which still contains NaN for unrated movies
# (presumably CustomNMF skips missing entries while fitting - confirm in the nmf_extension source)
custom_nmf.fit(R)

CustomNMF(init='custom', n_components=4)

In [68]:
# Reconstruct the full ratings matrix from the custom model
# (transformed R times the learned components), labelled like R and
# rounded to two decimals.
R_hat_custom = pd.DataFrame(
    np.dot(custom_nmf.transform(R), custom_nmf.components_),
    index=R.index,
    columns=R.columns,
).round(2)
R_hat_custom

Unnamed: 0_level_0,Night on Earth,Der Clou / The Sting,Me and You and Everyone We Know,La vita è bella,Before Sunset,The Last King of Scotland,The Mauritanian,One Flew Over the Cuckoo's Nest,Lullaby for Pi,twelve monkeys
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Stefan,5.03,5.35,5.31,4.12,4.92,4.16,4.35,4.72,5.36,3.22
Thomas,3.13,4.75,4.38,4.44,2.53,3.26,4.83,3.22,3.95,3.13
Filipe,5.15,3.51,5.16,4.93,3.67,3.95,6.78,4.14,4.52,5.05
Marija,4.77,4.04,4.68,3.21,4.65,3.26,4.02,4.13,4.37,2.59
Andreas,4.98,5.67,5.38,4.37,4.87,4.42,4.35,4.82,5.6,3.43
Daniel,5.09,2.17,4.09,3.52,3.88,3.64,5.01,3.88,4.1,4.59
Matthias,4.17,4.49,4.54,3.93,3.78,3.67,4.25,3.92,4.54,3.26
Jana,4.75,6.01,5.45,5.08,4.33,4.84,4.86,4.79,5.84,4.17
Homa,5.89,4.89,6.0,5.24,4.87,4.64,6.59,5.06,5.6,4.9
Saskia,5.9,4.26,5.76,4.35,5.16,3.9,6.08,4.82,5.0,3.93


# Next steps 
* Download the MovieLens 100k dataset - all the data you'll need is in `ratings.csv`
* Work out a way to create a matrix with rows = users, columns = movies, and values = the user's rating for that movie (e.g. one column holds every user's rating for movie_id=1, the next one for movie_id=2, etc.)
* FOLLOW the steps in this notebook to create a trained NMF able to make predictions