# Question 3

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
# Load the movie catalogue and the user ratings from CSV files in the working directory.
movies_df = pd.read_csv("movies.csv")
rating_df = pd.read_csv("ratings.csv")

## A

In [7]:
# Preview the first rows of the movies table (head() defaults to 5 rows).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first rows of the ratings table (head() defaults to 5 rows).
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Preview the last rows of the movies table (tail() defaults to 5 rows).
movies_df.tail()

Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [10]:
# Preview the last rows of the ratings table (tail() defaults to 5 rows).
rating_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [11]:
# (number of rows, number of columns) of the movies table.
movies_df.shape

(9742, 3)

In [12]:
# (number of rows, number of columns) of the ratings table.
rating_df.shape

(100836, 4)

In [13]:
# Row labels of movies_df as a NumPy array; these become the List_Index column below.
movies_df.index.values

array([   0,    1,    2, ..., 9739, 9740, 9741])

In [14]:
# Keep each movie's row position as an explicit column so it survives the merge.
movies_df["List_Index"] = movies_df.index

## B

In [15]:
# Inner-join movies with their ratings on movieId: one row per (movie, user rating).
data = pd.merge(movies_df, rating_df, on="movieId")
data.head()

Unnamed: 0,movieId,title,genres,List_Index,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,17,4.5,1305696483


## C

In [16]:
# Drop the columns that are not needed for modelling.
# Non-inplace with errors="ignore" so that re-running this cell after the
# columns are already gone does not raise KeyError.
data = data.drop(columns=['title', 'genres', 'timestamp'], errors='ignore')

## D

In [17]:
# Number of non-null entries per column for every user, i.e. how many ratings each user gave.
data.groupby("userId").count()

Unnamed: 0_level_0,movieId,List_Index,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,232,232,232
2,29,29,29
3,39,39,39
4,216,216,216
5,44,44,44
...,...,...,...
606,1115,1115,1115
607,187,187,187
608,831,831,831
609,37,37,37


In [18]:
# Group the merged table by user; iterating data_gp yields (userId, rows of that user).
data_gp = data.groupby("userId")
data_gp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9725bd4a00>

## E

In [58]:
# Ratings as an (n_rows, 1) column vector, the 2-D layout StandardScaler expects.
x = np.array(data["rating"])[:, np.newaxis]

In [59]:
# Standardise the ratings (zero mean, unit variance) and write them back into `data`.
# NOTE(review): unrated movies are later filled with 0, which after standardisation
# coincides with the mean rating — confirm this is intended.
scaler = StandardScaler()
data["rating"] = scaler.fit_transform(x)

#### user_df

In [150]:
def build_user_item_matrix(ratings):
    """Pivot a long (userId, movieId, rating) table into a dense user x movie frame.

    Args:
        ratings: DataFrame with at least the columns ``userId``, ``movieId``
            and ``rating``.

    Returns:
        (movie_ids, matrix): ``movie_ids`` is the array of distinct movie ids in
        order of first appearance; ``matrix`` has one row per user (sorted by
        userId), one column per entry of ``movie_ids``, and 0 where the user has
        no rating for that movie.
    """
    movie_ids = ratings['movieId'].unique()
    matrix = (
        ratings
        # If a (user, movie) pair occurs more than once, keep the first rating.
        .drop_duplicates(subset=['userId', 'movieId'])
        .pivot(index='userId', columns='movieId', values='rating')
        # pivot sorts the columns; restore the first-appearance order of movie_ids.
        .reindex(columns=movie_ids)
        .fillna(0)
    )
    return movie_ids, matrix


# One vectorised pivot replaces the per-rating boolean scans and the chained
# `user_df.rating.loc[...] = rate` assignment, which is slow and does not write
# through under pandas Copy-on-Write.
m_id, user_item = build_user_item_matrix(data)

# Per-user frames (movieId, rating), kept for later inspection of individual users.
df_all = [
    pd.DataFrame({'movieId': m_id, 'rating': user_ratings})
    for user_ratings in user_item.to_numpy()
]

train_X = torch.tensor(user_item.to_numpy(), dtype=torch.float)

In [151]:
# Expected layout: one row per user, one column per distinct movie id.
train_X.shape

torch.Size([610, 9724])

In [152]:
# Rating vector stored at row position 75 (a single user).
train_X[75]

tensor([0.5000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000])

In [153]:
# Show the per-user frame at position 75 using the notebook's rich DataFrame display.
df_all[75]

      movieId  rating
0           1     0.5
1           2     0.0
2           3     0.0
3           4     0.0
4           5     0.0
...       ...     ...
9719   193581     0.0
9720   193583     0.0
9721   193585     0.0
9722   193587     0.0
9723   193609     0.0

[9724 rows x 2 columns]


## F

In [214]:
class RBM():
    """Small restricted Boltzmann machine with manually applied parameter updates.

    The parameters are ``nn.Parameter`` tensors created with
    ``requires_grad=False``; they are modified in place by :meth:`update`
    rather than through autograd. Both passes use the sigmoid activations
    directly; no stochastic sampling of unit states is performed.

    Args:
        n_v: number of visible units.
        n_h: number of hidden units.
        k: step size multiplied into every parameter update.
    """

    def __init__(self, n_v, n_h, k):
        # Weights are stored as (n_h, n_v) so F.linear maps visible -> hidden directly.
        self.w = nn.Parameter(torch.randn(n_h, n_v) * 0.01, requires_grad=False)
        self.bias_v = nn.Parameter(torch.zeros(1, n_v), requires_grad=False)
        self.bias_h = nn.Parameter(torch.zeros(1, n_h), requires_grad=False)
        self.n_v = n_v
        self.n_h = n_h
        self.k = k

    def forward(self, x):
        """Return hidden activations ``sigmoid(x @ w.T + bias_h)`` for visible input ``x``."""
        x = F.linear(x, self.w, self.bias_h)
        a = torch.sigmoid(x)
        return a

    def backward(self, a):
        """Return the visible reconstruction ``sigmoid(a @ w + bias_v)`` from hidden ``a``."""
        a = F.linear(a, self.w.T, self.bias_v)
        x = torch.sigmoid(a)
        return x

    def update(self, x, x2, a, a2):
        """Apply one in-place update step to the weights and both biases.

        Args:
            x: visible input, shape (batch, n_v).
            x2: reconstructed visible units, same shape as ``x``.
            a: hidden activations computed from ``x``, shape (batch, n_h).
            a2: hidden activations computed from ``x2``, same shape as ``a``.
        """
        self.w += self.k * (torch.mm(x.T, a) - torch.mm(x2.T, a2)).T

        # Sum over the batch dimension so the (1, n) bias shape is preserved for
        # batches larger than one; for a single row this is exactly x - x2 / a - a2.
        self.bias_v += self.k * (x - x2).sum(dim=0, keepdim=True)
        self.bias_h += self.k * (a - a2).sum(dim=0, keepdim=True)

In [215]:
# Seed torch so the random weight initialisation (and hence the results below)
# is reproducible across kernel restarts.
torch.manual_seed(0)

# One visible unit per distinct movie, 20 hidden units, update step size 1.
# NOTE(review): the reconstruction is a sigmoid in (0, 1) while the inputs are
# ratings; together with a step size of 1 this may explain predictions
# saturating at 1.0 — confirm the intended input scale / step size.
rbm = RBM(data.movieId.unique().shape[0], 20, 1)

In [216]:
# Train for 20 passes over all users, performing one update per user row.
for epoch in range(20):
    for user_row in train_X:
        # Turn the 1-D rating vector into a single-row batch.
        visible = user_row.view(1, -1)

        hidden = rbm.forward(visible)
        reconstruction = rbm.backward(hidden)
        hidden_recon = rbm.forward(reconstruction)

        rbm.update(visible, reconstruction, hidden, hidden_recon)

In [217]:
# Reconstruct the rating vector stored at row 75: visible -> hidden -> visible.
rating = train_X[75].view(1, -1)  # single-row batch
a = rbm.forward(rating)
xp = rbm.backward(a)

In [218]:
# Flatten the single-row reconstruction straight into a list of floats instead of
# building a nested list and unpacking it afterwards with apply().
prd = xp.detach().numpy().ravel().tolist()

# Side-by-side view of the user's actual ratings and the reconstructed values.
c = pd.DataFrame({
    'movieId': df_all[75].movieId.values,
    'rating': df_all[75].rating.values,
    'prediction': prd,
})

In [219]:
# The 15 rows with the highest predicted value, rated or not.
c.nlargest(n=15, columns="prediction")

Unnamed: 0,movieId,rating,prediction
0,1,0.5,1.0
1,2,0.0,1.0
5,6,0.0,1.0
9,10,0.0,1.0
10,11,0.0,1.0
15,16,0.0,1.0
16,17,0.0,1.0
20,21,0.0,1.0
24,25,0.0,1.0
27,28,0.0,1.0


In [220]:
# Keep only rows whose stored rating is 0 (the fill value used for unrated movies).
cc = c[c["rating"].eq(0)]

In [221]:
# Top 15 candidates among the zero-rated rows, ranked by predicted value.
ccc = cc.nlargest(n=15, columns="prediction")
ccc


Unnamed: 0,movieId,rating,prediction
1,2,0.0,1.0
5,6,0.0,1.0
9,10,0.0,1.0
10,11,0.0,1.0
15,16,0.0,1.0
16,17,0.0,1.0
20,21,0.0,1.0
24,25,0.0,1.0
27,28,0.0,1.0
31,32,0.0,1.0


In [202]:
# Look the titles up in movies_df: the `title` column was dropped from `data`
# earlier in the notebook, so `data.title` does not exist on a fresh run.
title_by_movie = (
    movies_df
    .drop_duplicates(subset='movieId')
    .set_index('movieId')['title']
)
# .loc keeps the order of ccc and raises KeyError for an unknown movieId.
pd_title = title_by_movie.loc[ccc['movieId']].tolist()
    

In [203]:
# Titles of the recommended movies, in the same order as the rows of ccc.
pd_title

['GoldenEye (1995)',
 'American President, The (1995)',
 'Sense and Sensibility (1995)',
 'Get Shorty (1995)',
 'Leaving Las Vegas (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 'Clueless (1995)',
 'Misérables, Les (1995)',
 'Broken Arrow (1996)',
 'Bottle Rocket (1996)',
 'Rumble in the Bronx (Hont faan kui) (1995)',
 'Birdcage, The (1996)',
 'Apollo 13 (1995)',
 'Batman Forever (1995)']

In [204]:
# assign() returns a new frame, avoiding SettingWithCopyWarning on a frame that
# was derived from a filtered slice, and keeps the cell safe to re-run.
ccc = ccc.assign(pd_title=pd_title)

In [205]:
# Final recommendation table: movieId, stored rating, predicted value and title.
ccc

Unnamed: 0,movieId,rating,prediction,pd_title
9,10,0.0,1.0,GoldenEye (1995)
10,11,0.0,1.0,"American President, The (1995)"
16,17,0.0,1.0,Sense and Sensibility (1995)
20,21,0.0,1.0,Get Shorty (1995)
24,25,0.0,1.0,Leaving Las Vegas (1995)
27,28,0.0,1.0,Persuasion (1995)
28,29,0.0,1.0,"City of Lost Children, The (Cité des enfants p..."
35,39,0.0,1.0,Clueless (1995)
65,73,0.0,1.0,"Misérables, Les (1995)"
84,95,0.0,1.0,Broken Arrow (1996)
