In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [2]:
rating = pd.read_csv('ratings.csv')
architect = pd.read_csv('architects.csv')
user = pd.read_csv('users.csv');
architect_rating = pd.merge(rating, architect, on='architect_id')
cols = ['Registration', 'Country', 'Address 2', 'Address 3', 'Company', 'WorkPhone', 'City', 'State', 'Postcode', 'Member Type']
architect_rating.drop(cols, axis=1, inplace=True)

architect_rating.head()

Unnamed: 0,user_id,architect_id,rating,Name,Address 1
0,37,967,0,Mr Russell Pfitz,Level 7 16 Marcus Clarke Street
1,97,967,6,Mr Russell Pfitz,Level 7 16 Marcus Clarke Street
2,35,967,7,Mr Russell Pfitz,Level 7 16 Marcus Clarke Street
3,49,967,5,Mr Russell Pfitz,Level 7 16 Marcus Clarke Street
4,9,967,4,Mr Russell Pfitz,Level 7 16 Marcus Clarke Street


In [3]:
rating_count = (architect_rating.
                groupby(by = ['architect_id'])['rating'].
                count().
                reset_index().
                rename(columns = {'rating': 'rating_count'})
               )
rating_count.head()

Unnamed: 0,architect_id,rating_count
0,1,5
1,3,4
2,4,1
3,5,3
4,7,1


In [4]:
threshold = 5
rating_count = rating_count.query('rating_count >= @threshold')

user_rating = pd.merge(rating_count, architect_rating, left_on='architect_id', right_on='architect_id', how='left')

In [5]:
user_count = (user_rating.
              groupby(by = ['user_id'])['rating'].
              count().
              reset_index().
              rename(columns = {'rating': 'rating_count'})
              [['user_id', 'rating_count']]
             )

In [6]:
threshold = 5
user_count = user_count.query('rating_count >= @threshold')

combined = user_rating.merge(user_count, left_on='user_id', right_on='user_id', how='inner')

print('Number of unique architects: ', combined['architect_id'].nunique())
print('Number of unique users: ', combined['user_id'].nunique())

Number of unique architects:  41
Number of unique users:  12


In [7]:
scaler = MinMaxScaler()
combined['rating'] = combined['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(combined['rating'].values.reshape(-1,1)))
combined['rating'] = rating_scaled
combined.head()

Unnamed: 0,architect_id,rating_count_x,user_id,rating,Name,Address 1,rating_count_y
0,1,5,69,0.5,Mr Brian Hodgen,L1/ 453 Ruthven Street,6
1,118,5,69,0.3,Mr Martin McGrane,48-54 Middle Road,6
2,748,6,69,0.6,Mr Robert McAdam,L7/ 32 Cordelia Street,6
3,748,6,69,0.3,Mr Robert McAdam,L7/ 32 Cordelia Street,6
4,862,6,69,0.2,Mr Brett Anstis,30 Commercial Road,6


In [8]:
combined = combined.drop_duplicates(['user_id', 'architect_id'])
user_architect_matrix = combined.pivot(index='user_id', columns='architect_id', values='rating')
user_architect_matrix.fillna(0, inplace=True)

users = user_architect_matrix.index.tolist()
architects = user_architect_matrix.columns.tolist()

#df.as_matrix() deprecated as of v0.23.0 using df.values
user_architect_matrix = user_architect_matrix.values

In [9]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [10]:
num_input = combined['architect_id'].nunique()
num_hidden_1 = 10
num_hidden_2 = 5

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

def encoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

def decoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

In [11]:
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

y_pred = decoder_op

y_true = X

In [12]:
loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [13]:
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()
pred_data = pd.DataFrame()
print(pred_data)

Empty DataFrame
Columns: []
Index: []


In [14]:
with tf.Session() as session:
    epochs = 50
    batch_size =10
    
    session.run(init)
    session.run(local_init)
    
    num_batches = int(user_architect_matrix.shape[0] / batch_size)
    user_architect_matrix = np.array_split(user_architect_matrix, num_batches)
    
    for i in range (epochs):
        avg_cost = 0
        for batch in user_architect_matrix:
            _, l = session.run([optimizer, loss], feed_dict = {X: batch})
            avg_cost += 1
            
        avg_cost /= num_batches
        
        print("epoch: {} Loss: {}".format(i+1, avg_cost))
        
    user_architect_matrix = np.concatenate(user_architect_matrix, axis=0)
    
    preds = session.run(decoder_op, feed_dict = {X: user_architect_matrix})

    pred_data = pred_data.append(pd.DataFrame(preds))
    
    pred_data = pred_data.stack().reset_index(name='rating')
    pred_data.rename(columns = {'level_0': 'user_id', 'level_1': 'architect_id'}, inplace=True)
    pred_data['user_id'] = pred_data['user_id'].map(lambda value: users[value])
    pred_data['architect_id'] = pred_data['architect_id'].map(lambda value: architects[value])
    
    keys = ['user_id', 'architect_id']
    index_1 = pred_data.set_index(keys).index
    index_2 = combined.set_index(keys).index
    
    top_ten_ranked = pred_data[~index_1.isin(index_2)]
    top_ten_ranked = top_ten_ranked.sort_values(['user_id', 'rating'], ascending=[True, False])
    top_ten_ranked = top_ten_ranked.groupby('user_id').head(10)

epoch: 1 Loss: 1.0
epoch: 2 Loss: 1.0
epoch: 3 Loss: 1.0
epoch: 4 Loss: 1.0
epoch: 5 Loss: 1.0
epoch: 6 Loss: 1.0
epoch: 7 Loss: 1.0
epoch: 8 Loss: 1.0
epoch: 9 Loss: 1.0
epoch: 10 Loss: 1.0
epoch: 11 Loss: 1.0
epoch: 12 Loss: 1.0
epoch: 13 Loss: 1.0
epoch: 14 Loss: 1.0
epoch: 15 Loss: 1.0
epoch: 16 Loss: 1.0
epoch: 17 Loss: 1.0
epoch: 18 Loss: 1.0
epoch: 19 Loss: 1.0
epoch: 20 Loss: 1.0
epoch: 21 Loss: 1.0
epoch: 22 Loss: 1.0
epoch: 23 Loss: 1.0
epoch: 24 Loss: 1.0
epoch: 25 Loss: 1.0
epoch: 26 Loss: 1.0
epoch: 27 Loss: 1.0
epoch: 28 Loss: 1.0
epoch: 29 Loss: 1.0
epoch: 30 Loss: 1.0
epoch: 31 Loss: 1.0
epoch: 32 Loss: 1.0
epoch: 33 Loss: 1.0
epoch: 34 Loss: 1.0
epoch: 35 Loss: 1.0
epoch: 36 Loss: 1.0
epoch: 37 Loss: 1.0
epoch: 38 Loss: 1.0
epoch: 39 Loss: 1.0
epoch: 40 Loss: 1.0
epoch: 41 Loss: 1.0
epoch: 42 Loss: 1.0
epoch: 43 Loss: 1.0
epoch: 44 Loss: 1.0
epoch: 45 Loss: 1.0
epoch: 46 Loss: 1.0
epoch: 47 Loss: 1.0
epoch: 48 Loss: 1.0
epoch: 49 Loss: 1.0
epoch: 50 Loss: 1.0


In [22]:
top_ten_ranked.loc[top_ten_ranked['user_id'] == 5]

Unnamed: 0,user_id,architect_id,rating
7,5,223,0.963458
39,5,967,0.958479
16,5,400,0.932344
24,5,679,0.854232
3,5,118,0.841018
30,5,816,0.787757
35,5,887,0.77413
29,5,814,0.77366
6,5,167,0.769225
20,5,505,0.768278


In [23]:
rating.loc[rating['user_id'] == 5].sort_values(by=['rating'], ascending=False)

Unnamed: 0,user_id,architect_id,rating
126,5,754,9
604,5,107,9
1182,5,362,9
607,5,579,7
1411,5,1011,7
1733,5,969,7
1034,5,346,6
91,5,1009,5
363,5,92,5
818,5,450,4
