# Import packages

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
from developed_methods import *

In [3]:
import scipy as sp
from scipy.stats import chi2

def mahalanobis_method(df):
    #M-Distance
    x_minus_mu = df - np.mean(df)
    cov = np.cov(df.values.T)                           #Covariance
    inv_covmat = sp.linalg.inv(cov)                     #Inverse covariance
    left_term = np.dot(x_minus_mu, inv_covmat) 
    mahal = np.dot(left_term, x_minus_mu.T)
    md = np.sqrt(mahal.diagonal())
    
    #Flag as outlier
    outlier = []
    #Cut-off point
    C = np.sqrt(chi2.ppf((1-0.001), df=df.shape[1]))    #degrees of freedom = number of variables
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
        else:
            continue
    return outlier, md

# save the predicted ratings to csv file
def save_csv(df, folder_path, method):
    nowTime = datetime.now().strftime("%Y-%m-%d_%H-%M")
    fileName = "{folder_path}/{method}_{nowTime}.csv".format(folder_path = folder_path, method = method, nowTime = nowTime)
    df.to_csv(fileName, index = False)

# Load dataset

In [4]:
train_rating = pd.read_csv("../data/train_rating.csv")
test_pair = pd.read_csv("../data/test_pair.csv")

item_feat = pd.read_csv("../data/item_feats.csv")
user_feat = pd.read_csv("../data/user_feats.csv")

sub = pd.read_csv('../predict/sample_submission.csv')

In [5]:
# UserID
le_user = preprocessing.LabelEncoder()
le_user.fit(np.append(np.append(train_rating['UserId'], test_pair["UserId"]), user_feat["UserId"]))

user_feat['UserId'] = le_user.transform(user_feat["UserId"])
test_pair["UserId"] = le_user.transform(test_pair["UserId"])
train_rating['UserId'] = le_user.transform(train_rating["UserId"])

# ItemID
le_item = preprocessing.LabelEncoder()
le_item.fit(np.append(np.append(train_rating['ItemId'], test_pair["ItemId"]), item_feat["ItemId"]))

item_feat['ItemId'] = le_item.transform(item_feat["ItemId"])
test_pair["ItemId"] = le_item.transform(test_pair["ItemId"])
train_rating['ItemId'] = le_item.transform(train_rating["ItemId"])

#Inf value
user_feat.loc[np.isinf(user_feat['V1']),'V1']=-3
item_feat.loc[np.isinf(item_feat['V2']),'V2']=2

# Missing data
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(user_feat['V1'].values.reshape(-1, 1))
user_feat['V1'] = imp_mean.transform(user_feat['V1'].values.reshape(-1, 1))

In [6]:
# tran_pair, train_rating
train_pairs = train_rating[['UserId', 'ItemId']].values
train_ratings = train_rating['rating'].values
train_pair=train_rating.drop(columns='rating')

# test_pair
test_pairs = test_pair[['UserId', 'ItemId']].values

# number of users and items
# n_user, n_item = len(le_item.classes_), len(le_item.classes_)

In [7]:
n_user, n_item = max(train_pairs[:,0].max(), test_pairs[:,0].max())+1, max(train_pairs[:,1].max(), test_pairs[:,1].max())+1

In [8]:
class min_max_adj:
    def __init__(self, train_rating):
        self.min = np.min(train_rating)
        self.max = np.max(train_rating)
        self.true_rating = train_rating
    
    def adjust(self, pred_rating):
        pred_rating_adjusted = pred_rating.copy()
        pred_rating_adjusted[pred_rating > self.max] = self.max
        pred_rating_adjusted[pred_rating < self.min] = self.min
        return pred_rating_adjusted

    def rmse(self, pred_rating):
        return np.sqrt(np.mean((pred_rating - self.true_rating)**2))

adjustment = min_max_adj(train_rating["rating"])
print("Minmium and maximum:", [adjustment.min, adjustment.max])

Minmium and maximum: [0.0, 5.0]


# Additonal features
additional features and {rating_mean, rating_count}  

## user_pd and item_pd
using outer join and fill missing data  
if no rating records, rating_count = 0 and rating_mean = glb_avg

In [9]:
## generate cont feats for users
user_pd = pd.merge(left=train_rating.groupby('UserId')['rating'].mean(), 
				   right=train_rating.groupby('UserId')['rating'].count(), on='UserId')
user_pd.columns = ['rating_mean', 'rating_count']
user_pd = pd.merge(left = user_feat, right = user_pd, on = "UserId", how = "outer") # using outer join

## handle missing data
# if the user has no rating record, set rating_count = 0
user_pd.fillna(value = {"rating_count": 0}, inplace = True)
# if the rating_mean is missing, then use global mean
imp_mean.fit(user_pd)
user_pd = pd.DataFrame(imp_mean.transform(user_pd), columns = user_pd.columns)

## generate cont feats for items
item_rating_pd = pd.merge(left=train_rating.groupby('ItemId')['rating'].mean(), 
						  right=train_rating.groupby('ItemId')['rating'].count(), on='ItemId')
item_rating_pd.columns	= ['rating_mean', 'rating_count']
item_pd = pd.merge(left=item_feat, right=item_rating_pd, on='ItemId', how = "outer") # using outer join

## handle missing data
# if the item has no rating record, set rating_count = 0
item_pd.fillna(value = {"rating_count": 0}, inplace = True)
# if the rating_mean is missing, then use global mean
imp_mean.fit(item_pd)
item_pd = pd.DataFrame(imp_mean.transform(item_pd), columns = item_pd.columns)


print('#######################################################')
print('########## 10 random samples for users feats ##########')
print('#######################################################')

print(user_pd.sample(10))
print('#######################################################')
print('########## 10 random samples for items feats ##########')
print('#######################################################')

print(item_pd.sample(10))

#######################################################
########## 10 random samples for users feats ##########
#######################################################
        UserId        V1      V2      V3     V4  rating_mean  rating_count
8236   18617.0  3.389981  6579.0  1134.0  330.0     2.211891           0.0
22416   5006.0  3.423729  3040.0  1079.0   56.0     2.211891           0.0
7457    8995.0  3.423729  2249.0   626.0  330.0     2.211891           0.0
1421   22188.0  3.139943   582.0   296.0  290.0     2.211891           0.0
16075  12405.0  3.423729  5989.0   972.0  330.0     2.211891           0.0
19253  19047.0  3.223776  8195.0  1630.0  300.0     2.211891           0.0
19715  15778.0  3.447693  1491.0  1134.0  330.0     2.211891           0.0
11122  16882.0  3.423729  6508.0  1555.0  330.0     0.000000           1.0
2386   13911.0  3.423729  1489.0   439.0  325.0     2.211891           0.0
25946   8164.0  3.423729  5654.0  1134.0  330.0     2.211891           0.0
#######

## Standardize continous features

In [10]:
## pre-processing for users
user_cont = ["V1", "V2", "V3", "V4", "rating_mean", "rating_count"]
user_pd[user_cont] = StandardScaler().fit_transform(user_pd[user_cont])

## pre-processing for item
item_cont = ["V1", "V2", "V3", "rating_mean", "rating_count"]
item_pd[item_cont] = StandardScaler().fit_transform(item_pd[item_cont])


user_pd = user_pd.set_index('UserId', drop=True)
item_pd = item_pd.set_index('ItemId', drop=True)

print('#######################################################')
print('########## 10 random samples for users feats ##########')
print('#######################################################')
print(user_pd.sample(10))

print('#######################################################')
print('########## 10 random samples for items feats ##########')
print('#######################################################')
print(item_pd.sample(10))

#######################################################
########## 10 random samples for users feats ##########
#######################################################
               V1        V2        V3        V4   rating_mean  rating_count
UserId                                                                     
4524.0   0.000000 -0.893384 -0.266972  0.762978 -5.564071e-16     -0.067335
20398.0  0.589498  0.628581 -1.572501  0.762978 -5.564071e-16     -0.067335
20655.0  0.025396 -1.478170 -1.456652  0.412484 -5.564071e-16     -0.067335
14222.0 -0.769082  0.534778  0.838050 -1.147216 -5.564071e-16     -0.067335
15450.0  0.926189 -1.707606  0.477135 -1.068355 -5.564071e-16     -0.067335
15686.0 -0.531399  0.906185  0.131816  0.762978 -5.564071e-16     -0.067335
19959.0  0.000000  1.218860  0.289995  0.482583 -5.564071e-16     -0.067335
12507.0  0.000000  0.758721  0.844733 -1.637908 -5.564071e-16     -0.067335
3771.0   0.480938 -1.213242  0.588529  0.762978 -5.564071e-16     -0.067

# NCF model
only two embeddings for categorical features, UserId and ItemId.

In [11]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [12]:
class SideNCF(keras.Model):
    def __init__(self, num_users, num_items, embedding_size, **kwargs):
        super(SideNCF, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-2),
        )
        self.itme_embedding = layers.Embedding(
            num_items,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-2),
        )

        self.concatenate = layers.Concatenate()
        self.dense1 = layers.Dense(100, name='fc-1', activation='relu')
        self.dense2 = layers.Dense(50, name='fc-2', activation='relu')
        self.dense3 = layers.Dense(1, name='fc-3', activation='relu')

    def call(self, inputs):
        cont_feats = inputs[0]
        cate_feats = inputs[1]

        user_vector = self.user_embedding(cate_feats[:,0])
        itme_vector = self.itme_embedding(cate_feats[:,1])

        concatted_vec = self.concatenate([cont_feats, user_vector, itme_vector])
        fc_1 = self.dense1(concatted_vec)
        fc_2 = self.dense2(fc_1)
        fc_3 = self.dense3(fc_2)
        return fc_3

In [13]:
model = SideNCF(num_users=n_user, num_items=n_item, embedding_size=50)

metrics = [
    keras.metrics.MeanAbsoluteError(name='mae'),
    keras.metrics.RootMeanSquaredError(name='rmse')
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-3), 
    loss=tf.keras.losses.MeanSquaredError(), 
    metrics=metrics
)

## Find the continuous features and categorical features for user and item, respectively
cate_feats = ["UserId", "ItemId"]

In [14]:
item_cont = ["V1", "V2", "V3", 'rating_mean', 'rating_count']
user_cont = ["V1", "V2", "V3", "V4", 'rating_mean', 'rating_count']

train_cont_feats = np.hstack((user_pd.loc[train_pairs[:,0]][user_cont], item_pd.loc[train_pairs[:,1]][item_cont]))
train_cate_feats = train_pairs.copy()

test_cont_feats = np.hstack((user_pd.loc[test_pairs[:,0]][user_cont], item_pd.loc[test_pairs[:,1]][item_cont]))
test_cate_feats = test_pairs.copy()

## Fit the model

In [15]:
callbacks = [keras.callbacks.EarlyStopping( 
    monitor='val_rmse', min_delta=0, patience=5, verbose=1, 
    mode='auto', baseline=None, restore_best_weights=True)]

history = model.fit(
    x=[train_cont_feats, train_cate_feats],
    y=train_ratings,
    batch_size=64,
    epochs=50,
    verbose=1,
    validation_split=.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Result of training dataset
the result is adjusted for min=0, max=5

In [16]:
pred_rating = model.predict([train_cont_feats, train_cate_feats]).flatten()
pred_rating = adjustment.adjust(pred_rating)
print(pred_rating)
print('rmse: SideNCF: %.3f' %rmse(train_ratings, pred_rating))

[0.       0.       0.       ... 0.       0.       4.631906]
rmse: SideNCF: 0.874


## Save prediction

In [17]:
pred = model.predict([test_cont_feats, test_cate_feats]).flatten()
pred = adjustment.adjust(pred)
sub["rating"] = pred
save_csv(sub, "../predict", "NCF")

# Random Forest Regressor

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
# fit the regressor using RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 500, random_state = 3009)
regressor.fit(train_cate_feats, train_ratings)
pred_rating = regressor.predict(test_cate_feats)

pred = adjustment.adjust(pred_rating)
sub["rating"] = pred
save_csv(sub, "../predict", "rf")

# NCF + Random Forest
Train a random forest regressor by the residuals of NCF

In [21]:
pred_rating = model.predict([train_cont_feats, train_cate_feats]).flatten()
pred_rating = adjustment.adjust(pred_rating)
train_ratings_cm = train_ratings - pred_rating

NCF_regressor = RandomForestRegressor(n_estimators = 500, random_state = 3009)
NCF_regressor.fit(train_cate_feats, train_ratings_cm)
pred_rating = regressor.predict(test_cate_feats)

In [23]:
pred_rating = NCF_regressor.predict(test_cate_feats)
pred = adjustment.adjust(pred_rating)
sub["rating"] = pred
save_csv(sub, "../predict", "NCF_rf")