In [1]:
import warnings
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from multipledispatch import dispatch
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

In [None]:
import seaborn as sns
def corr_heatmap(df):
    """Plot an annotated heatmap of the pairwise correlations of ``df``.

    The matrix comes from ``df.corr()``. Entries on and above the diagonal
    are masked, so only the lower triangle is drawn. The plot goes onto a
    newly created 7x5 figure; nothing is returned.

    Note: ``sns.set(font_scale=1.15)`` is invoked on every call.
    """
    corr_matrix = df.corr()
    # 1.0 on/above the diagonal, 0.0 below -- passed to seaborn as the mask.
    upper_mask = np.triu(np.ones_like(corr_matrix))

    sns.set(font_scale=1.15)
    # Only the axes creation happens under the "ticks" style context.
    with sns.axes_style(style="ticks"):
        _, ax = plt.subplots(figsize=(7, 5))
    sns.heatmap(
        corr_matrix,
        mask=upper_mask,
        annot=True,
        linewidths=1,
        cmap="YlGnBu",
        ax=ax,
    )

In [None]:
def macro_accuracy(actual, pred):
    """Return the fraction of positions where ``actual`` and ``pred`` are equal.

    Elements are compared pairwise with ``==``. The two inputs must have the
    same length (enforced with ``assert``); an empty input leads to a
    division by zero.
    """
    assert len(actual) == len(pred)
    n_matches = sum(1 for truth, guess in zip(actual, pred) if truth == guess)
    return n_matches / len(actual)
    

In [None]:
def get_top_n(predictions, n=10, min_rating=4.0):
    """Collect each user's highest-estimated businesses.

    Args:
        predictions: iterable of 5-tuples
            ``(user_id, business_id, actual_rating, estimated_rating, extra)``.
        n: maximum number of entries kept per user.
        min_rating: predictions whose estimated rating is not ``>=`` this
            value are dropped.

    Returns:
        ``defaultdict(list)`` mapping user_id to a list of
        ``(business_id, estimated_rating)`` pairs, ordered by estimated
        rating from high to low and cut to ``n`` items. Users with no
        qualifying prediction get no entry.
    """
    per_user = defaultdict(list)

    for uid, bid, _actual, estimate, _extra in predictions:
        if not (estimate >= min_rating):
            continue
        per_user[uid].append((bid, estimate))

    # Only existing keys are reassigned, so the dict size is stable while iterating.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

In [None]:
def get_genres(data):
    """Build a multi-hot category vector for every business in ``data``.

    Each ``business_categories`` value is split on commas. Every distinct
    category name gets an integer ID in order of first appearance, and each
    business is mapped to a 0/1 list with a 1 at the ID of each of its
    categories.

    Category names are stripped of surrounding whitespace so that
    ``"A, B"`` and ``"B,A"`` refer to the same two categories. Empty tokens
    are skipped, and a non-string value (e.g. ``None``/NaN) is treated as
    "no categories".

    Args:
        data: DataFrame with ``business_id`` and ``business_categories``
            columns.

    Returns:
        ``defaultdict(list)`` mapping business_id to its 0/1 list. Every list
        has one slot per distinct category. If a business_id occurs in
        several rows, the last row wins.
    """
    genres = defaultdict(list)
    genre_ids = {}

    # Walk the two columns directly instead of materialising a row per iloc call.
    for business_id, raw_categories in zip(data["business_id"], data["business_categories"]):
        names = raw_categories.split(',') if isinstance(raw_categories, str) else []
        business_genre_ids = []
        for name in names:
            name = name.strip()
            if not name:
                continue
            # First sighting of a name claims the next free ID.
            business_genre_ids.append(genre_ids.setdefault(name, len(genre_ids)))
        genres[business_id] = business_genre_ids

    # Second pass: the vector width is only known once every row has been seen.
    n_genres = len(genre_ids)
    for business_id, business_genre_ids in genres.items():
        bitfield = [0] * n_genres
        for genre_id in business_genre_ids:
            bitfield[genre_id] = 1
        genres[business_id] = bitfield

    return genres

In [None]:
@dispatch(np.ndarray,pd.core.series.Series)
def evaluate(y_pred, y_test):
    """Print regression metrics for predictions against true targets.

    Registered via ``dispatch`` for an ``np.ndarray`` of predictions and a
    ``pd.Series`` of true values.

    Args:
        y_pred: predicted values.
        y_test: true target values.

    Returns:
        None. Explained variance, MAE, MSE and RMSE are printed.
    """
    print('\nLinear Regression Performance Metrics')
    # explained_variance_score is not R^2 (per the scikit-learn docs it does
    # not account for a systematic offset in the predictions), so the label
    # names the metric that is actually computed.
    print('Explained variance:', explained_variance_score(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    # Computed once and reused for the RMSE line.
    mse = mean_squared_error(y_test, y_pred)
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))

In [None]:
@dispatch(object,list)
def evaluate(model, testset):
    """Run ``model`` on ``testset``, print RMSE and MAE, and return the predictions.

    Registered via ``dispatch`` for any model object and a ``list`` test set.

    NOTE(review): ``accuracy`` is not imported in the visible cells; it is
    presumably Surprise's ``accuracy`` module -- confirm it is in scope
    before this function is called.

    Args:
        model: object exposing ``test(testset)``.
        testset: list handed to ``model.test``.

    Returns:
        Whatever ``model.test(testset)`` returned.
    """
    print("\nComputing recommendations...")
    predictions = model.test(testset)

    print("\nEvaluating accuracy of model...")
    # Each label must be paired with the metric of the same name.
    print("RMSE: ", accuracy.rmse(predictions, verbose=False))
    print("MAE: ", accuracy.mae(predictions, verbose=False))
    return predictions

In [None]:
def encode_sentence(s, tokenizer):
    """Tokenize ``s``, append a ``[SEP]`` token, and return the token ids.

    Args:
        s: text handed to ``tokenizer.tokenize``.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        Result of ``tokenizer.convert_tokens_to_ids`` on the token list.
    """
    with_separator = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(with_separator)

def bert_encode(X, tokenizer):
    """Encode the sentences in ``X`` into a dict of dense BERT input tensors.

    Each sentence is encoded with ``encode_sentence`` (which appends
    ``[SEP]``), a ``[CLS]`` id is prepended to every row, and the ragged
    batch is densified with ``to_tensor()``.

    NOTE(review): ``tf`` is not imported in the visible cells; presumably
    TensorFlow -- confirm it is in scope before this function is called.

    Args:
        X: collection of sentences; it is converted with ``np.array`` and
            iterated element by element.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        dict with keys ``'input_word_ids'`` (token ids with the ``[CLS]`` id
        first), ``'input_mask'`` (ones at every token position) and
        ``'input_type_ids'`` (zeros at every token position). Positions added
        by ``to_tensor()`` hold its default fill value.
    """
    sentences = tf.ragged.constant(
        [encode_sentence(s, tokenizer) for s in np.array(X)])

    # One single-element [CLS] row per sentence, so it can be joined on the last axis.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentences.shape[0]
    input_word_ids = tf.concat([cls, sentences], axis=-1)

    # Ones are laid down on the ragged rows before densifying.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    # Single-segment input: every position gets type id 0.
    type_cls = tf.zeros_like(cls)
    type_sentences = tf.zeros_like(sentences)
    input_type_ids = tf.concat([type_cls, type_sentences], axis=-1).to_tensor()

    return {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids,
    }