In [1]:
#set up, main tools= np,pd,tf
import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.logging.set_verbosity(tf.logging.ERROR)
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# Add some convenience functions to Pandas DataFrame.
# Display options first: show at most 10 rows of a frame and format floats
# with two decimals.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.2f}'.format
def mask(df, key, function):
  """Filters `df` by a predicate computed from one of its columns.

  Args:
    df: the dataframe to filter.
    key: name of the column handed to `function`.
    function: callable taking the column `df[key]` and returning the
      selector used to index `df`.

  Returns:
    The result of indexing `df` with `function(df[key])`.
  """
  selector = function(df[key])
  return df[selector]

def flatten_cols(df):
  """Flattens multi-level column labels of `df` into single strings, in place.

  Each tuple label (as produced by a MultiIndex) is turned into its parts
  joined by a single space, with surrounding whitespace stripped. Labels that
  are not tuples are left untouched, so calling this on an already-flat frame
  (e.g. when the cell is re-run) does not mangle the column names.

  Args:
    df: the dataframe whose columns are flattened. It is modified in place.

  Returns:
    The same `df` object, to allow method chaining.
  """
  def _flatten(label):
    # Only tuples come from multi-level columns; joining a plain string would
    # split it into characters ("abc" -> "a b c").
    if isinstance(label, tuple):
      return ' '.join(str(part) for part in label).strip()
    return label

  df.columns = [_flatten(col) for col in df.columns.values]
  return df

# Attach the helpers above as DataFrame methods so they can be called as
# df.mask(key, function) and df.flatten_cols().
# NOTE: pandas already ships a built-in DataFrame.mask; the first assignment
# replaces it with the helper defined above for the rest of the session.
pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

Matplotlib created a temporary config/cache directory at /var/folders/_x/fh3t8wcj3_xbs4fddcxmbpc00000gn/T/matplotlib-sp1lhqhg because the default path (/Users/hansshen/.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
#prelims, just giving a demonstration of the input vector space 
def split_df(df,testing_fraction=0.2,random_state=None):
    """Randomly splits a dataframe into disjoint train and test parts.

    Rows are selected by position rather than by index label, so the split is
    a true partition even when `df` has duplicate index labels (excluding by
    label would remove every row sharing a sampled label from the train set).

    Args:
      df: the dataframe to split.
      testing_fraction: fraction of rows placed in the test set.
      random_state: optional seed / random state forwarded to pandas'
        `sample`, for a reproducible split. Defaults to None (unseeded).

    Returns:
      A (train, test) tuple. `test` holds the sampled rows in sampled order;
      `train` holds all remaining rows in their original order.
    """
    positions=pd.Series(np.arange(len(df)))
    test_positions=positions.sample(
        frac=testing_fraction,replace=False,random_state=random_state).to_numpy()
    # Boolean membership mask over row positions; its complement is the train set.
    in_test=np.zeros(len(df),dtype=bool)
    in_test[test_positions]=True
    test=df.iloc[test_positions]
    train=df.iloc[~in_test]
    return train, test

# Value ranges of the user attributes, and the column names used for the
# user dataframe.
mood = [*range(10)]            # integer levels 0..9
energy_levels = [*range(5)]    # integer levels 0..4
talk_preferences = [0, 1]
internal_journeys = [
    "winding_road",
    "steep_staircase",
    "flowing_river",
    "open_field",
]
user_cols = [
    "mood",
    "energy_level",
    "talk_preference",
    "internal_journey",
]

In [None]:
# Read in the data of our users, which comes from the backend database.
# NOTE(review): `user_info` is only a placeholder string here. pd.DataFrame
# expects actual records (e.g. a list of rows or a dict of columns), so this
# cell is not expected to run until the placeholder is replaced with data
# fetched from the backend — TODO wire up the real source.
user_info="link_to_backend"
# One column per entry of `user_cols`.
df_user=pd.DataFrame(user_info,columns=user_cols)

# Interactive Altair visualization tool, helpful when we have a large user database.
# Displays histograms of the data, sliced by a given attribute.

# First, create a filter: a multi-selection over the "Major" field.
# NOTE(review): "Major" is not one of the `user_cols` defined for the user
# dataframe — confirm which dataframe this chart is meant to be bound to.
major_filter=alt.selection_multi(fields=["Major"])
# Bar chart of record counts per Major. Bars matching the selection are
# coloured by Major (category20 scheme); all other bars are light gray.
# The selection itself is attached to this chart via `properties`.
major_chart=alt.Chart().mark_bar().encode(
    x="count()",
    y=alt.Y("Major:N"),
    color=alt.condition(major_filter,
                        alt.Color("Major:N",scale=alt.Scale(scheme='category20')),
                        alt.value("lightgray"))
).properties(width=300,height=300,selection=major_filter)

# A function that generates a histogram of filtered data.
def filtered_hist(field, label, filter):
  """Builds a two-layer histogram of `field` that reacts to a selection.

  One layer is the histogram of the rows that pass `filter`; the other is the
  histogram of all rows, drawn in light gray at 70% opacity. The two layers
  use independent y scales. No data is bound to the chart here; the caller
  supplies it.

  Args:
    field: the field for which to generate the histogram.
    label: String label used as the x-axis title.
    filter: an alt.Selection object to be used to filter the data.

  Returns:
    The layered Altair chart.
  """
  binned_x = alt.X(field, bin=alt.Bin(maxbins=20), title=label)
  histogram = alt.Chart().mark_bar().encode(x=binned_x, y="count()")
  histogram = histogram.properties(width=300)

  selected_layer = histogram.transform_filter(filter)
  overall_layer = histogram.encode(
      color=alt.value('lightgray'),
      opacity=alt.value(.7))

  layered = alt.layer(selected_layer, overall_layer)
  return layered.resolve_scale(y='independent')

# Lay out side by side: the filtered histogram of 'num_mentor', the filtered
# histogram of 'avg_rating', and the Major selector chart, all bound to the
# same dataframe.
# NOTE(review): `df_mentee` is not defined in the visible cells (the cell
# above builds `df_user`) — confirm which dataframe is intended here.
alt.hconcat(
    filtered_hist('num_mentor', 'num_mentor', major_filter),
    filtered_hist('avg_rating', 'avg_rating', major_filter),
    major_chart,
    data=df_mentee)

In [3]:
#Actual Machine Learning
def sparse_tensor(ratings_df):
    """Builds a tf.SparseTensor of scores from a ratings dataframe.

    Args:
      ratings_df: dataframe with "mentor_id", "mentee_id" and "score" columns.

    Returns:
      A tf.SparseTensor whose indices are the (mentor_id, mentee_id) pairs,
      whose values are the scores, and whose dense shape is
      [num_mentor, num_mentee+1].
    """
    # NOTE(review): num_mentor and num_mentee are read from module scope and
    # are not defined in the visible cells. The +1 on the mentee axis
    # presumably accommodates the id range — TODO confirm.
    id_pairs=ratings_df[["mentor_id","mentee_id"]].values
    scores=ratings_df["score"].tolist()
    return tf.SparseTensor(
        indices=id_pairs,
        values=scores,
        dense_shape=[num_mentor,num_mentee+1])
def sparse_mse_loss(sparse_ratings,mentor_embeddings,mentee_embeddings):
    """Mean squared error between observed ratings and embedding products.

    Args:
      sparse_ratings: a tf.SparseTensor of observed ratings; its `indices`
        select entries of the prediction matrix and its `values` are the
        targets.
      mentor_embeddings: embedding matrix used as the left factor.
      mentee_embeddings: embedding matrix used as the (transposed) right factor.

    Returns:
      The tf.losses.mean_squared_error tensor between the observed values and
      the gathered predictions.
    """
    # Full prediction matrix: mentor_embeddings times mentee_embeddings^T.
    all_scores=tf.matmul(mentor_embeddings, mentee_embeddings, transpose_b=True)
    # Keep only the entries for which a rating was observed.
    observed_scores=tf.gather_nd(all_scores, sparse_ratings.indices)
    return tf.losses.mean_squared_error(sparse_ratings.values, observed_scores)
#collaborative filtering(CF)
class CF_model(object):
    """Collaborative-filtering model trained by minimizing a loss tensor.

    Holds the embedding variables, the loss to optimize and optional metric
    tensors. `train` runs the optimization in a lazily created session and
    stores the evaluated embeddings.
    """
    def __init__(self,embedding_vars,loss,metrics=None):
        """Stores the graph pieces needed for training.

        Args:
          embedding_vars: dict of tf.Variable, keyed by embedding name.
          loss: a float tensor to optimize.
          metrics: optional list of dicts of tensors; each dict is plotted in
            its own subplot during training.
        """
        self._embedding_vars=embedding_vars
        self._loss=loss
        self._metrics=metrics
        self._embeddings={k: None for k in embedding_vars}
        self._session=None

    # Exposed as a read-only property: callers read `model.embeddings` but
    # cannot rebind the attribute.
    @property
    def embeddings(self):
        """Dict of embedding values, evaluated at the end of the last `train` call.

        Every value is None until `train` has been run.
        """
        return self._embeddings

    def train(self,num_iterations=100,
              learning_rate=1,
              plot_results=True,
              optimizer=tf.train.GradientDescentOptimizer):
        """Trains the model and records the metrics.

        Args:
          num_iterations: the training loop runs for i = 0..num_iterations
            inclusive, i.e. num_iterations+1 optimizer steps.
          learning_rate: learning rate passed to the optimizer constructor.
          plot_results: whether to plot the recorded metrics after training.
          optimizer: optimizer class; instantiated as optimizer(learning_rate).

        Returns:
          The metrics evaluated at the last iteration, in the same structure
          as the `metrics` given at construction (or `({},)` when none were
          given).
        """
        with self._loss.graph.as_default():
            opt=optimizer(learning_rate)
            train_op=opt.minimize(self._loss)
            # Initializes only the optimizer's own variables and the local
            # variables; global variables are initialized once, below, when
            # the session is first created.
            local_init_op=tf.group(
                tf.variables_initializer(opt.variables()),
                tf.local_variables_initializer()
            )
            if self._session is None:
                self._session=tf.Session()
                with self._session.as_default():
                    self._session.run(tf.global_variables_initializer())
                    self._session.run(tf.tables_initializer())
                    tf.train.start_queue_runners()
        with self._session.as_default():
            local_init_op.run()
            iterations=[]
            # Fall back to a single empty metrics dict so that session.run and
            # the bookkeeping below also work when no metrics were supplied.
            metrics=self._metrics or ({},)
            # One accumulator per metrics dict. This must iterate the fallback
            # `metrics`, not self._metrics, which may be None.
            metrics_vals=[collections.defaultdict(list) for _ in metrics]
            # Training and appending results
            for i in range(num_iterations+1):
                _,results=self._session.run((train_op,metrics))
                # Record and report every 10th iteration and the final one.
                if (i % 10 == 0) or i == num_iterations:
                    print("\r iteration %d: " % i + ", ".join(
                            ["%s=%f" % (k, v) for r in results for k, v in r.items()]),
                            end='')
                    iterations.append(i)
                    for metric_val, result in zip(metrics_vals, results):
                        for k, v in result.items():
                            metric_val[k].append(v)
            # Snapshot the trained embedding values for the `embeddings` property.
            for k, v in self._embedding_vars.items():
                self._embeddings[k] = v.eval()

            #plot the metrics
            if plot_results:
                # The figure is sized for one more column than there are
                # metrics dicts; each dict gets its own subplot.
                num_subplots=len(metrics)+1
                fig=plt.figure()
                fig.set_size_inches(num_subplots*10,8)
                for i,metric_vals in enumerate(metrics_vals):
                    ax=fig.add_subplot(1,num_subplots,i+1)
                    for k,v in metric_vals.items():
                        ax.plot(iterations,v,label=k)
                    ax.set_xlim([1,num_iterations])
                    ax.legend()
            return results

In [None]:
#regularization
#input: two embedding matrices, output: loss
def gravity(U,V):
    """Gravity regularization term for two embedding matrices.

    Computes 1/(N*M) * sum(U^T U * V^T V), where * is the elementwise product
    and N, M are the static first dimensions of U and V.

    Args:
      U: first embedding matrix.
      V: second embedding matrix.

    Returns:
      A scalar tensor holding the gravity loss.
    """
    scale=1. / (U.shape[0].value*V.shape[0].value)
    gram_u=tf.matmul(U, U, transpose_a=True)
    gram_v=tf.matmul(V, V, transpose_a=True)
    return scale * tf.reduce_sum(gram_u * gram_v)
def regular(U,V):
    """Regularization term: per-row average squared norm of U plus that of V.

    For each matrix, the sum of its squared entries is scaled by one over its
    static first dimension; the two results are added.

    Args:
      U: first embedding matrix.
      V: second embedding matrix.

    Returns:
      A scalar tensor holding the regularization loss.
    """
    def scaled_sq_sum(matrix):
        # Sum of squared entries, scaled by 1 / number of rows.
        return 1/(matrix.shape[0].value)*tf.reduce_sum(matrix*matrix)

    return scaled_sq_sum(U)+scaled_sq_sum(V)
def build_regularized_model(ratings,embedding_dim=3,reg_coef=1,grav_coef=1,init_stddev=0.5):
    """Builds a CF_model whose loss is the train MSE plus two regularizers.

    Args:
      ratings: ratings dataframe; it is split with `split_df` and each part is
        converted with `sparse_tensor`.
      embedding_dim: number of columns of each embedding matrix.
      reg_coef: weight of the `regular` term.
      grav_coef: weight of the `gravity` term.
      init_stddev: standard deviation of the normal initializer of the
        embeddings.

    Returns:
      A CF_model optimizing train error + weighted regularization + weighted
      gravity, with two metric groups: train/test error, and the three
      components of the total loss.
    """
    train_df,test_df=split_df(ratings)
    ratings_train=sparse_tensor(train_df)
    ratings_test=sparse_tensor(test_df)

    def new_embedding(axis):
        # One row per entry along `axis` of the training tensor's dense shape.
        shape=[ratings_train.dense_shape[axis],embedding_dim]
        return tf.Variable(tf.random_normal(shape,stddev=init_stddev))

    U=new_embedding(0)
    V=new_embedding(1)

    mse_train=sparse_mse_loss(ratings_train,U,V)
    mse_test=sparse_mse_loss(ratings_test,U,V)
    l2_term=reg_coef*regular(U,V)
    gravity_term=grav_coef*gravity(U,V)
    objective=mse_train+l2_term+gravity_term

    error_metrics={
        'train_error':mse_train,
        'test_error':mse_test
    }
    component_metrics={
        "observed_loss":mse_train,
        "regularization_loss":l2_term,
        "gravity_loss":gravity_term
    }
    return CF_model(
        {"mentor_id":U,"mentee_id":V},
        objective,
        [error_metrics,component_metrics])
    