In [31]:
# Packages
import collections

import numpy as np
import pandas as pd
# NOTE(review): later cells also reference `tf` (TensorFlow 1.x-style API) and
# `plt` (presumably matplotlib.pyplot); neither is imported anywhere in view,
# which is what produces "NameError: name 'tf' is not defined". Import them
# here before running the model cells.

In [32]:
#Import Data

#TODO replace csv with database connection for taken_course_c table
# NOTE(review): absolute, machine-specific path -- the notebook only runs on
# the original author's machine until this is made configurable.
df = pd.read_csv("/Users/andymrkva/git/CourseRecommender/data/cleaned_data/taken_course_c.csv")
print(df.shape)

# Remove grades we are not interested in: keep codes starting with A/B/C/D/F,
# except those starting with 'DNG'. `na=False` makes rows with a missing
# grade_code evaluate to False (i.e. dropped) instead of propagating NaN into
# the mask, which would make the `~` / boolean indexing below raise.
is_letter_grade = df["grade_code"].str.startswith(('A','B','C','D','F'), na=False)
is_dng = df["grade_code"].str.startswith('DNG', na=False)

df = (
    df[is_letter_grade & ~is_dng]
    # Arrange by grade and remove duplicates: after sorting, the first row per
    # (student_id, course_id) pair is the one that is kept.
    .sort_values("grade_code")
    .drop_duplicates(subset=["student_id", "course_id"])
    # Add Taken variable to represent matrix value. `assign` returns a new
    # frame, so no column is written into a filtered slice of the raw data.
    .assign(taken=1)
)

(65980, 5)


In [33]:
# Print the shape of the cleaned frame, then show its first rows as the
# cell's rendered output.
print(df.shape)
df.head()

(53544, 6)


Unnamed: 0,level_id,term_id,course_id,grade_code,student_id,taken
0,GR,2061,115540,A,104387,1
36008,GR,2094,113142,A,101491,1
36007,GR,2091,123947,A,101491,1
36001,GR,2161,113154,A,100896,1
35997,GR,2144,113085,A,100896,1


In [38]:
# Build a tf.SparseTensor for efficiency (probably unnecessary for a data set this small).

def build_taken_sparse_tensor(taken_df):
  """Builds the sparse student x course "taken" matrix.

  The `student_id` and `course_id` values are used directly as row and column
  indices, so they must already be zero-based positions inside the matrix,
  not arbitrary identifiers.

  NOTE(review): the dense shape comes from module-level `students` and
  `courses` objects (only their `.shape[0]` is read). They are not defined in
  the visible part of the notebook -- confirm they exist before this is called.

  Args:
    taken_df: a pd.DataFrame with `student_id`, `course_id` and `taken` columns.
  Returns:
    a tf.SparseTensor representing the courses matrix.
  Raises:
    ValueError: if any (student_id, course_id) pair falls outside
      [0, len(students)) x [0, len(courses)).
  """
  indices = taken_df[['student_id', 'course_id']].values
  values = taken_df['taken'].values
  dense_shape = [students.shape[0], courses.shape[0]]

  # Fail early with a readable message: out-of-range indices would otherwise
  # only surface later, when the tensor is used to index the embeddings.
  out_of_range = (indices < 0) | (indices >= np.asarray(dense_shape))
  if out_of_range.any():
    bad_student, bad_course = indices[out_of_range.any(axis=1)][0]
    raise ValueError(
        "student_id/course_id must be zero-based indices within dense_shape "
        "{}; found (student_id={}, course_id={}). Map raw ids to contiguous "
        "indices first.".format(dense_shape, bad_student, bad_course))

  return tf.SparseTensor(
      indices=indices,
      values=values,
      dense_shape=dense_shape)

In [41]:
#@title Solution
def sparse_mean_square_error(sparse_taken, student_embeddings, course_embeddings):
  """Mean squared error over the observed entries, via the full score matrix.

  Note: a later cell defines a function with this same name (a gather-based
  variant); whichever definition was executed last is the one in effect.

  Args:
    sparse_taken: A SparseTensor taken matrix, of dense_shape [N, M]
    student_embeddings: A dense Tensor U of shape [N, k] where k is the embedding
      dimension, such that U_i is the embedding of student i.
    course_embeddings: A dense Tensor V of shape [M, k] where k is the embedding
      dimension, such that V_j is the embedding of course j.
  Returns:
    A scalar Tensor representing the MSE between the true taken and the
      model's predictions.
  """
  # Score every (student, course) pair: U V^T.
  all_scores = tf.matmul(
      student_embeddings, course_embeddings, transpose_b=True)
  # Keep only the scores at the positions present in the sparse matrix.
  observed_scores = tf.gather_nd(all_scores, sparse_taken.indices)
  return tf.losses.mean_squared_error(sparse_taken.values, observed_scores)

In [44]:
#@title Alternate Solution that is more MEMORY friendly (and probably not needed for this data set)
def sparse_mean_square_error(sparse_taken, student_embeddings, course_embeddings):
  """Mean squared error over the observed entries, without the full product.

  Only the embedding rows referenced by `sparse_taken.indices` are gathered
  and multiplied, so the [N, M] score matrix is never built. This reuses the
  name of the matmul-based version defined in an earlier cell.

  Args:
    sparse_taken: A SparseTensor taken matrix, of dense_shape [N, M]
    student_embeddings: A dense Tensor U of shape [N, k] where k is the embedding
      dimension, such that U_i is the embedding of student i.
    course_embeddings: A dense Tensor V of shape [M, k] where k is the embedding
      dimension, such that V_j is the embedding of course j.
  Returns:
    A scalar Tensor representing the MSE between the true taken and the
      model's predictions.
  """
  # Column 0 of the indices selects student rows, column 1 selects course rows.
  student_rows = tf.gather(student_embeddings, sparse_taken.indices[:, 0])
  course_rows = tf.gather(course_embeddings, sparse_taken.indices[:, 1])
  # Row-wise dot product of the paired embeddings.
  observed_scores = tf.reduce_sum(student_rows * course_rows, axis=1)
  return tf.losses.mean_squared_error(sparse_taken.values, observed_scores)

In [42]:
# Training a Matrix Factorization model
# This is a simple class to train a matrix factorization model using stochastic gradient descent.

# @title CFModel helper class (run this cell)
class CFModel(object):
  """Simple class that represents a collaborative filtering model.

  Wraps a set of embedding variables and a loss tensor. `train` minimizes the
  loss inside a tf.Session that is created on the first call and reused on
  later calls, then stores the evaluated embeddings for access through the
  `embeddings` property.
  """
  def __init__(self, embedding_vars, loss, metrics=None):
    """Initializes a CFModel.
    Args:
      embedding_vars: A dictionary of tf.Variables.
      loss: A float Tensor. The loss to optimize.
      metrics: optional list of dictionaries of Tensors. The metrics in each
        dictionary will be plotted in a separate figure during training.
    """
    self._embedding_vars = embedding_vars
    self._loss = loss
    self._metrics = metrics
    # Filled with the evaluated variables at the end of each `train` call.
    self._embeddings = {k: None for k in embedding_vars}
    # Created lazily by the first `train` call.
    self._session = None

  @property
  def embeddings(self):
    """The embeddings dictionary (values are None until `train` has run)."""
    return self._embeddings

  def train(self, num_iterations=100, learning_rate=1.0, plot_results=True,
            optimizer=tf.train.GradientDescentOptimizer):
    """Trains the model.
    Args:
      num_iterations: number of iterations to run. The loop executes
        `num_iterations + 1` training steps (i = 0 .. num_iterations).
      learning_rate: optimizer learning rate.
      plot_results: whether to plot the results at the end of training.
      optimizer: the optimizer to use. Default to GradientDescentOptimizer.
    Returns:
      The metrics dictionary evaluated at the last iteration.
    """
    with self._loss.graph.as_default():
      opt = optimizer(learning_rate)
      train_op = opt.minimize(self._loss)
      # Initializes the optimizer's own variables plus local variables; run
      # on every `train` call because a new optimizer is created each time.
      local_init_op = tf.group(
          tf.variables_initializer(opt.variables()),
          tf.local_variables_initializer())
      if self._session is None:
        # First call: create the session and initialize global state once.
        self._session = tf.Session()
        with self._session.as_default():
          self._session.run(tf.global_variables_initializer())
          self._session.run(tf.tables_initializer())
          tf.train.start_queue_runners()

    with self._session.as_default():
      local_init_op.run()
      iterations = []
      # Fall back to a single empty metrics dict when none were supplied.
      metrics = self._metrics or ({},)
      # Size the history from `metrics`, not `self._metrics`: the latter is
      # None when no metrics were given, and iterating it raised a TypeError.
      metrics_vals = [collections.defaultdict(list) for _ in metrics]

      # Train and append results.
      for i in range(num_iterations + 1):
        _, results = self._session.run((train_op, metrics))
        # Record and report every 10th step, and always the final one.
        if (i % 10 == 0) or i == num_iterations:
          print("\r iteration %d: " % i + ", ".join(
                ["%s=%f" % (k, v) for r in results for k, v in r.items()]),
                end='')
          iterations.append(i)
          for metric_val, result in zip(metrics_vals, results):
            for k, v in result.items():
              metric_val[k].append(v)

      # Snapshot the trained variables so they can be read without a session.
      for k, v in self._embedding_vars.items():
        self._embeddings[k] = v.eval()

      if plot_results:
        # Plot the metrics.
        num_subplots = len(metrics)+1
        fig = plt.figure()
        fig.set_size_inches(num_subplots*10, 8)
        for i, metric_vals in enumerate(metrics_vals):
          ax = fig.add_subplot(1, num_subplots, i+1)
          for k, v in metric_vals.items():
            ax.plot(iterations, v, label=k)
          ax.set_xlim([1, num_iterations])
          ax.legend()
      return results


NameError: name 'tf' is not defined