Add auto deduction for the augmentation step

KiddoZhu · KiddoZhu · commit 9915a0fabff7 · 2019-10-18T01:48:13.000-04:00
diff --git a/config/template/graph.yaml b/config/template/graph.yaml
@@ -65,9 +65,9 @@ train:
   negative_weight: 5
   # Exponent of degrees in negative sampling. Default is recommended.
   negative_sample_exponent: 0.75
-  # Augmentation step. Need to be tuned on the validation set.
+  # Augmentation step. "auto" deduces it from the graph's edge-to-node ratio, which is usually reasonable.
   # Larger value is needed for sparser graphs.
-  augmentation_step: 5
+  augmentation_step: auto
   # Return parameter and in-out parameters (node2vec). Need to be tuned on the validation set.
   p: 1
   q: 1
diff --git a/doc/source/user/auto.rst b/doc/source/user/auto.rst
@@ -24,5 +24,9 @@ Here lists hyperparameters that support auto deduction.
         num_partition: auto
         episode_size: auto
 
+    train:
+        # for node embedding
+        augmentation_step: auto
+
 .. note::
     The auto value for ``gpus`` is an empty list.
diff --git a/include/bind.h b/include/bind.h
@@ -451,11 +451,11 @@ class pyGraphSolver : public py::class_<graphvite::GraphSolver<dim, Float, Index
 
         def("train", &GraphSolver::train, py::no_gil(),
             py::arg("model") = "LINE", py::arg("num_epoch") = 2000, py::arg("resume") = false,
-            py::arg("augmentation_step") = 5, py::arg("random_walk_length") = 40,
+            py::arg("augmentation_step") = graphvite::kAuto, py::arg("random_walk_length") = 40,
             py::arg("random_walk_batch_size") = 100, py::arg("shuffle_base") = graphvite::kAuto, py::arg("p") = 1,
             py::arg("q") = 1, py::arg("positive_reuse") = 1, py::arg("negative_sample_exponent") = 0.75,
             py::arg("negative_weight") = 5, py::arg("log_frequency") = 1000,
-            "train(model='LINE', num_epoch=2000, resume=False, augmentation_step=5, random_walk_length=40, "
+            "train(model='LINE', num_epoch=2000, resume=False, augmentation_step=auto, random_walk_length=40, "
                   "random_walk_batch_size=100, shuffle_base=auto, p=1, q=1, positive_reuse=1, "
                   "negative_sample_exponent=0.75, negative_weight=5, log_frequency=1000)"
             R"(
diff --git a/include/instance/graph.cuh b/include/instance/graph.cuh
@@ -52,6 +52,8 @@
 
 namespace graphvite {
 
+const int kExpectedDegree = 1600;
+
 /**
  * @brief Normal graphs without attributes
  * @tparam _Index integral type of node indexes
@@ -742,7 +744,7 @@ public:
      * @param _log_frequency log every log_frequency batches
      */
     void train(const std::string &_model = "LINE", int _num_epoch = 2000, bool _resume = false,
-               int _augmentation_step = 5, int _random_walk_length = 40, int _random_walk_batch_size = 100,
+               int _augmentation_step = kAuto, int _random_walk_length = 40, int _random_walk_batch_size = 100,
                int _shuffle_base = kAuto, float _p = 1, float _q = 1, int _positive_reuse = 1,
                float _negative_sample_exponent = 0.75, float _negative_weight = 5, int _log_frequency = 1000) {
         augmentation_step = _augmentation_step;
@@ -752,6 +754,8 @@ public:
         p = _p;
         q = _q;
 
+        if (augmentation_step == kAuto)
+            augmentation_step = log(kExpectedDegree) / log(float(num_edge) / num_vertex);
         if (shuffle_base == kAuto)
             shuffle_base = augmentation_step;
         if (model == "DeepWalk" || model == "node2vec")
diff --git a/python/graphvite/application/application.py b/python/graphvite/application/application.py
@@ -43,13 +43,16 @@ class ApplicationMixin(object):
         dim (int): dimension of embeddings
         gpus (list of int, optional): GPU ids, default is all GPUs
         cpu_per_gpu (int, optional): number of CPU threads per GPU, default is all CPUs
+        gpu_memory_limit (int, optional): memory limit per GPU in bytes, default is all memory
         float_type (dtype, optional): type of parameters
         index_type (dtype, optional): type of graph indexes
     """
-    def __init__(self, dim, gpus=[], cpu_per_gpu=auto, float_type=cfg.float_type, index_type=cfg.index_type):
+    def __init__(self, dim, gpus=[], cpu_per_gpu=auto, gpu_memory_limit=auto,
+                 float_type=cfg.float_type, index_type=cfg.index_type):
         self.dim = dim
         self.gpus = gpus
         self.cpu_per_gpu = cpu_per_gpu
+        self.gpu_memory_limit = gpu_memory_limit
         self.float_type = float_type
         self.index_type = index_type
         self.set_format()
@@ -236,7 +239,8 @@ def get_solver(self, **kwargs):
             num_sampler_per_worker = auto
         else:
             num_sampler_per_worker = self.cpu_per_gpu - 1
-        return solver.GraphSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker)
+        return solver.GraphSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker,
+                                  self.gpu_memory_limit)
 
     def node_classification(self, X=None, Y=None, file_name=None, portions=(0.02,), normalization=False, times=1,
                             patience=100):
@@ -513,7 +517,8 @@ def get_solver(self, **kwargs):
             num_sampler_per_worker = auto
         else:
             num_sampler_per_worker = self.cpu_per_gpu - 1
-        return solver.GraphSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker)
+        return solver.GraphSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker,
+                                  self.gpu_memory_limit)
 
 
 class KnowledgeGraphApplication(ApplicationMixin):
@@ -573,7 +578,8 @@ def get_solver(self, **kwargs):
             num_sampler_per_worker = auto
         else:
             num_sampler_per_worker = self.cpu_per_gpu - 1
-        return solver.KnowledgeGraphSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker)
+        return solver.KnowledgeGraphSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker,
+                                           self.gpu_memory_limit)
 
     def entity_prediction(self, H=None, R=None, T=None, file_name=None, save_file=None, target="tail", k=10,
                           backend=cfg.backend):
@@ -1032,7 +1038,8 @@ def get_solver(self, **kwargs):
         else:
             num_sampler_per_worker = self.cpu_per_gpu - 1
 
-        return solver.VisualizationSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker)
+        return solver.VisualizationSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker,
+                                          self.gpu_memory_limit)
 
     def visualization(self, Y=None, file_name=None, save_file=None, figure_size=10, scale=2):
         """
diff --git a/python/graphvite/cmd.py b/python/graphvite/cmd.py
@@ -176,8 +176,7 @@ def load_data(file_name):
     else:
         labels = None
 
-    gv.init_logging(logging.INFO)
-    # gv.init_logging(logging.WARNING)
+    gv.init_logging(logging.WARNING)
 
     app = gap.VisualizationApplication(args.dim, [0])
     app.load(vectors=vectors, perplexity=args.perplexity)
diff --git a/python/graphvite/dataset.py b/python/graphvite/dataset.py
@@ -886,7 +886,7 @@ def train_feature_data_preprocess(self, save_file):
         numpy_file = os.path.splitext(save_file)[0] + ".npy"
         if os.path.exists(numpy_file):
             return np.load(numpy_file)
-        features = self.image_feature_data(self.train_image, save_file)
+        features = self.image_feature_data(self.train_image)
         np.save(numpy_file, features)
         return features
 
@@ -932,7 +932,7 @@ def valid_feature_data_preprocess(self, save_file):
         numpy_file = os.path.splitext(save_file)[0] + ".npy"
         if os.path.exists(numpy_file):
             return np.load(numpy_file)
-        features = self.image_feature_data(self.valid_image, save_file)
+        features = self.image_feature_data(self.valid_image)
         np.save(numpy_file, features)
         return features