Skip to content

Commit

Permalink
Support collecting worker metrics in collectMetrics command
Browse files Browse the repository at this point in the history
### What changes are proposed in this pull request?
#14966: Support collecting worker metrics in the collectMetrics command.

### Why are the changes needed?
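Previously the collectMetrics command only queried the master's metrics endpoint. This change adds support for collecting worker metrics in the collectMetrics command as well.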

### Does this PR introduce any user facing changes?
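Yes, minor: collectInfo accepts a new `--exclude-worker-metrics` flag, and the per-round metrics files are now named `<command>-master-<n>` and `<command>-worker-<n>` instead of `<command>-<n>`.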

pr-link: #15233
change-id: cid-e83de55f2d1c5e5239a5de9934a8d0fb7a880c3b
  • Loading branch information
ljl1988com committed Apr 6, 2022
1 parent ab82b50 commit 21560be
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 28 deletions.
2 changes: 1 addition & 1 deletion shell/src/main/java/alluxio/cli/bundler/CollectInfo.java
Expand Up @@ -64,7 +64,7 @@
public class CollectInfo extends AbstractShell {
private static final Logger LOG = LoggerFactory.getLogger(CollectInfo.class);
private static final String USAGE =
"collectInfo [--max-threads <threadNum>] [--local] [--help] "
"collectInfo [--max-threads <threadNum>] [--local] [--help] [--exclude-worker-metrics]"
+ "[--exclude-logs <filename-prefixes>] [--include-logs <filename-prefixes>] "
+ "[--additional-logs <filename-prefixes>] [--start-time <datetime>] "
+ "[--end-time <datetime>] COMMAND <outputPath>\n\n"
Expand Down
Expand Up @@ -11,6 +11,7 @@

package alluxio.cli.bundler.command;

import alluxio.client.block.BlockWorkerInfo;
import alluxio.client.file.FileSystemContext;
import alluxio.conf.PropertyKey;
import alluxio.exception.AlluxioException;
Expand All @@ -19,6 +20,8 @@
import alluxio.util.network.HttpUtils;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -28,6 +31,7 @@
import java.io.StringWriter;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;

/**
* Command to probe Alluxio metrics for a few times.
Expand All @@ -40,6 +44,15 @@ public class CollectMetricsCommand extends AbstractCollectInfoCommand {
private static final int COLLECT_METRICS_TIMEOUT = 5 * 1000;
private static final String METRICS_SERVLET_PATH = "/metrics/json/";

/** Long name of the flag that switches off worker metrics collection. */
private static final String EXCLUDE_OPTION_NAME = "exclude-worker-metrics";
/** Optional flag without an argument; when it is set only master metrics are collected. */
private static final Option ONLY_MASTER_OPTION = Option.builder()
    .longOpt(EXCLUDE_OPTION_NAME)
    .hasArg(false)
    .required(false)
    .desc("only collect master metrics\n"
        + "By default collect master metrics and all worker metrics.")
    .build();
/** Options specific to this command; they are aggregated into CollectInfo with reflection. */
public static final Options OPTIONS = new Options().addOption(ONLY_MASTER_OPTION);

/**
* Creates a new instance of {@link CollectMetricsCommand}.
*
Expand All @@ -63,22 +76,77 @@ public int run(CommandLine cl) throws AlluxioException, IOException {
StringWriter outputBuffer = new StringWriter();
for (int i = 0; i < COLLECT_METRICS_TIMES; i++) {
LocalDateTime now = LocalDateTime.now();
String timeMsg = String.format("Collecting metrics at %s", dtf.format(now));
LOG.info(timeMsg);
outputBuffer.write(timeMsg);

// Generate URL from config properties
String masterAddr;
try {
masterAddr = mFsContext.getMasterAddress().getHostName();
} catch (UnavailableException e) {
String noMasterMsg = "No Alluxio master available. Skip metrics collection.";
LOG.warn(noMasterMsg);
outputBuffer.write(noMasterMsg);
break;
String masterMsg = String.format("Collecting master metrics at %s ", dtf.format(now));
LOG.info(masterMsg);
outputBuffer.write(masterMsg);
masterMetrics(outputBuffer, i);
if (!cl.hasOption(EXCLUDE_OPTION_NAME)) {
String workerMsg = String.format("Collecting worker metrics at %s ", dtf.format(now));
LOG.info(workerMsg);
outputBuffer.write(workerMsg);
workerMetrics(outputBuffer, i);
}
String url = String.format("http://%s:%s%s", masterAddr,
mFsContext.getClusterConf().get(PropertyKey.MASTER_WEB_PORT),
// Wait for an interval
SleepUtils.sleepMs(LOG, COLLECT_METRICS_INTERVAL);
}

// TODO(jiacheng): phase 2 consider outputting partial results in a finally block
File outputFile = generateOutputFile(mWorkingDirPath,
String.format("%s.txt", getCommandName()));
FileUtils.writeStringToFile(outputFile, outputBuffer.toString());

return 0;
}

/**
 * Probes the master metrics servlet once and records the outcome.
 *
 * <p>The response, or an error description when the request fails, is appended to the given
 * buffer and also written to a dedicated file for this round. When no master address can be
 * resolved, only a notice is logged and appended to the buffer.
 *
 * @param outputBuffer buffer accumulating the output of all collection rounds
 * @param i index of the current collection round, used in the output file name
 * @throws IOException if writing the per-round output file fails
 */
private void masterMetrics(StringWriter outputBuffer, int i) throws IOException {
  // Resolve the master host first; without it there is nothing to query
  String masterHost;
  try {
    masterHost = mFsContext.getMasterAddress().getHostName();
  } catch (UnavailableException e) {
    String unavailableMsg = "No Alluxio master available. Skip metrics collection.";
    LOG.warn(unavailableMsg);
    outputBuffer.write(unavailableMsg);
    return;
  }

  // Assemble the metrics endpoint from the host and the configured master web port
  String metricsUrl = String.format("http://%s:%s%s", masterHost,
      mFsContext.getClusterConf().get(PropertyKey.MASTER_WEB_PORT),
      METRICS_SERVLET_PATH);
  LOG.info(String.format("Metric address URL: %s", metricsUrl));

  // An HTTP failure can have many causes, so it is recorded in place of the metrics
  // instead of being propagated; the caller's collection loop keeps going
  String payload;
  try {
    payload = getMetricsJson(metricsUrl);
  } catch (Exception e) {
    LOG.error("Failed to get Alluxio master metrics from URL {}. Exception: ", metricsUrl, e);
    payload = String.format("Url: %s%nError: %s", metricsUrl, e.getMessage());
  }

  // Record the result in the aggregate buffer
  outputBuffer.write(payload);
  outputBuffer.write("\n");

  // Also keep the result of this round in its own file
  String roundFileName = String.format("%s-master-%s", getCommandName(), i);
  File roundFile = generateOutputFile(mWorkingDirPath, roundFileName);
  FileUtils.writeStringToFile(roundFile, payload);
}

private void workerMetrics(StringWriter outputBuffer, int i) throws IOException {
// Generate URL from config properties
List<BlockWorkerInfo> workers;
try {
workers = mFsContext.getCachedWorkers();
} catch (UnavailableException e) {
String noWorkerMsg = "No Alluxio workers available. Skip metrics collection.";
LOG.warn(noWorkerMsg);
outputBuffer.write(noWorkerMsg);
return;
}
for (BlockWorkerInfo worker : workers) {
String url = String.format("http://%s:%s%s", worker.getNetAddress().getHost(),
mFsContext.getClusterConf().get(PropertyKey.WORKER_WEB_PORT),
METRICS_SERVLET_PATH);
LOG.info(String.format("Metric address URL: %s", url));

Expand All @@ -89,26 +157,17 @@ public int run(CommandLine cl) throws AlluxioException, IOException {
} catch (Exception e) {
// Do not break the loop since the HTTP failure can be due to many reasons
// Return the error message instead
LOG.error("Failed to get Alluxio metrics from URL {}. Exception: ", url, e);
LOG.error("Failed to get Alluxio worker metrics from URL {}. Exception: ", url, e);
metricsResponse = String.format("Url: %s%nError: %s", url, e.getMessage());
}
outputBuffer.write(metricsResponse);
outputBuffer.write("\n");

// Write to file
File outputFile = generateOutputFile(mWorkingDirPath,
String.format("%s-%s", getCommandName(), i));
String.format("%s-worker-%s", getCommandName(), i));
FileUtils.writeStringToFile(outputFile, metricsResponse);

// Wait for an interval
SleepUtils.sleepMs(LOG, COLLECT_METRICS_INTERVAL);
}

// TODO(jiacheng): phase 2 consider outputting partial results in a finally block
File outputFile = generateOutputFile(mWorkingDirPath,
String.format("%s.txt", getCommandName()));
FileUtils.writeStringToFile(outputFile, outputBuffer.toString());

return 0;
}

@Override
Expand Down

0 comments on commit 21560be

Please sign in to comment.