diff --git a/VERSION.in b/VERSION.in index bd73f4707..2eb3c4fe4 100644 --- a/VERSION.in +++ b/VERSION.in @@ -1 +1 @@ -0.4 +0.5 diff --git a/cuebot/src/main/java/com/imageworks/spcue/FrameDetail.java b/cuebot/src/main/java/com/imageworks/spcue/FrameDetail.java index 7d33618bf..60c07a030 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/FrameDetail.java +++ b/cuebot/src/main/java/com/imageworks/spcue/FrameDetail.java @@ -37,5 +37,6 @@ public class FrameDetail extends FrameEntity implements FrameInterface { public Timestamp dateStarted; public Timestamp dateStopped; public Timestamp dateUpdated; + public Timestamp dateLLU; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java b/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java index bb964c93f..3b473f8c1 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java +++ b/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java @@ -36,6 +36,8 @@ public class LayerDetail extends LayerEntity implements LayerInterface { public long minimumMemory; public long minimumGpu; public int chunkSize; + public int timeout; + public int timeout_llu; public int dispatchOrder; public int totalFrameCount; @@ -90,6 +92,23 @@ public void setThreadable(boolean isThreadable) { this.isThreadable = isThreadable; } + public int getTimeout() { + return timeout; + } + + public void setTimeout(int timeout) { + this.timeout = timeout; + } + + /** Returns the LLU timeout ({@code timeout_llu}), not the run timeout ({@code timeout}). */ + public int getTimeoutLLU() { + return timeout_llu; + } + + public void setTimeoutLLU(int timeout_llu) { + this.timeout_llu = timeout_llu; + } + public long getMinimumMemory() { return minimumMemory; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java b/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java index 3c84525c8..373877e69 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java +++ b/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java @@ -55,5 +55,9 @@ public class ServiceEntity extends Entity { */ 
public LinkedHashSet tags = new LinkedHashSet(); + public int timeout = 0; + + public int timeout_llu = 0; + } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java index 4a6853c25..e576a0f59 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java @@ -315,6 +315,19 @@ boolean updateFrameStopped(FrameInterface frame, FrameState state, int exitStatu */ ResourceUsage getResourceUsage(FrameInterface f); + /** + * Update Frame usage values for the given frame. The + * frame must be in the Running state. If the frame + * is locked by another thread, the process is aborted because + * we'll most likely get a new update one minute later. + * + * @param f the frame whose usage values are updated + * @param lluTime LLU time in seconds since the epoch + * @throws FrameReservationException if the frame is locked + * by another thread. NOTE(review): the JDBC implementations in this change issue a plain UPDATE by pk_frame and take no lock - confirm. + */ + void updateFrameUsage(FrameInterface f, long lluTime); + /** * Update memory usage values for the given frame. The * frame must be in the Running state. If the frame diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java index e7eb7162f..7843d8e8d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java @@ -281,6 +281,24 @@ public interface LayerDao { */ void updateThreadable(LayerInterface layer, boolean threadable); + /** + * Update a layer's timeout value, which limits how + * long a frame can run on a host, in minutes. + * + * @param layer the layer to update + * @param timeout timeout in minutes; 0 disables it + */ + void updateTimeout(LayerInterface layer, int timeout); + + /** + * Update a layer's LLU timeout value, which limits how + * long a frame can run on a host without updates in the log file. 
+ * + * @param layer the layer to update + * @param timeout_llu LLU timeout in minutes; 0 disables it + */ + void updateTimeoutLLU(LayerInterface layer, int timeout_llu); + /** * Lowers the minimum memory on a layer if the layer * is using less memory and the currnet min memory is diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/FrameDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/FrameDaoJdbc.java index ac36b49ab..fd3a421d1 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/FrameDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/FrameDaoJdbc.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.EnumSet; import java.util.List; +import java.sql.Timestamp; import java.util.Optional; import org.springframework.jdbc.core.RowMapper; @@ -990,6 +991,21 @@ public ResourceUsage getResourceUsage(FrameInterface f) { "pk_frame = ?", RESOURCE_USAGE_MAPPER, f.getFrameId()); } + private static final String UPDATE_FRAME_IO_USAGE = + "UPDATE " + + "frame " + + "SET " + + "ts_updated = current_timestamp," + + "ts_llu = ? " + + "WHERE " + + "pk_frame = ? "; + + @Override + public void updateFrameUsage(FrameInterface f, long lluTime) { + getJdbcTemplate().update(UPDATE_FRAME_IO_USAGE, + new Timestamp(lluTime * 1000l), f.getFrameId()); + } + private static final String UPDATE_FRAME_MEMORY_USAGE = "UPDATE " + "frame " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/LayerDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/LayerDaoJdbc.java index 04cb1b14f..f189af1d9 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/LayerDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/oracle/LayerDaoJdbc.java @@ -624,6 +624,20 @@ public void updateThreadable(LayerInterface layer, boolean threadable) { threadable, layer.getLayerId()); } + @Override + public void updateTimeout(LayerInterface layer, int timeout){ + getJdbcTemplate().update( + "UPDATE layer SET int_timeout=? 
WHERE pk_layer=?", + timeout, layer.getLayerId()); + } + + @Override + public void updateTimeoutLLU(LayerInterface layer, int timeout_llu){ + getJdbcTemplate().update( + "UPDATE layer SET int_timeout_llu=? WHERE pk_layer=?", + timeout_llu, layer.getLayerId()); + } + @Override public void enableMemoryOptimizer(LayerInterface layer, boolean value) { getJdbcTemplate().update( diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java index fb85cdf10..5c043e995 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.EnumSet; import java.util.List; +import java.sql.Timestamp; import java.util.Optional; import org.springframework.jdbc.core.RowMapper; @@ -397,6 +398,7 @@ public FrameDetail mapRow(ResultSet rs, int rowNum) throws SQLException { frame.dateStarted = rs.getTimestamp("ts_started"); frame.dateStopped = rs.getTimestamp("ts_stopped"); frame.dateUpdated = rs.getTimestamp("ts_updated"); + frame.dateLLU = rs.getTimestamp("ts_llu"); frame.version = rs.getInt("int_version"); if (rs.getString("str_host") != null) { @@ -472,9 +474,10 @@ public boolean isOrphan(FrameInterface frame) { "int_number, " + "int_dispatch_order, " + "int_layer_order, "+ - "ts_updated "+ + "ts_updated, "+ + "ts_llu "+ ") " + - "VALUES (?,?,?,?,?,?,?,?,current_timestamp)"; + "VALUES (?,?,?,?,?,?,?,?,current_timestamp,current_timestamp)"; @Override public void insertFrames(LayerDetail layer, List frames) { @@ -692,6 +695,7 @@ public boolean updateFrameState(FrameInterface frame, FrameState state) { "SET " + "str_state=?, " + "ts_updated = current_timestamp, " + + "ts_llu = current_timestamp, " + "int_depend_count = 0, " + "int_version = int_version + 1 " + "WHERE " + @@ -965,6 +969,21 @@ public ResourceUsage 
getResourceUsage(FrameInterface f) { "pk_frame = ?", RESOURCE_USAGE_MAPPER, f.getFrameId()); } + private static final String UPDATE_FRAME_IO_USAGE = + "UPDATE " + + "frame " + + "SET " + + "ts_updated = current_timestamp," + + "ts_llu = ? " + + "WHERE " + + "pk_frame = ? "; + + @Override + public void updateFrameUsage(FrameInterface f, long lluTime) { + getJdbcTemplate().update(UPDATE_FRAME_IO_USAGE, + new Timestamp(lluTime * 1000l), f.getFrameId()); + } + private static final String UPDATE_FRAME_MEMORY_USAGE = "UPDATE " + "frame " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java index c2be2cc1a..26654f392 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java @@ -211,6 +211,8 @@ public LayerDetail mapRow(ResultSet rs, int rowNum) throws SQLException { rs.getString("str_tags").replaceAll(" ", "").split("\\|")); layer.services.addAll( Lists.newArrayList(rs.getString("str_services").split(","))); + layer.timeout = rs.getInt("int_timeout"); + layer.timeout_llu = rs.getInt("int_timeout_llu"); return layer; } }; @@ -310,9 +312,11 @@ public LayerInterface getLayer(String id) { "b_threadable, " + "int_mem_min, " + "int_gpu_min, " + - "str_services " + + "str_services, " + + "int_timeout," + + "int_timeout_llu " + ") " + - "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"; + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"; @Override public void insertLayerDetail(LayerDetail l) { @@ -322,7 +326,8 @@ public void insertLayerDetail(LayerDetail l) { l.range, l.chunkSize, l.dispatchOrder, StringUtils.join(l.tags," | "), l.type.toString(), l.minimumCores, l.maximumCores, l.isThreadable, - l.minimumMemory, l.minimumGpu, StringUtils.join(l.services,",")); + l.minimumMemory, l.minimumGpu, StringUtils.join(l.services,","), + l.timeout, l.timeout_llu); } @Override @@ -623,6 
+628,20 @@ public void updateThreadable(LayerInterface layer, boolean threadable) { threadable, layer.getLayerId()); } + @Override + public void updateTimeout(LayerInterface layer, int timeout){ + getJdbcTemplate().update( + "UPDATE layer SET int_timeout=? WHERE pk_layer=?", + timeout, layer.getLayerId()); + } + + @Override + public void updateTimeoutLLU(LayerInterface layer, int timeout_llu){ + getJdbcTemplate().update( + "UPDATE layer SET int_timeout_llu=? WHERE pk_layer=?", + timeout_llu, layer.getLayerId()); + } + @Override public void enableMemoryOptimizer(LayerInterface layer, boolean value) { getJdbcTemplate().update( diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java index bc41fb038..b31d9ade0 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java @@ -63,6 +63,8 @@ public ServiceEntity mapRow(ResultSet rs, int rowNum) throws SQLException { s.minGpu = rs.getLong("int_gpu_min"); s.threadable = rs.getBoolean("b_threadable"); s.tags = splitTags(rs.getString("str_tags")); + s.timeout = rs.getInt("int_timeout"); + s.timeout_llu = rs.getInt("int_timeout_llu"); return s; } }; @@ -81,6 +83,8 @@ public ServiceOverrideEntity mapRow(ResultSet rs, int rowNum) s.threadable = rs.getBoolean("b_threadable"); s.tags = splitTags(rs.getString("str_tags")); s.showId = rs.getString("pk_show"); + s.timeout = rs.getInt("int_timeout"); + s.timeout_llu = rs.getInt("int_timeout_llu"); return s; } }; @@ -94,7 +98,9 @@ public ServiceOverrideEntity mapRow(ResultSet rs, int rowNum) "service.int_cores_max," + "service.int_mem_min," + "service.int_gpu_min," + - "service.str_tags " + + "service.str_tags, " + + "service.int_timeout, " + + "service.int_timeout_llu " + "FROM " + "service "; @@ -114,7 +120,9 @@ public ServiceEntity get(String id) { 
"show_service.int_cores_max, "+ "show_service.int_mem_min," + "show_service.int_gpu_min," + - "show_service.str_tags, " + + "show_service.str_tags," + + "show_service.int_timeout," + + "show_service.int_timeout_llu," + "show.pk_show " + "FROM " + "show_service," + @@ -160,8 +168,10 @@ public boolean isOverridden(String service, String show) { "int_cores_max, "+ "int_mem_min," + "int_gpu_min," + - "str_tags" + - ") VALUES (?,?,?,?,?,?,?,?)"; + "str_tags," + + "int_timeout," + + "int_timeout_llu " + + ") VALUES (?,?,?,?,?,?,?,?,?,?)"; @Override public void insert(ServiceEntity service) { @@ -169,7 +179,8 @@ public void insert(ServiceEntity service) { getJdbcTemplate().update(INSERT_SERVICE, service.id, service.name, service.threadable, service.minCores, service.maxCores, service.minMemory, service.minGpu, - StringUtils.join(service.tags.toArray(), " | ")); + StringUtils.join(service.tags.toArray(), " | "), + service.timeout, service.timeout_llu); } private static final String INSERT_SERVICE_WITH_SHOW = @@ -184,8 +195,10 @@ public void insert(ServiceEntity service) { "int_cores_max," + "int_mem_min," + "int_gpu_min," + - "str_tags " + - ") VALUES (?,?,?,?,?,?,?,?,?)"; + "str_tags," + + "int_timeout," + + "int_timeout_llu " + + ") VALUES (?,?,?,?,?,?,?,?,?,?,?)"; @Override public void insert(ServiceOverrideEntity service) { @@ -193,7 +206,8 @@ public void insert(ServiceOverrideEntity service) { getJdbcTemplate().update(INSERT_SERVICE_WITH_SHOW, service.id, service.showId, service.name, service.threadable, service.minCores, service.maxCores, service.minMemory, - service.minGpu, joinTags(service.tags)); + service.minGpu, joinTags(service.tags), + service.timeout, service.timeout_llu); } private static final String UPDATE_SERVICE = @@ -206,7 +220,9 @@ public void insert(ServiceOverrideEntity service) { "int_cores_max=?,"+ "int_mem_min=?," + "int_gpu_min=?," + - "str_tags=? " + + "str_tags=?," + + "int_timeout=?," + + "int_timeout_llu=? 
" + "WHERE " + "pk_service = ?"; @@ -215,7 +231,7 @@ public void update(ServiceEntity service) { getJdbcTemplate().update(UPDATE_SERVICE, service.name, service.threadable, service.minCores, service.maxCores, service.minMemory, service.minGpu, joinTags(service.tags), - service.getId()); + service.timeout, service.timeout_llu, service.getId()); } private static final String UPDATE_SERVICE_WITH_SHOW = @@ -228,7 +244,9 @@ service.minMemory, service.minGpu, joinTags(service.tags), "int_cores_max=?," + "int_mem_min=?," + "int_gpu_min=?," + - "str_tags=? " + + "str_tags=?," + + "int_timeout=?," + + "int_timeout_llu=? " + "WHERE " + "pk_show_service = ?"; @@ -237,7 +255,7 @@ public void update(ServiceOverrideEntity service) { getJdbcTemplate().update(UPDATE_SERVICE_WITH_SHOW, service.name, service.threadable, service.minCores, service.maxCores, service.minMemory, service.minGpu, joinTags(service.tags), - service.getId()); + service.timeout, service.timeout_llu, service.getId()); } @Override diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java index 71ab45af1..687e91ecd 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java @@ -1246,7 +1246,9 @@ public Layer mapRow(ResultSet rs, int rowNum) throws SQLException { replaceAll(" ","").split("\\|"))) .addAllServices(Arrays.asList(SqlUtil.getString(rs,"str_services").split(","))) .addAllLimits(Arrays.asList(SqlUtil.getString(rs,"str_limit_names").split(","))) - .setMemoryOptimizerEnabled(rs.getBoolean("b_optimize")); + .setMemoryOptimizerEnabled(rs.getBoolean("b_optimize")) + .setTimeout(rs.getInt("int_timeout")) + .setTimeoutLlu(rs.getInt("int_timeout_llu")); LayerStats.Builder statsBuilder = LayerStats.newBuilder() .setReservedCores(Convert.coreUnitsToCores(rs.getInt("int_cores"))) @@ -1410,6 
+1412,8 @@ public Service mapRow(ResultSet rs, int rowNum) throws SQLException { .setMinGpu(rs.getInt("int_gpu_min")) .addAllTags(Lists.newArrayList(ServiceDaoJdbc.splitTags( SqlUtil.getString(rs,"str_tags")))) + .setTimeout(rs.getInt("int_timeout")) + .setTimeoutLlu(rs.getInt("int_timeout_llu")) .build(); } }; @@ -1427,6 +1431,8 @@ public ServiceOverride mapRow(ResultSet rs, int rowNum) throws SQLException { .setMinGpu(rs.getInt("int_gpu_min")) .addAllTags(Lists.newArrayList(ServiceDaoJdbc.splitTags( SqlUtil.getString(rs,"str_tags")))) + .setTimeout(rs.getInt("int_timeout")) + .setTimeoutLlu(rs.getInt("int_timeout_llu")) .build(); return ServiceOverride.newBuilder() .setId(SqlUtil.getString(rs,"pk_show_service")) @@ -1505,6 +1511,7 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "frame.int_dispatch_order,"+ "frame.ts_started,"+ "frame.ts_stopped,"+ + "frame.ts_llu,"+ "frame.int_retries,"+ "frame.str_state,"+ "frame.str_host,"+ @@ -1589,6 +1596,7 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "frame.str_state,"+ "frame.str_host,"+ "frame.int_cores,"+ + "frame.ts_llu,"+ "COALESCE(proc.int_mem_max_used, frame.int_mem_max_used) AS int_mem_max_used," + "COALESCE(proc.int_mem_used, frame.int_mem_used) AS int_mem_used " + "FROM "+ @@ -1988,7 +1996,9 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "service.int_cores_max," + "service.int_mem_min," + "service.int_gpu_min," + - "service.str_tags " + + "service.str_tags," + + "service.int_timeout," + + "service.int_timeout_llu " + "FROM "+ "service "; @@ -2001,7 +2011,9 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "show_service.int_cores_max," + "show_service.int_mem_min," + "show_service.int_gpu_min," + - "show_service.str_tags " + + "show_service.str_tags," + + "show_service.int_timeout," + + "show_service.int_timeout_llu " + "FROM "+ "show_service, " + "show " + @@ -2118,6 +2130,7 @@ public Show mapRow(ResultSet rs, int rowNum) 
throws SQLException { "frame.int_dispatch_order,"+ "frame.ts_started,"+ "frame.ts_stopped,"+ + "frame.ts_llu,"+ "frame.int_retries,"+ "frame.str_state,"+ "frame.str_host,"+ diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java index 533066881..ebdd5082d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java @@ -400,6 +400,14 @@ List findNextDispatchFrames(LayerInterface layer, VirtualProc pro */ void clearFrame(DispatchFrame frame); + /** + * Update usage data for the given frame. + * + * @param frame + * @param lluTime + */ + void updateFrameUsage(FrameInterface frame, long lluTime); + /** * Update memory usage data for the given frame. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index 739afc468..3e3d82b2f 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -532,6 +532,22 @@ public void updateProcMemoryUsage(FrameInterface frame, long rss, long maxRss, procDao.updateProcMemoryUsage(frame, rss, maxRss, vsize, maxVsize); } + @Override + @Transactional(propagation = Propagation.REQUIRED) + public void updateFrameUsage(FrameInterface frame, long lluTime) { + + try { + frameDao.updateFrameUsage(frame, lluTime); + } + catch (FrameReservationException ex) { + // Eat this, the frame was not in the correct state or + // was locked by another thread. The only reason it would + // be locked by another thread would be if the state is + // changing. 
+ logger.warn("failed to update io stats for frame: " + frame); + } + } + @Override @Transactional(propagation = Propagation.REQUIRED) public void updateFrameMemoryUsage(FrameInterface frame, long rss, long maxRss) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java index 206181fb6..fe2482720 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java @@ -19,6 +19,7 @@ package com.imageworks.spcue.dispatcher; +import java.sql.Timestamp; import java.util.EnumSet; import java.util.Random; import java.util.concurrent.atomic.AtomicLong; @@ -30,6 +31,7 @@ import com.imageworks.spcue.DispatchHost; import com.imageworks.spcue.DispatchJob; import com.imageworks.spcue.JobDetail; +import com.imageworks.spcue.LayerDetail; import com.imageworks.spcue.LayerInterface; import com.imageworks.spcue.Source; import com.imageworks.spcue.VirtualProc; @@ -132,10 +134,9 @@ public void handleFrameCompleteReport(final FrameCompleteReport report) { } final DispatchJob job = jobManager.getDispatchJob(proc.getJobId()); - final DispatchFrame frame = jobManager.getDispatchFrame( - report.getFrame().getFrameId()); - final FrameState newFrameState = determineFrameState(job, - frame, report); + final LayerDetail layer = jobManager.getLayerDetail(report.getFrame().getLayerId()); + final DispatchFrame frame = jobManager.getDispatchFrame(report.getFrame().getFrameId()); + final FrameState newFrameState = determineFrameState(job, layer, frame, report); if (dispatchSupport.stopFrame(frame, newFrameState, report.getExitStatus(), report.getFrame().getMaxRss())) { @@ -228,7 +229,7 @@ public void handlePostFrameCompleteOperations(VirtualProc proc, try { /* - * The default behavior is to keep the proc on the same. 
+ * The default behavior is to keep the proc on the same job. */ boolean unbookProc = proc.unbooked; @@ -513,7 +514,7 @@ else if (report.getHost().getNimbyLocked()) { * @param report * @return */ - public static final FrameState determineFrameState(DispatchJob job, DispatchFrame frame, FrameCompleteReport report) { + public static final FrameState determineFrameState(DispatchJob job, LayerDetail layer, DispatchFrame frame, FrameCompleteReport report) { if (EnumSet.of(FrameState.WAITING, FrameState.EATEN).contains( frame.state)) { @@ -528,6 +529,9 @@ else if (frame.state.equals(FrameState.DEAD)) { } } else if (report.getExitStatus() != 0) { + long r = System.currentTimeMillis() / 1000; + long lastUpdate = (r - report.getFrame().getLluTime()) / 60; + FrameState newState = FrameState.WAITING; if (report.getExitStatus() == FrameExitStatus.SKIP_RETRY_VALUE || (job.maxRetries != 0 && report.getExitSignal() == 119)) { @@ -535,6 +539,11 @@ else if (frame.state.equals(FrameState.DEAD)) { newState = FrameState.WAITING; } else if (job.autoEat) { newState = FrameState.EATEN; + // ETC Time out and LLU timeout + } else if (layer.timeout_llu != 0 && report.getFrame().getLluTime() != 0 && lastUpdate > (layer.timeout_llu -1)) { + newState = FrameState.DEAD; + } else if (layer.timeout != 0 && report.getRunTime() > layer.timeout * 60) { + newState = FrameState.DEAD; } else if (report.getRunTime() > Dispatcher.FRAME_TIME_NO_RETRY) { newState = FrameState.DEAD; } else if (frame.retries >= job.maxRetries) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index 83d7342de..815689bb4 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -36,6 +36,7 @@ import com.imageworks.spcue.FrameInterface; import com.imageworks.spcue.JobEntity; import 
com.imageworks.spcue.LayerEntity; +import com.imageworks.spcue.LayerDetail; import com.imageworks.spcue.LocalHostAssignment; import com.imageworks.spcue.Source; import com.imageworks.spcue.VirtualProc; @@ -203,6 +204,17 @@ public void handleHostReport(HostReport report, boolean isBoot) { */ updateMemoryUsage(report.getFramesList()); + /* + * Updates the LLU time of the + * frames listed in this report. + */ + updateFrameUsage(report.getFramesList()); + + /* + * kill frames that have over run. + */ + killTimedOutFrames(report); + /* * Increase/decreased reserved memory. */ @@ -505,6 +517,68 @@ private void handleMemoryReservations(final DispatchHost host, final HostReport } } + /** + * Queue RQD kills for reported frames that exceed their layer's timeout or LLU timeout (both in minutes; 0 disables the check). + * + * @param report host report whose running frames are checked + */ + private void killTimedOutFrames(HostReport report) { + + final Map layers = new HashMap(5); + + for (RunningFrameInfo frame: report.getFramesList()) { + String layerId = frame.getLayerId(); + LayerDetail layer = layerDao.getLayerDetail(layerId); + long runtimeMinutes = ((System.currentTimeMillis() - frame.getStartTime()) / 1000l) / 60; + + if (layer.timeout != 0 && runtimeMinutes > layer.timeout){ + try { + killQueue.execute(new DispatchRqdKillFrame(report.getHost().getName(), + frame.getFrameId(), + "This frame has reached it timeout.", + rqdClient)); + } catch (TaskRejectedException e) { + logger.warn("Unable to queue RQD kill, task rejected, " + e); + } + } + + if (layer.timeout_llu == 0){ + continue; + } + + if (frame.getLluTime() == 0){ + continue; + } + + long r = System.currentTimeMillis() / 1000; + long lastUpdate = (r - frame.getLluTime()) / 60; + + if (layer.timeout_llu != 0 && lastUpdate > (layer.timeout_llu -1)){ + try { + killQueue.execute(new DispatchRqdKillFrame(report.getHost().getName(), + frame.getFrameId(), + "This frame has reached it LLU timeout.", + rqdClient)); + } catch (TaskRejectedException e) { + logger.warn("Unable to queue RQD kill, task rejected, " + e); + } + } + } + } + + /** + * Update IO usage for 
the given list of frames. + * + * @param rFrames + */ + private void updateFrameUsage(List rFrames) { + + for (RunningFrameInfo rf: rFrames) { + FrameInterface frame = jobManager.getFrame(rf.getFrameId()); + dispatchSupport.updateFrameUsage(frame, rf.getLluTime()); + } + } + /** * Update memory usage for the given list of frames. * @@ -708,7 +782,7 @@ public void verifyRunningFrameInfo(HostReport report) { } catch (Exception e) { CueExceptionUtil.logStackTrace("failed", e); logger.warn("failed to verify " + - runningFrame.getJobName() +"/" + + runningFrame.getJobName() + "/" + runningFrame.getFrameName() + " was running but the frame was " + " unable to be killed, " + e); diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageLayer.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageLayer.java index 9a2a73c97..126639a1b 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageLayer.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageLayer.java @@ -102,6 +102,10 @@ import com.imageworks.spcue.grpc.job.LayerSetTagsResponse; import com.imageworks.spcue.grpc.job.LayerSetThreadableRequest; import com.imageworks.spcue.grpc.job.LayerSetThreadableResponse; +import com.imageworks.spcue.grpc.job.LayerSetTimeoutRequest; +import com.imageworks.spcue.grpc.job.LayerSetTimeoutResponse; +import com.imageworks.spcue.grpc.job.LayerSetTimeoutLLURequest; +import com.imageworks.spcue.grpc.job.LayerSetTimeoutLLUResponse; import com.imageworks.spcue.grpc.job.LayerStaggerFramesRequest; import com.imageworks.spcue.grpc.job.LayerStaggerFramesResponse; import com.imageworks.spcue.grpc.limit.Limit; @@ -351,6 +355,22 @@ public void setThreadable(LayerSetThreadableRequest request, StreamObserver responseObserver) { + updateLayer(request.getLayer()); + layerDao.updateTimeout(layer, request.getTimeout()); + responseObserver.onNext(LayerSetTimeoutResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + + @Override + public void 
setTimeoutLLU(LayerSetTimeoutLLURequest request, StreamObserver responseObserver) { + updateLayer(request.getLayer()); + layerDao.updateTimeoutLLU(layer, request.getTimeoutLlu()); + responseObserver.onNext(LayerSetTimeoutLLUResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + @Override public void addLimit(LayerAddLimitRequest request, StreamObserver responseObserver) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java index 6e70fc5e9..eae767006 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java @@ -58,6 +58,8 @@ public void createService(ServiceCreateServiceRequest request, service.minGpu = request.getData().getMinGpu(); service.tags = Sets.newLinkedHashSet(request.getData().getTagsList()); service.threadable = request.getData().getThreadable(); + service.timeout = request.getData().getTimeout(); + service.timeout_llu = request.getData().getTimeoutLlu(); serviceManager.createService(service); responseObserver.onNext(ServiceCreateServiceResponse.newBuilder() .setService(whiteboard.getService(service.getId())) @@ -130,6 +132,8 @@ private ServiceEntity toServiceEntity(Service service) { entity.minGpu = service.getMinGpu(); entity.tags = new LinkedHashSet<> (service.getTagsList()); entity.threadable = service.getThreadable(); + entity.timeout = service.getTimeout(); + entity.timeout_llu = service.getTimeoutLlu(); return entity; } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java index e2de68220..bd90575b5 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java @@ -70,6 +70,8 @@ private ServiceEntity 
toServiceEntity(Service service) { entity.minGpu = service.getMinGpu(); entity.tags = new LinkedHashSet<>(service.getTagsList()); entity.threadable = service.getThreadable(); + entity.timeout = service.getTimeout(); + entity.timeout_llu = service.getTimeoutLlu(); return entity; } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java b/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java index d309fa264..d553456fd 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java @@ -403,6 +403,15 @@ private void handleLayerTags(BuildableJob buildableJob, Element jobTag) { buildableLayer); determineMinimumGpu(buildableJob, layerTag, layer); + // set a timeout value on the layer + if (layerTag.getChildTextTrim("timeout") != null) { + layer.timeout = Integer.parseInt(layerTag.getChildTextTrim("timeout")); + } + + if (layerTag.getChildTextTrim("timeout_llu") != null) { + layer.timeout_llu = Integer.parseInt(layerTag.getChildTextTrim("timeout_llu")); + } + /* * Handle the layer environment */ @@ -671,6 +680,8 @@ private void determineResourceDefaults(Element layerTag, layer.tags.addAll(primaryService.tags); layer.services.addAll(services); layer.limits.addAll(limits); + layer.timeout = primaryService.timeout; + layer.timeout_llu = primaryService.timeout_llu; } /** diff --git a/cuebot/src/main/resources/conf/ddl/postgres/migrations/V7__Add_layer_service_timeout.sql b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V7__Add_layer_service_timeout.sql new file mode 100644 index 000000000..4e0733996 --- /dev/null +++ b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V7__Add_layer_service_timeout.sql @@ -0,0 +1,13 @@ + +-- Add timeout + +ALTER TABLE show_service ADD COLUMN int_timeout INT DEFAULT 0 NOT NULL; +ALTER TABLE service ADD COLUMN int_timeout INT DEFAULT 0 NOT NULL; +ALTER TABLE layer ADD COLUMN int_timeout INT DEFAULT 0 NOT NULL; + +-- Add LLU 
timeout + +ALTER TABLE frame ADD COLUMN ts_llu TIMESTAMP (6) WITH TIME ZONE; +ALTER TABLE show_service ADD COLUMN int_timeout_llu INT DEFAULT 0 NOT NULL; +ALTER TABLE service ADD COLUMN int_timeout_llu INT DEFAULT 0 NOT NULL; +ALTER TABLE layer ADD COLUMN int_timeout_llu INT DEFAULT 0 NOT NULL; \ No newline at end of file diff --git a/cuebot/src/main/resources/public/dtd/cjsl-1.10.dtd b/cuebot/src/main/resources/public/dtd/cjsl-1.10.dtd new file mode 100644 index 000000000..19a5ece29 --- /dev/null +++ b/cuebot/src/main/resources/public/dtd/cjsl-1.10.dtd @@ -0,0 +1,94 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java index 621ec1504..fee824fc1 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java @@ -67,6 +67,8 @@ public void testInsertService() { ServiceEntity s = new ServiceEntity(); s.name = "dillweed"; s.minCores = 100; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = CueUtil.GB4; s.minGpu = CueUtil.GB; s.threadable = false; @@ -83,6 +85,8 @@ public void testUpdateService() { ServiceEntity s = new ServiceEntity(); s.name = "dillweed"; s.minCores = 100; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = CueUtil.GB4; s.minGpu = CueUtil.GB; s.threadable = false; @@ -93,6 +97,8 @@ public void testUpdateService() { s.name = "smacktest"; s.minCores = 200; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = CueUtil.GB8; s.minGpu = CueUtil.GB2; s.threadable = true; @@ -116,6 +122,8 @@ public void testDeleteService() { ServiceEntity s = new ServiceEntity(); s.name = "dillweed"; s.minCores = 100; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = CueUtil.GB4; 
s.minGpu = CueUtil.GB; s.threadable = false; @@ -138,6 +146,8 @@ public void testInsertServiceOverride() { ServiceOverrideEntity s = new ServiceOverrideEntity(); s.name = "dillweed"; s.minCores = 100; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = CueUtil.GB4; s.minGpu = CueUtil.GB; s.threadable = false; @@ -155,6 +165,8 @@ public void testUpdateServiceOverride() { ServiceOverrideEntity s = new ServiceOverrideEntity(); s.name = "dillweed"; s.minCores = 100; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = CueUtil.GB4; s.minGpu = CueUtil.GB2; s.threadable = false; @@ -167,6 +179,8 @@ public void testUpdateServiceOverride() { s.name = "smacktest"; s.minCores = 200; + s.timeout = 10; + s.timeout_llu = 10; s.minMemory = CueUtil.GB8; s.minGpu = CueUtil.GB4; s.threadable = true; @@ -178,6 +192,8 @@ public void testUpdateServiceOverride() { assertEquals(s.name, s1.name); assertEquals(s.minCores, s1.minCores); + assertEquals(s.timeout, s1.timeout); + assertEquals(s.timeout_llu, s1.timeout_llu); assertEquals(s.minMemory, s1.minMemory); assertEquals(s.minGpu, s1.minGpu); assertEquals(s.threadable, s1.threadable); @@ -191,6 +207,8 @@ public void testDeleteServiceOverride() { ServiceOverrideEntity s = new ServiceOverrideEntity(); s.name = "dillweed"; s.minCores = 100; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = CueUtil.GB4; s.minGpu = CueUtil.GB; s.threadable = false; diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java index fe5514efb..99449337b 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java @@ -324,6 +324,8 @@ public void getServiceOverride() { ServiceOverrideEntity s = new ServiceOverrideEntity(); s.name = "test"; s.minCores = 100; + s.timeout = 0; + s.timeout_llu = 0; s.minMemory = 320000; 
s.tags.add("general"); s.threadable = false; diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java index 7aeeadb33..3573cbe59 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java @@ -86,6 +86,8 @@ public void testCreateService() { s.minMemory = CueUtil.GB4; s.minGpu = CueUtil.GB2; s.threadable = false; + s.timeout = 0; + s.timeout_llu = 0; s.tags.addAll(Sets.newHashSet("general")); serviceManager.createService(s); @@ -100,6 +102,8 @@ public void testOverrideExistingService() { ServiceOverrideEntity s = new ServiceOverrideEntity(); s.name = "arnold"; s.minCores = 400; + s.timeout = 10; + s.timeout_llu = 10; s.minMemory = CueUtil.GB8; s.minGpu = CueUtil.GB2; s.threadable = false; @@ -111,6 +115,8 @@ public void testOverrideExistingService() { ServiceEntity newService = serviceManager.getService("arnold", s.showId); assertEquals(s, newService); assertEquals(400, newService.minCores); + assertEquals(10, newService.timeout); + assertEquals(10, newService.timeout_llu); assertEquals(CueUtil.GB8, newService.minMemory); assertEquals(CueUtil.GB2, newService.minGpu); assertFalse(newService.threadable); diff --git a/cuebot/src/test/resources/conf/dtd/cjsl-1.10.dtd b/cuebot/src/test/resources/conf/dtd/cjsl-1.10.dtd new file mode 100644 index 000000000..4be16aa98 --- /dev/null +++ b/cuebot/src/test/resources/conf/dtd/cjsl-1.10.dtd @@ -0,0 +1,94 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cuebot/src/test/resources/conf/dtd/cjsl-1.9.dtd b/cuebot/src/test/resources/conf/dtd/cjsl-1.9.dtd new file mode 100644 index 000000000..c945f948f --- /dev/null +++ b/cuebot/src/test/resources/conf/dtd/cjsl-1.9.dtd @@ -0,0 +1,92 
@@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cuegui/cuegui/LayerDialog.py b/cuegui/cuegui/LayerDialog.py index 8364dda01..a091cdba2 100644 --- a/cuegui/cuegui/LayerDialog.py +++ b/cuegui/cuegui/LayerDialog.py @@ -154,6 +154,20 @@ def __init__(self, layers, parent=None): self.__thread = QtWidgets.QCheckBox(self) self.__thread.setChecked(self.getThreading()) + # Timeout + self.__timeout = QtWidgets.QSpinBox(self) + self.__timeout.setRange(0, 4320) + self.__timeout.setSingleStep(1) + self.__timeout.setSuffix(" minutes") + self.__timeout.setSpecialValueText("No timeout") + + # Timeout LLU + self.__timeout_llu = QtWidgets.QSpinBox(self) + self.__timeout_llu.setRange(0, 4320) + self.__timeout_llu.setSingleStep(1) + self.__timeout_llu.setSuffix(" minutes") + self.__timeout_llu.setSpecialValueText("No timeout") + # Memory Optimizer self.__mem_opt = QtWidgets.QCheckBox() self.__mem_opt.setChecked(self.getMemoryOptSetting()) @@ -194,6 +208,8 @@ def __init__(self, layers, parent=None): self.__gpu.slider.setValue(self.getMaxGpu()) self.__core.setValue(self.getMinCores()) self.__max_cores.setValue(self.getMaxCores()) + self.__timeout.setValue(self.getTimeout()) + self.__timeout_llu.setValue(self.getTimeoutLLU()) QtWidgets.QVBoxLayout(self) @@ -222,6 +238,14 @@ def __init__(self, layers, parent=None): self.__gpu, False), multiSelect)) + layout.addWidget(EnableableItem(LayerPropertiesItem("Timeout:", + self.__timeout, + False), + multiSelect)) + layout.addWidget(EnableableItem(LayerPropertiesItem("Timeout LLU:", + self.__timeout_llu, + False), + multiSelect)) layout.addStretch() self.__group.setLayout(layout) @@ -277,7 +301,10 @@ def apply(self): layer.setThreadable(self.__thread.isChecked()) if self.__gpu.isEnabled(): layer.setMinGpu(self.__gpu.slider.value() * self.gpu_tick_kb) - + if self.__timeout.isEnabled(): + layer.setTimeout(self.__timeout.value()) + 
if self.__timeout_llu.isEnabled(): + layer.setTimeoutLLU(self.__timeout_llu.value()) if self.__tags.isEnabled(): self.__tags.apply() if self.__limits.isEnabled(): @@ -316,6 +343,20 @@ def getThreading(self): break return result + def getTimeout(self): + result = 0 + for layer in self.__layers: + if layer.data.timeout > result: + result = layer.data.timeout + return result + + def getTimeoutLLU(self): + result = 0 + for layer in self.__layers: + if layer.data.timeout_llu > result: + result = layer.data.timeout_llu + return result + def getMemoryOptSetting(self): result = False for layer in self.__layers: diff --git a/cuegui/cuegui/LayerMonitorTree.py b/cuegui/cuegui/LayerMonitorTree.py index 09c012334..19f21b78e 100644 --- a/cuegui/cuegui/LayerMonitorTree.py +++ b/cuegui/cuegui/LayerMonitorTree.py @@ -126,7 +126,14 @@ def __init__(self, parent): data=lambda layer: layer.percentCompleted(), sort=lambda layer: layer.percentCompleted(), tip="Progress for the Layer") - + self.addColumn("Timeout", 45, id=20, + data=lambda layer: cuegui.Utils.secondsToHHHMM(layer.data.timeout*60), + sort=lambda layer: layer.data.timeout, + tip="Timeout for the frames, Hours:Minutes") + self.addColumn("Timeout LLU", 45, id=21, + data=lambda layer: cuegui.Utils.secondsToHHHMM(layer.data.timeout_llu*60), + sort=lambda layer: layer.data.timeout_llu, + tip="Timeout for a frames\' LLU, Hours:Minutes") cuegui.AbstractTreeWidget.AbstractTreeWidget.__init__(self, parent) self.itemDoubleClicked.connect(self.__itemDoubleClickedFilterLayer) diff --git a/cuegui/cuegui/ServiceDialog.py b/cuegui/cuegui/ServiceDialog.py index 13837e624..2eccc6d70 100644 --- a/cuegui/cuegui/ServiceDialog.py +++ b/cuegui/cuegui/ServiceDialog.py @@ -65,6 +65,12 @@ def __init__(self, parent=None): self.min_gpu.setValue(0) self.min_gpu.setSingleStep(self.gpu_tick_mb) self.min_gpu.setSuffix(" MB") + self.timeout = QtWidgets.QSpinBox(self) + self.timeout.setRange(0, 4320) + self.timeout.setValue(0) + self.timeout_llu = 
QtWidgets.QSpinBox(self) + self.timeout_llu.setRange(0, 4320) + self.timeout_llu.setValue(0) layout = QtWidgets.QGridLayout(self) layout.addWidget(QtWidgets.QLabel("Name:", self), 0, 0) layout.addWidget(self.name, 0, 1) @@ -78,18 +84,22 @@ def __init__(self, parent=None): layout.addWidget(self.min_memory, 4, 1) layout.addWidget(QtWidgets.QLabel("Min Gpu Memory MB:", self), 5, 0) layout.addWidget(self.min_gpu, 5, 1) + layout.addWidget(QtWidgets.QLabel("Timeout (in minutes):", self), 6, 0) + layout.addWidget(self.timeout, 6, 1) + layout.addWidget(QtWidgets.QLabel("Timeout LLU (in minutes):", self), 7, 0) + layout.addWidget(self.timeout_llu, 7, 1) + self._tags_w = cuegui.TagsWidget.TagsWidget(allowed_tags=cuegui.Constants.ALLOWED_TAGS) + layout.addWidget(self._tags_w, 8, 0, 1, 2) self.__buttons = QtWidgets.QDialogButtonBox(QtWidgets.QDialogButtonBox.Save, QtCore.Qt.Horizontal, self) self.__buttons.setDisabled(True) - layout.addWidget(self.__buttons, 8, 1) + layout.addWidget(self.__buttons, 9, 1) self.__buttons.accepted.connect(self.save) - self._tags_w = cuegui.TagsWidget.TagsWidget(allowed_tags=cuegui.Constants.ALLOWED_TAGS) - layout.addWidget(self._tags_w, 6, 0, 1, 2) def _cfg(self): """ @@ -115,6 +125,8 @@ def setService(self, service): self.min_memory.setValue(service.data.min_memory // 1024) self.min_gpu.setValue(service.data.min_gpu // 1024) self._tags_w.set_tags(service.data.tags) + self.timeout.setValue(service.data.timeout) + self.timeout_llu.setValue(service.data.timeout_llu) def new(self): """ @@ -129,6 +141,8 @@ def new(self): self.max_cores.setValue(100) self.min_memory.setValue(3276) self.min_gpu.setValue(0) + self.timeout.setValue(0) + self.timeout_llu.setValue(0) self._tags_w.set_tags(['general']) def save(self): @@ -154,6 +168,8 @@ def save(self): service.setMaxCores(self.max_cores.value()) service.setMinMemory(self.min_memory.value() * 1024) service.setMinGpu(self.min_gpu.value() * 1024) + service.setTimeout(self.timeout.value()) + 
service.setTimeoutLLU(self.timeout_llu.value()) service.setTags(self._tags_w.get_tags()) self.saved.emit(service) diff --git a/proto/job.proto b/proto/job.proto index 4b33c015d..4a74f3aa8 100644 --- a/proto/job.proto +++ b/proto/job.proto @@ -343,6 +343,12 @@ service LayerInterface { // Set whether the layer is threadable or not rpc SetThreadable(LayerSetThreadableRequest) returns (LayerSetThreadableResponse); + // Set the timeout for frames in the layer + rpc SetTimeout(LayerSetTimeoutRequest) returns (LayerSetTimeoutResponse); + + // Set the LLU timeout for frames in the layer + rpc SetTimeoutLLU(LayerSetTimeoutLLURequest) returns (LayerSetTimeoutLLUResponse); + // Staggers the specified frame range. rpc StaggerFrames(LayerStaggerFramesRequest) returns (LayerStaggerFramesResponse); } @@ -449,6 +455,7 @@ message Frame { CheckpointState checkpoint_state = 16; int32 checkpoint_count = 17; int32 total_core_time = 18; + int32 llu_time = 19; } // Object for frame searching @@ -491,6 +498,7 @@ message UpdatedFrame { int64 max_rss = 7; int64 used_memory = 8; string last_resource = 9; + int32 llu_time = 10; } message UpdatedFrameSeq { @@ -621,6 +629,8 @@ message Layer { LayerStats layer_stats = 15; string parent_id = 16; repeated string limits = 17; + int32 timeout = 18; + int32 timeout_llu = 19; } message LayerSeq { @@ -1567,6 +1577,23 @@ message LayerSetThreadableRequest { message LayerSetThreadableResponse {} // Empty +// SetTimeout +message LayerSetTimeoutRequest { + Layer layer = 1; + int32 timeout = 2; +} + +message LayerSetTimeoutResponse {} // Empty + +// SetTimeoutLLU +message LayerSetTimeoutLLURequest { + Layer layer = 1; + int32 timeout_llu = 2; +} + +message LayerSetTimeoutLLUResponse {} // Empty + + // StaggerFrames message LayerStaggerFramesRequest { Layer layer = 1; diff --git a/proto/report.proto b/proto/report.proto index e7477df47..7a1fffdb2 100644 --- a/proto/report.proto +++ b/proto/report.proto @@ -86,6 +86,7 @@ message
RunningFrameInfo { int64 max_vsize = 11; // kB int64 vsize = 12; // kB map attributes = 13; //additional data can be provided about the running frame + int64 llu_time = 14; }; diff --git a/proto/service.proto b/proto/service.proto index fac6722c1..8b554b388 100644 --- a/proto/service.proto +++ b/proto/service.proto @@ -49,6 +49,8 @@ message Service { int32 min_memory = 6; int32 min_gpu = 7; repeated string tags = 8; + int32 timeout = 9; + int32 timeout_llu = 10; } message ServiceSeq { diff --git a/pycue/opencue/wrappers/layer.py b/pycue/opencue/wrappers/layer.py index b02a5ea7e..dc33bc57a 100644 --- a/pycue/opencue/wrappers/layer.py +++ b/pycue/opencue/wrappers/layer.py @@ -162,6 +162,22 @@ def setThreadable(self, threadable): layer=self.data, threadable=threadable), timeout=Cuebot.Timeout) + def setTimeout(self, timeout): + """Set time out to the value. + :type timeout: int + :param timeout: value for timeout in minutes""" + return self.stub.SetTimeout(job_pb2.LayerSetTimeoutRequest( + layer=self.data, timeout=timeout), + timeout=Cuebot.Timeout) + + def setTimeoutLLU(self, timeout_llu): + """Set LLU time out to the value. + :type timeout_llu: int + :param timeout_llu: value for LLU timeout in minutes""" + return self.stub.SetTimeoutLLU(job_pb2.LayerSetTimeoutLLURequest( + layer=self.data, timeout_llu=timeout_llu), + timeout=Cuebot.Timeout) + def addRenderPartition(self, hostname, threads, max_cores, num_mem, max_gpu): """Add a render partition to the layer. 
@type hostname: str diff --git a/pyoutline/outline/backend/cue.py b/pyoutline/outline/backend/cue.py index f3a27e940..61d38e2ff 100644 --- a/pyoutline/outline/backend/cue.py +++ b/pyoutline/outline/backend/cue.py @@ -281,6 +281,12 @@ def _serialize(launcher, use_pycuerun): if layer.get_arg("memory"): sub_element(spec_layer, "memory", "%s" % (layer.get_arg("memory"))) + if layer.get_arg("timeout"): + sub_element(spec_layer, "timeout", "%s" % (layer.get_arg("timeout"))) + + if layer.get_arg("timeout_llu"): + sub_element(spec_layer, "timeout_llu", "%s" % (layer.get_arg("timeout_llu"))) + if os.environ.get("OL_TAG_OVERRIDE", False): sub_element(spec_layer, "tags", scrub_tags(os.environ["OL_TAG_OVERRIDE"])) @@ -319,7 +325,7 @@ def _serialize(launcher, use_pycuerun): xml = [ '', '', + '"http://localhost:8080/spcue/dtd/cjsl-1.10.dtd">', Et.tostring(root).decode() ] diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index dae6f3752..e2cc365a0 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -262,6 +262,10 @@ def rssUpdate(self, frames): frame.rss = rss frame.maxRss = max(rss, frame.maxRss) + if os.path.exists(frame.runFrame.log_dir_file): + stat = os.stat(frame.runFrame.log_dir_file).st_mtime + frame.lluTime = int(stat) + frame.vsize = vsize frame.maxVsize = max(vsize, frame.maxVsize) diff --git a/rqd/rqd/rqnetwork.py b/rqd/rqd/rqnetwork.py index e65124d43..0639a5943 100644 --- a/rqd/rqd/rqnetwork.py +++ b/rqd/rqd/rqnetwork.py @@ -67,6 +67,8 @@ def __init__(self, rqCore, runFrame): self.utime = 0 self.stime = 0 + self.lluTime = 0 + def runningFrameInfo(self): """Returns the RunningFrameInfo object""" runningFrameInfo = rqd.compiled_proto.report_pb2.RunningFrameInfo( @@ -82,7 +84,8 @@ def runningFrameInfo(self): rss=self.rss, max_vsize=self.maxVsize, vsize=self.vsize, - attributes=self.runFrame.attributes + attributes=self.runFrame.attributes, + llu_time=self.lluTime ) return runningFrameInfo