diff --git a/src/occ_405/amec/amec_master_smh.c b/src/occ_405/amec/amec_master_smh.c index 0b024ec5..244bde50 100755 --- a/src/occ_405/amec/amec_master_smh.c +++ b/src/occ_405/amec/amec_master_smh.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2016 */ +/* Contributors Listed Below - COPYRIGHT 2011,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -23,9 +23,9 @@ /* */ /* IBM_PROLOG_END_TAG */ -//************************************************************************* +//*************************************************************************/ // Includes -//************************************************************************* +//*************************************************************************/ #include #include #include // Error logging @@ -38,18 +38,19 @@ #include "amec_sys.h" #include "amec_service_codes.h" //For AMEC_MST_CHECK_PCAPS_MATCH #include "dcom.h" +#include -//************************************************************************* +//*************************************************************************/ // Externs -//************************************************************************* +//*************************************************************************/ -//************************************************************************* +//*************************************************************************/ // Macros -//************************************************************************* +//*************************************************************************/ -//************************************************************************* +//*************************************************************************/ // Defines/Enums -//************************************************************************* +//*************************************************************************/ //Power cap mismatch threshold set to 8 
ticks (2 milliseconds) #define PCAPS_MISMATCH_THRESHOLD 8 @@ -57,13 +58,13 @@ //Power cap failure threshold set to 32 (ticks) #define PCAP_FAILURE_THRESHOLD 32 -//************************************************************************* +//*************************************************************************/ // Structures -//************************************************************************* +//*************************************************************************/ -//************************************************************************* +//*************************************************************************/ // Globals -//************************************************************************* +//*************************************************************************/ smh_state_t G_amec_mst_state = {AMEC_INITIAL_STATE, AMEC_INITIAL_STATE, AMEC_INITIAL_STATE}; @@ -189,13 +190,13 @@ const smh_tbl_t amec_mst_state_table[AMEC_SMH_STATES_PER_LVL] = // fw timings when the AMEC master State Machine finishes. 
smh_state_timing_t G_amec_mst_state_timings = {amec_mst_update_smh_sensors}; -//************************************************************************* +//*************************************************************************/ // Function Prototypes -//************************************************************************* +//*************************************************************************/ -//************************************************************************* +//*************************************************************************/ // Functions -//************************************************************************* +//*************************************************************************/ // Function Specification // diff --git a/src/occ_405/amec/amec_sensors_power.c b/src/occ_405/amec/amec_sensors_power.c index b8704aef..ab6f1ee9 100755 --- a/src/occ_405/amec/amec_sensors_power.c +++ b/src/occ_405/amec/amec_sensors_power.c @@ -54,6 +54,17 @@ // This holds the converted ADC Reads uint32_t G_lastValidAdcValue[MAX_APSS_ADC_CHANNELS] = {0}; +// Indicates if we have determined GPU presence +bool G_gpu_config_done = FALSE; + +// Bitmap of GPUs present +uint32_t G_first_proc_gpu_config = 0; +uint32_t G_first_sys_gpu_config = 0; +uint32_t G_first_num_gpus_sys = 0; +uint32_t G_curr_proc_gpu_config = 0; +uint32_t G_curr_sys_gpu_config = 0; +uint32_t G_curr_num_gpus_sys = 0; + // There are only MAX_APSS_ADC_CHANNELS channels. Therefore if the channel value // is greater then the MAX, then there was no channel associated with the function id. 
#define ADC_CONVERTED_VALUE(i_chan) \
@@ -63,9 +74,9 @@ extern uint8_t G_occ_interrupt_type;
 extern bool G_vrm_thermal_monitoring;
 extern bool G_apss_present;
 
-//*************************************************************************
+//*************************************************************************/
 // Code
-//*************************************************************************
+//*************************************************************************/
 
 // Function Specification
 //
@@ -231,6 +242,8 @@ void amec_update_apss_sensors(void)
                 sensor_update(AMECSENSOR_PTR(PWRAPSSCH0 + l_idx), (uint16_t) temp32);
             }
         }
+
+        amec_update_apss_gpio();
     }
 
     // ----------------------------------------------------------
@@ -396,6 +409,9 @@ void amec_update_apss_sensors(void)
         //Count of number of updates.
         g_pwr250us_over30sec.count++;
 
+        // Check the GPU presence signals
+        amec_update_gpu_configuration();
+
         // ----------------------------------------------------
         // Clear Flag to indicate that AMEC has received the data.
         // ----------------------------------------------------
@@ -691,7 +707,196 @@ void amec_update_avsbus_sensors(void)
 
 } // end amec_update_avsbus_sensors()
 
+// Function Specification
+//
+// Name: amec_update_apss_gpio
+//
+// Description: Updates sensors based on the GPIO data from the APSS.
+//              Reads the two VR_HOT_MEM_PROC GPIOs (active low) and sets
+//              VRHOTMEMPRCCNT to 1 when either valid signal is asserted,
+//              or to 0 when both are valid and deasserted; otherwise the
+//              sensor is not updated. The first assertion also commits
+//              one informational manufacturing error log.
+//
+// Thread: RealTime Loop
+//
+// End Function Specification
+void amec_update_apss_gpio(void)
+{
+    // Only log once
+    static bool L_err_logged = FALSE;
+
+    // GPIO port numbers from system model
+    uint8_t * l_vrhot_port_nums = G_sysConfigData.apss_gpio_map.vr_fan;
+
+    // Actual values of the GPIO
+    uint8_t l_vrhot0 = 1, l_vrhot1 = 1;
+
+    // Data is valid? (apss_gpio_get() returns bool)
+    bool l_valid0 = FALSE, l_valid1 = FALSE;
+
+    // Get value from most recent APSS data
+    l_valid0 = apss_gpio_get(l_vrhot_port_nums[0], &l_vrhot0); //GPIO_VR_HOT_MEM_PROC_0
+    l_valid1 = apss_gpio_get(l_vrhot_port_nums[1], &l_vrhot1); //GPIO_VR_HOT_MEM_PROC_1
+
+    // These signals are active low
+    if( (l_valid0 && !l_vrhot0) || (l_valid1 && !l_vrhot1) )
+    {
+        // Update the sensor indicating that one of the vrhot signals was asserted
+        sensor_update(AMECSENSOR_PTR(VRHOTMEMPRCCNT), 1);
+
+        // Only log once
+        if(!L_err_logged)
+        {
+            INTR_TRAC_ERR("GPIO_VR_HOT_MEM_PROC_0[%d, valid=%d] GPIO_VR_HOT_MEM_PROC_1[%d, valid=%d]",
+                          l_vrhot0, l_valid0, l_vrhot1, l_valid1);
+            /*
+             * @errortype
+             * @moduleid AMEC_UPDATE_APSS_GPIO
+             * @reasoncode VR_HOT_MEM_PROC_ASSERTED
+             * @userdata1 0
+             * @userdata2 0
+             * @userdata4 OCC_NO_EXTENDED_RC
+             * @devdesc GPIO_VR_HOT_MEM_PROC_0/1 was asserted
+             */
+            errlHndl_t l_err = createErrl(AMEC_UPDATE_APSS_GPIO,
+                                          VR_HOT_MEM_PROC_ASSERTED,
+                                          OCC_NO_EXTENDED_RC,
+                                          ERRL_SEV_INFORMATIONAL,
+                                          NULL,
+                                          DEFAULT_TRACE_SIZE,
+                                          0,
+                                          0);
+
+            // Manufacturing error only
+            setErrlActions(l_err, ERRL_ACTIONS_MANUFACTURING_ERROR);
+
+            // Processor callout
+            addCalloutToErrl(l_err,
+                             ERRL_CALLOUT_TYPE_HUID,
+                             G_sysConfigData.proc_huid,
+                             ERRL_CALLOUT_PRIORITY_HIGH);
+
+            // APSS callout
+            addCalloutToErrl(l_err,
+                             ERRL_CALLOUT_TYPE_HUID,
+                             G_sysConfigData.apss_huid,
+                             ERRL_CALLOUT_PRIORITY_LOW);
+
+            commitErrl(&l_err);
+
+            L_err_logged = TRUE;
+        }
+    }
+    else if ( (l_valid0 && l_vrhot0) && (l_valid1 && l_vrhot1) )
+    {
+        sensor_update(AMECSENSOR_PTR(VRHOTMEMPRCCNT), 0);
+    }
+}
+
+// Function Specification
+//
+// Name: amec_update_gpu_configuration
+//
+// Description: Checks the APSS data to see which GPUs are present.
+//              When every presence GPIO is valid, updates the G_curr_*
+//              globals, latches the first valid read into the G_first_*
+//              globals, and traces a later mismatch against that first
+//              read once per distinct bitmap so that a persistent change
+//              cannot flood the trace buffer.
+//
+// Thread: RealTime Loop
+//
+// End Function Specification
+void amec_update_gpu_configuration(void)
+{
+    // Throttle for the mismatch trace: remembers whether a mismatch has
+    // already been traced and which system bitmap was reported
+    static bool     L_mismatch_traced = FALSE;
+    static uint32_t L_traced_sys_gpu_config = 0;
+
+    // GPIO port numbers from system model
+    uint8_t * l_gpu_port_nums = G_sysConfigData.apss_gpio_map.gpu;
+
+    // Actual values of the GPIO
+    uint8_t l_gpu_pres = 1;
+
+    // Data is valid?
+    bool l_valid = FALSE;
+    bool l_all_valid = FALSE;
+
+    uint8_t i = 0;
+    uint8_t l_start_proc = (G_pbax_id.chip_id * GPU_PRES_SIGN_PER_OCC);
+
+    uint8_t l_valid_bitmask_proc = 0; // Bitmask for present GPUs behind just this proc
+    uint8_t l_valid_bitmask_sys = 0; // Bitmask for present GPUs behind both procs
+    uint8_t l_num_gpus_sys = 0; // Number of GPUs both procs
+
+    // Check which GPUs are present
+    for( i=0; i < MAX_GPU_PRES_SIGNALS; i++ )
+    {
+        l_valid = apss_gpio_get(l_gpu_port_nums[i], &l_gpu_pres);
+
+        // Presence signal is active low
+        l_gpu_pres = (l_gpu_pres ? 0 : 1);
+        if(l_valid)
+        {
+            l_all_valid = TRUE;
+
+            // Keep track of number and configuration of GPUs behind both procs
+            l_num_gpus_sys += l_gpu_pres;
+            l_valid_bitmask_sys |= (l_gpu_pres << i);
+
+            // Also want to keep a separate tally of GPUs behind only this proc
+            if( (i >= l_start_proc) && (i < (l_start_proc + GPU_PRES_SIGN_PER_OCC)) )
+            {
+                l_valid_bitmask_proc |= (l_gpu_pres << (i - l_start_proc));
+            }
+        }
+        else
+        {
+            l_all_valid = FALSE;
+            break;
+        }
+    }
+
+    // If all GPU signals are valid, update the global if this is the first read.
+    // If this is not the first read, make sure that the signals match the first.
+    if(l_all_valid)
+    {
+        G_curr_proc_gpu_config = l_valid_bitmask_proc;
+        G_curr_sys_gpu_config = l_valid_bitmask_sys;
+        G_curr_num_gpus_sys = l_num_gpus_sys;
+        if(!G_gpu_config_done)
+        {
+            G_gpu_config_done = TRUE;
+            G_first_proc_gpu_config = l_valid_bitmask_proc;
+            G_first_sys_gpu_config = l_valid_bitmask_sys;
+            G_first_num_gpus_sys = l_num_gpus_sys;
+            TRAC_IMP("GPU presence detection completed. GPU configuration for this OCC: 0x%08X, total[%d]",
+                     G_curr_proc_gpu_config, G_curr_num_gpus_sys);
+        }
+        else if (G_curr_sys_gpu_config != G_first_sys_gpu_config)
+        {
+            // Only trace when the mismatch first appears or when the
+            // mismatching bitmap changes again
+            if( !L_mismatch_traced ||
+                (L_traced_sys_gpu_config != G_curr_sys_gpu_config) )
+            {
+                TRAC_ERR("GPU presence has changed unexpectedly! Old:0x%02X, New:0x%02X",
+                         G_first_sys_gpu_config, l_valid_bitmask_sys);
+                L_mismatch_traced = TRUE;
+                L_traced_sys_gpu_config = G_curr_sys_gpu_config;
+            }
+        }
+        else
+        {
+            // Presence matches the first read again; re-arm the trace
+            L_mismatch_traced = FALSE;
+        }
+    }
+}
 
 /*----------------------------------------------------------------------------*/
 /* End */
 /*----------------------------------------------------------------------------*/
diff --git a/src/occ_405/amec/amec_sensors_power.h b/src/occ_405/amec/amec_sensors_power.h
index 9fef1de7..2cb55fbf 100755
--- a/src/occ_405/amec/amec_sensors_power.h
+++ b/src/occ_405/amec/amec_sensors_power.h
@@ -1,11 +1,11 @@
 /* IBM_PROLOG_BEGIN_TAG */
 /* This is an automatically generated prolog. */
 /* */
-/* $Source: src/occ/amec/amec_sensors_power.h $ */
+/* $Source: src/occ_405/amec/amec_sensors_power.h $ */
 /* */
 /* OpenPOWER OnChipController Project */
 /* */
-/* Contributors Listed Below - COPYRIGHT 2011,2015 */
+/* Contributors Listed Below - COPYRIGHT 2011,2017 */
 /* [+] International Business Machines Corp. */
 /* */
 /* */
@@ -46,8 +46,16 @@
 // sensors for data that comes from the APSS (Power Data from APSS ADCs)
 void amec_update_apss_sensors(void);
 
+// Function that is called by AMEC State Machine that will update the AMEC
+// sensors for GPIO data collected from the APSS.
+void amec_update_apss_gpio(void);
+
 // Function that is called by AMEC State Machine that will update the AMEC
 // sensors for data that comes from the AVS Bus (Voltage/Current)
 void amec_update_avsbus_sensors(void);
 
+// Function that reads the GPU presence GPIOs from the APSS data, records the
+// first valid GPU configuration and traces if it changes afterwards
+void amec_update_gpu_configuration(void);
+
 #endif // _AMEC_SENSORS_POWER_H
diff --git a/src/occ_405/amec/amec_service_codes.h b/src/occ_405/amec/amec_service_codes.h
index 076bef99..47d6c09c 100755
--- a/src/occ_405/amec/amec_service_codes.h
+++ b/src/occ_405/amec/amec_service_codes.h
@@ -5,7 +5,7 @@
 /* */
 /* OpenPOWER OnChipController Project */
 /* */
-/* Contributors Listed Below - COPYRIGHT 2011,2016 */
+/* Contributors Listed Below - COPYRIGHT 2011,2017 */
 /* [+] International Business Machines Corp.
*/ /* */ /* */ @@ -65,6 +65,7 @@ enum occAmecModuleId AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14, AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16, AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17, + AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18, }; /*----------------------------------------------------------------------------*/ diff --git a/src/occ_405/amec/amec_slave_smh.c b/src/occ_405/amec/amec_slave_smh.c index 2137d35f..e92308ac 100755 --- a/src/occ_405/amec/amec_slave_smh.c +++ b/src/occ_405/amec/amec_slave_smh.c @@ -210,6 +210,7 @@ smh_state_timing_t G_amec_slv_state_timings = {amec_slv_update_smh_sensors}; //*************************************************************************/ // Globals //*************************************************************************/ +extern bool G_gpu_config_done; //*************************************************************************/ // Function Prototypes diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index 66d1b84b..40afd494 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -361,6 +361,7 @@ typedef struct sensor_t pwr250usgpu; sensor_t pwrapssch[MAX_APSS_ADC_CHANNELS]; sensor_t cur12Vstby; + sensor_t vrhot_mem_proc; sensor_t vrfan; diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h index 57198011..417aa2d6 100644 --- a/src/occ_405/occ_service_codes.h +++ b/src/occ_405/occ_service_codes.h @@ -65,6 +65,8 @@ enum occReasonCode VRM_ERROR_TEMP = 0x20, /// VR_FAN - AVS Bus over-temperature reported VRM_VRFAN_WARNING = 0x22, + /// GPIO_VR_HOT_MEM_PROC signal from APSS asserted + VR_HOT_MEM_PROC_ASSERTED = 0x23, /// DIMM reached error threshold DIMM_ERROR_TEMP = 0x30, /// Frequency limited due to oversubscription condition diff --git a/src/occ_405/proc/proc_data.c b/src/occ_405/proc/proc_data.c index ac8ee9ff..ab88c192 100755 --- a/src/occ_405/proc/proc_data.c +++ b/src/occ_405/proc/proc_data.c @@ -654,17 +654,27 @@ void nest_dts_init(void) void 
task_24x7(task_t * i_task) { static uint8_t L_numTicks = 0x00; // never called since OCC started + static bool L_idle_trace = FALSE; if (!G_24x7_disabled) { // Schedule 24x7 task if idle if (!async_request_is_idle(&G_24x7_request.request)) { - INTR_TRAC_ERR("task_24x7: request not idle"); + if(!L_idle_trace) + { + INTR_TRAC_ERR("task_24x7: request not idle"); + L_idle_trace = TRUE; + } L_numTicks++; } else { + if(L_idle_trace) + { + INTR_TRAC_INFO("task_24x7: previously was not idle and is now idle after %d ticks", L_numTicks); + L_idle_trace = FALSE; + } // Clear errors and init parameters for GPE task G_24x7_parms.error.error = 0; G_24x7_parms.numTicksPassed = L_numTicks; diff --git a/src/occ_405/pss/apss.c b/src/occ_405/pss/apss.c index 83eff117..eb9c3d75 100755 --- a/src/occ_405/pss/apss.c +++ b/src/occ_405/pss/apss.c @@ -770,7 +770,6 @@ void task_apss_complete_pwr_meas(struct task *i_self) APSS_DBG("task_apss_complete_pwr_meas: finished w/rc=0x%08X\n", G_gpe_complete_pwr_meas_read_args.error.rc); APSS_DBG_HEXDUMP(&G_gpe_complete_pwr_meas_read_args, sizeof(G_gpe_complete_pwr_meas_read_args), "G_gpe_complete_pwr_meas_read_args"); - } // end task_apss_complete_pwr_meas bool apss_gpio_get(uint8_t i_pin_number, uint8_t *o_pin_value) @@ -784,9 +783,9 @@ bool apss_gpio_get(uint8_t i_pin_number, uint8_t *o_pin_value) bool l_dcom_data_valid = FALSE; int i=0; - for(;i