diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index 1aea86c2d36..7ad37bcca7f 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -153,19 +153,49 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr, #ifdef __HOSTBOOT_MODULE -uint32_t maskMemPort( ExtensibleChip * i_chip ) +template<> +uint32_t maskMemPort( ExtensibleChip * i_chip ) { - #define PRDF_FUNC "[MemEcc::maskMemPort] " + #define PRDF_FUNC "[MemEcc::maskMemPort] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); - SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR"); - SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR"); - SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR"); + uint32_t o_rc = SUCCESS; + + do + { + // Mask all FIRs on the port. + SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR"); + SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR"); + SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR"); + + c->setAllBits(); d->setAllBits(); e->setAllBits(); + + o_rc = c->Write() | d->Write() | e->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", i_chip->getHuid() ); + break; + } + + #ifdef __HOSTBOOT_RUNTIME - c->setAllBits(); d->setAllBits(); e->setAllBits(); + /* TODO RTC 136129 + // Dynamically deallocate the port. 
+ o_rc = MemDealloc::port( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MemDealloc::port(0x%08x) failed", + i_chip->getHuid() ); + } + */ + + #endif - return ( c->Write() | d->Write() | e->Write() ); + } while (0); + + return o_rc; #undef PRDF_FUNC } @@ -176,10 +206,13 @@ uint32_t maskMemPort( ExtensibleChip * i_chip ) #ifdef __HOSTBOOT_RUNTIME -uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc) +template<> +uint32_t iuePortFail( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[MemEcc::iuePortFail] " + #define PRDF_FUNC "[MemEcc::iuePortFail] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); uint32_t o_rc = SUCCESS; @@ -714,89 +747,68 @@ uint32_t analyzeFetchUe( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ -#ifdef __HOSTBOOT_MODULE - template -uint32_t __analyzeIue( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc, - MemAddr i_addr ) +uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[MemEcc::__analyzeIue] " + #define PRDF_FUNC "[MemEcc::handleMemIue] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); + uint32_t o_rc = SUCCESS; - do - { - // get data bundle from chip - D db = static_cast( i_chip->getDataBundle() ); + // Add the DIMM to the callout list. + MemoryMru mm { i_chip->getTrgt(), i_rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); - // get the rank - MemRank rank = i_addr.getRank(); + #ifdef __HOSTBOOT_MODULE - TargetHandle_t trgt = i_chip->getTrgt(); + do + { + // Nothing else to do if handling a system checkstop. 
+ if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() ) break; - // Add the DIMM to the callout list - MemoryMru memmru(trgt, rank, MemoryMruData::CALLOUT_RANK); - io_sc.service_data->SetCallout( memmru ); + // Get the data bundle from chip. + D db = static_cast( i_chip->getDataBundle() ); - uint8_t ds = rank.getDimmSlct(); + // Get the DIMM select. + uint8_t ds = i_rank.getDimmSlct(); - // Initialize threshold if it doesn't exist yet + // Initialize threshold if it doesn't exist yet. if ( 0 == db->iv_iueTh.count(ds) ) { db->iv_iueTh[ds] = TimeBasedThreshold( getIueTh() ); } - // increment the threshold - check if at threshold + // Increment the count and check if at threshold. if ( db->iv_iueTh[ds].inc(io_sc) ) { - // Make the error log predictive + // Make the error log predictive. io_sc.service_data->setServiceCall(); - #ifdef __HOSTBOOT_RUNTIME - - /* TODO RTC 136129 - // Dynamically deallocate the rank. - uint32_t dealloc_rc = MemDealloc::rank( i_chip, rank ); - if ( SUCCESS != dealloc_rc ) - { - PRDF_ERR( PRDF_FUNC "MemDealloc::rank() failed: i_chip=0x%08x " - "rank=m%ds%d", i_chip->getHuid(), rank.getMaster(), - rank.getSlave() ); - o_rc = dealloc_rc; break; - } - */ - - #endif // __HOSTBOOT_RUNTIME + // The port fail will be triggered in the PostAnalysis plugin after + // the error log has been committed. - // mask off the entire port to avoid collateral - o_rc = maskMemPort( i_chip ); + // Mask off the entire port to avoid collateral. + o_rc = MemEcc::maskMemPort( i_chip ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x", - i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort(0x%08x) failed", + i_chip->getHuid() ); break; } - - // Port fail will be triggered in PostAnalysis after the error log - // has been committed. } - }while(0); + } while (0); + + #endif // __HOSTBOOT_MODULE return o_rc; #undef PRDF_FUNC } -// To resolve template linker errors. 
-template -uint32_t __analyzeIue(ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc, - MemAddr i_addr ); - -#endif // __HOSTBOOT_MODULE - //------------------------------------------------------------------------------ template @@ -805,44 +817,39 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, { #define PRDF_FUNC "[MemEcc::analyzeMainlineIue] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); - uint32_t o_rc = SUCCESS; - #ifdef __HOSTBOOT_MODULE + uint32_t o_rc = SUCCESS; do { - - // get the address of the failure - MemAddr addr; - // Use the address in MBRCER. This address also traps IRCDs, but it is // not likely that we will have two independent failure modes at the // same time. So we just assume the address is correct. + MemAddr addr; o_rc = getMemReadAddr( i_chip, MemAddr::READ_RCE_ADDR, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x, READ_RCE_ADDR) failed", - i_chip->getHuid() ); + i_chip->getHuid() ); break; } + MemRank rank = addr.getRank(); - o_rc = __analyzeIue( i_chip, io_sc, addr ); + o_rc = handleMemIue( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: 0x%08x", - i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed", + i_chip->getHuid(), rank.getMaster(), rank.getSlave() ); break; } - }while(0); - - #endif + } while (0); return o_rc; #undef PRDF_FUNC - } // To resolve template linker errors. @@ -858,40 +865,37 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip, { #define PRDF_FUNC "[MemEcc::analyzeMaintIue] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); - uint32_t o_rc = SUCCESS; - #ifdef __HOSTBOOT_MODULE + uint32_t o_rc = SUCCESS; do { + // Use the current address in the MCBMCAT. 
+ MemAddr addr; - - // Use the current address in the MCBMCAT o_rc = getMemMaintAddr( i_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", - i_chip->getHuid() ); + i_chip->getHuid() ); break; } + MemRank rank = addr.getRank(); - o_rc = __analyzeIue( i_chip, io_sc, addr ); + o_rc = handleMemIue( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: " - "0x%08x", i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed", + i_chip->getHuid(), rank.getMaster(), rank.getSlave() ); break; } - }while(0); - - #endif + } while (0); return o_rc; #undef PRDF_FUNC - } // To resolve template linker errors. diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H index 330fb25255d..37beecdaf7a 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H @@ -84,6 +84,27 @@ template uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr, UE_TABLE::Type i_type, STEP_CODE_DATA_STRUCT & io_sc ); +/** + * @brief Does mainline and maintenance IUE handling. + * + * Adds the memory IUE to the callout list. At threshold, will make the error + * log predictive. When threshold is reached at runtime there is a good chance + * these IUEs are going to lead to a data integrity issue. Therefore, the port + * will be forced to fail, the entire port will be masked off, and dynamic + * memory deallocation will be applied. Note that this function will not issue + * the port failure because it is possible that it may crash the host. Instead, + * the port failure is issued in the PostAnalysis plugin after the error log has + * been committed. + * + * @param i_chip MCA chip. + * @param i_rank Rank containing the IUE. + * @param io_sc The step code data struct. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
+ */ +template +uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ); + /** * @brief Analyzes a fetch MPE attention. * @param i_chip MCA or MBA. @@ -158,22 +179,25 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); #ifdef __HOSTBOOT_RUNTIME /** - * @brief Will trigger a port fail if the number of IUEs is over threshold - * @param i_chip MCA chip - * @param io_sc The step code data struct. + * @brief Will trigger a port fail if the number of IUEs is over threshold. + * @param i_chip MCA chip + * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise */ -uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc); +template +uint32_t iuePortFail( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); #endif // __HOSTBOOT_RUNTIME #ifdef __HOSTBOOT_MODULE /** - * @brief Will mask off the entire mem port - * @param i_chip MCA chip + * @brief Will mask off an entire memory port. At runtime will issue dynamic + * memory deallocation of the port. + * @param i_chip MCA chip * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise */ +template uint32_t maskMemPort( ExtensibleChip * i_chip ); template diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 7016b06bd5b..9b54037ba8b 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -69,16 +69,14 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) #ifdef __HOSTBOOT_RUNTIME - // If the IUE threshold in our data bundle has been reached, we trigger // a port fail. Once we trigger the port fail, the system may crash // right away. Since PRD is running in the hypervisor, it is possible we // may not get the error log. To better our chances, we trigger the port // fail here after the error log has been committed. 
- if ( SUCCESS != MemEcc::iuePortFail(i_chip, io_sc) ) + if ( SUCCESS != MemEcc::iuePortFail(i_chip, io_sc) ) { - PRDF_ERR( PRDF_FUNC "iuePortFail failed: i_chip=0x%08x", - i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "iuePortFail(0x%08x) failed", i_chip->getHuid() ); } #endif // __HOSTBOOT_RUNTIME @@ -197,14 +195,13 @@ int32_t MemPortFailure( ExtensibleChip * i_chip, if ( CHECK_STOP != io_sc.service_data->getPrimaryAttnType() ) { - // The port is dead mask off the entire port. - uint32_t l_rc = MemEcc::maskMemPort( i_chip ); + // The port is dead. Mask off the entire port. + uint32_t l_rc = MemEcc::maskMemPort( i_chip ); if ( SUCCESS != l_rc ) { - PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x", + PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort(0x%08x) failed", i_chip->getHuid() ); } - } return SUCCESS; // nothing to return to rule code diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C index 4a80c2203a4..1b017194ce2 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C @@ -112,10 +112,10 @@ int32_t PostAnalysis( ExtensibleChip * i_mcbChip, // if there's an IUE and we've reached threshold trigger a port fail if ( eccAttns & MAINT_IUE ) { - if ( SUCCESS != MemEcc::iuePortFail(mca, io_sc) ) + if ( SUCCESS != MemEcc::iuePortFail(mca, io_sc) ) { - PRDF_ERR( PRDF_FUNC "iuePortFail failed: i_mcbChip=" - "0x%08x", i_mcbChip->getHuid() ); + PRDF_ERR( PRDF_FUNC "iuePortFail(0x%08x) failed", + i_mcbChip->getHuid() ); } } }