diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index 314b50269a1..472d2ba9e93 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -513,69 +513,10 @@ uint32_t handleMemCe( ExtensibleChip * i_chip, const MemAddr & i_addr, //------------------------------------------------------------------------------ template -uint32_t __analyzeFetchNceTce( ExtensibleChip * i_chip, const MemAddr & i_addr, - STEP_CODE_DATA_STRUCT & io_sc, - bool i_isTce = false ) -{ - #define PRDF_FUNC "[MemEcc::__analyzeFetchNceTce] " - - uint32_t o_rc = SUCCESS; - - do - { - // Get the symbol of the failure. - MemSymbol symbol; - o_rc = getMemReadSymbol( i_chip, i_addr.getRank(), symbol, i_isTce ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) failed", - i_chip->getHuid() ); - break; - } - - // Add the symbol to the callout list and CE table. - bool doTps; - o_rc = handleMemCe( i_chip, i_addr, symbol, doTps, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x) failed", - i_chip->getHuid() ); - break; - } - - // Initiate a TPS procedure, if needed. - if ( doTps ) - { - #ifdef __HOSTBOOT_RUNTIME - - // If a MNFG threshold has been reached (predictive callout), we - // will still try to start TPS just in case MNFG disables the - // termination policy. - - o_rc = addTpsEvent( i_chip, i_addr.getRank(), io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "addTpsEvent(0x%08x) failed", - i_chip->getHuid() ); - } - - #endif - } - - } while (0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -template -uint32_t analyzeFetchNce( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ) +uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[MemEcc::analyzeFetchNce] " + #define PRDF_FUNC "[MemEcc::analyzeFetchNceTce] " PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); @@ -593,72 +534,70 @@ uint32_t analyzeFetchNce( ExtensibleChip * i_chip, i_chip->getHuid() ); break; } + MemRank rank = addr.getRank(); - // Complete analysis. - o_rc = __analyzeFetchNceTce( i_chip, addr, io_sc ); + // Get the symbols for the NCE/TCE attention. + MemSymbol sym1, sym2; + o_rc = getMemReadSymbol( i_chip, rank, sym1, sym2 ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__analyzeFetchNceTce(0x%08x) failed", + PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) failed", i_chip->getHuid() ); break; } - } while (0); - - // Add ECC capture data for FFDC. - MemCaptureData::addEccData( i_chip, io_sc ); - - return o_rc; - - #undef PRDF_FUNC -} - -// To resolve template linker errors. -template -uint32_t analyzeFetchNce( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ); - -//------------------------------------------------------------------------------ - -template -uint32_t analyzeFetchTce( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[MemEcc::analyzeFetchTce] " - - PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( T == i_chip->getType() ); - - uint32_t o_rc = SUCCESS; - - do - { - // Get the address of the failure. - MemAddr addr; - o_rc = getMemReadAddr( i_chip, MemAddr::READ_NCE_ADDR, addr ); - if ( SUCCESS != o_rc ) + // Add the first symbol to the callout list and CE table. + bool doTps = false; + if ( sym1.isValid() ) { - PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x) failed", - i_chip->getHuid() ); + o_rc = handleMemCe( i_chip, addr, sym1, doTps, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x,0x%02x,%d) failed", + i_chip->getHuid(), rank.getKey(), sym1.getSymbol() ); + break; + } + } + else + { + // The first symbol should always be valid. + PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) returned an invalid " + "symbol", i_chip->getHuid() ); + o_rc = FAIL; break; } - // Complete analysis for first symbol. - o_rc = __analyzeFetchNceTce( i_chip, addr, io_sc ); - if ( SUCCESS != o_rc ) + // Add the second symbol to the callout list and CE table, if it exists. + if ( sym2.isValid() ) { - PRDF_ERR( PRDF_FUNC "first __analyzeFetchNceTce(0x%08x) failed", - i_chip->getHuid() ); - break; + bool tmp; + o_rc = handleMemCe( i_chip, addr, sym2, tmp, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x,0x%02x,%d) failed", + i_chip->getHuid(), rank.getKey(), sym2.getSymbol() ); + break; + } + if ( tmp ) doTps = true; } - // Complete analysis for second symbol. - o_rc = __analyzeFetchNceTce( i_chip, addr, io_sc, true ); - if ( SUCCESS != o_rc ) + // Initiate a TPS procedure, if needed. + if ( doTps ) { - PRDF_ERR( PRDF_FUNC "second __analyzeFetchNceTce(0x%08x) failed", - i_chip->getHuid() ); - break; + #ifdef __HOSTBOOT_RUNTIME + + // If a MNFG threshold has been reached (predictive callout), we + // will still try to start TPS just in case MNFG disables the + // termination policy. + + o_rc = addTpsEvent( i_chip, rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "addTpsEvent(0x%08x,0x%02x) failed", + i_chip->getHuid(), rank.getKey() ); + } + + #endif } } while (0); @@ -673,7 +612,7 @@ uint32_t analyzeFetchTce( ExtensibleChip * i_chip, // To resolve template linker errors. template -uint32_t analyzeFetchTce( ExtensibleChip * i_chip, +uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -929,7 +868,7 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MSR: i_chip=0x%08x", - i_chip->getHuid() ); + i_chip->getHuid() ); break; } @@ -944,6 +883,13 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) // get symbol and DRAM MemSymbol symbol = MemSymbol::fromGalois( trgt, rank, galois ); + if ( !symbol.isValid() ) + { + PRDF_ERR( PRDF_FUNC "Galois 0x%02x from MSR is invalid: 0x%08x," + "0x%02x", galois, i_chip->getHuid(), rank.getKey() ); + o_rc = FAIL; + break; + } uint8_t dram = symbol.getDram(); // Add the DIMM to the callout list diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H index 37beecdaf7a..9d954078522 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H @@ -117,24 +117,14 @@ uint32_t analyzeFetchMpe( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc ); /** - * @brief Analyzes a fetch NCE attention. + * @brief Analyzes a fetch NCE/TCE attention. * @param i_chip MCA or MBA. * @param io_sc The step code data struct. * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. */ template -uint32_t analyzeFetchNce( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ); - -/** - * @brief Analyzes a fetch TCE attention. - * @param i_chip MCA or MBA. - * @param io_sc The step code data struct. - * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. - */ -template -uint32_t analyzeFetchTce( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ); +uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); /** * @brief Analyzes a fetch UE attention. diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.C b/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.C index 3f5c5185182..48118828bd3 100755 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.C @@ -64,10 +64,10 @@ MemSymbol::MemSymbol( TARGETING::TargetHandle_t i_trgt, const MemRank & i_rank, iv_trgt(i_trgt), iv_rank(i_rank), iv_symbol(i_symbol), iv_pins(i_pins), iv_isDramSpared(false), iv_isEccSpared(false) { - PRDF_ASSERT( NULL != i_trgt ); + PRDF_ASSERT( nullptr != i_trgt ); PRDF_ASSERT( TYPE_MBA == getTargetType(i_trgt) || TYPE_MCA == getTargetType(i_trgt) ); - PRDF_ASSERT( i_symbol < SYMBOLS_PER_RANK ); + // Allowing an invalid symbol. Use isValid() to check validity. PRDF_ASSERT( i_pins <= CEN_SYMBOL::BOTH_SYMBOL_DQS ); } @@ -87,8 +87,6 @@ MemSymbol MemSymbol::fromGalois( TargetHandle_t i_trgt, const MemRank & i_rank, } } - PRDF_ASSERT( symbol < SYMBOLS_PER_RANK ); - // Get pins from mask. uint8_t pins = NO_SYMBOL_DQS; if ( TYPE_MBA == getTargetType(i_trgt) ) @@ -160,7 +158,7 @@ uint8_t MemSymbol::getDramPins() const template<> uint32_t getMemReadSymbol( ExtensibleChip * i_chip, const MemRank & i_rank, - MemSymbol & o_symbol, bool i_isTce ) + MemSymbol & o_sym1, MemSymbol & o_sym2 ) { #define PRDF_FUNC "[getMemReadSymbol] " @@ -170,6 +168,8 @@ uint32_t getMemReadSymbol( ExtensibleChip * i_chip, uint32_t o_rc = SUCCESS; + o_sym1 = o_sym2 = MemSymbol(); // both initially invalid + do { // Get the NCE/TCE galois and mask from hardware. @@ -190,22 +190,15 @@ uint32_t getMemReadSymbol( ExtensibleChip * i_chip, break; } - uint32_t bitPos = (mcaRelMcs * 32) + (i_isTce ? 16 : 0); - - uint8_t galois = reg->GetBitFieldJustified( bitPos, 8 ); - uint8_t mask = reg->GetBitFieldJustified( bitPos + 8, 8 ); + uint32_t bitPos = mcaRelMcs * 32; + uint8_t g1 = reg->GetBitFieldJustified( bitPos, 8 ); + uint8_t m1 = reg->GetBitFieldJustified( bitPos + 8, 8 ); + uint8_t g2 = reg->GetBitFieldJustified( bitPos + 16, 8 ); + uint8_t m2 = reg->GetBitFieldJustified( bitPos + 24, 8 ); // Get the NCE/TCE symbol. - o_symbol = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, galois, - mask ); - if ( !o_symbol.isValid() ) - { - PRDF_ERR( PRDF_FUNC "fromGalois(0x%08x,m%ds%d,0x%02x,0x%02x) " - "failed", i_chip->getHuid(), i_rank.getMaster(), - i_rank.getSlave(), galois, mask ); - o_rc = FAIL; - break; - } + o_sym1 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g1, m1 ); + o_sym2 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g2, m2 ); // TODO: RTC 157888 Check if the symbol is on a spare DRAM. @@ -221,17 +214,18 @@ uint32_t getMemReadSymbol( ExtensibleChip * i_chip, template<> uint32_t getMemReadSymbol( ExtensibleChip * i_chip, const MemRank & i_rank, - MemSymbol & o_symbol, bool i_isTce ) + MemSymbol & o_sym1, MemSymbol & o_sym2 ) { #define PRDF_FUNC "[getMemReadSymbol] " // Check parameters PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - PRDF_ASSERT( !i_isTce ); // TCEs do not exist on Centaur uint32_t o_rc = SUCCESS; + o_sym1 = o_sym2 = MemSymbol(); // both initially invalid + do { // Get the NCE galois and mask from hardware. @@ -249,20 +243,11 @@ uint32_t getMemReadSymbol( ExtensibleChip * i_chip, break; } - uint8_t galois = reg->GetBitFieldJustified( 40, 8 ); - uint8_t mask = reg->GetBitFieldJustified( 32, 8 ); + uint8_t g1 = reg->GetBitFieldJustified( 40, 8 ); + uint8_t m1 = reg->GetBitFieldJustified( 32, 8 ); // Get the NCE symbol. - o_symbol = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, galois, - mask ); - if ( !o_symbol.isValid() ) - { - PRDF_ERR( PRDF_FUNC "fromGalois(0x%08x,m%ds%d,0x%02x,0x%02x) " - "failed", i_chip->getHuid(), i_rank.getMaster(), - i_rank.getSlave(), galois, mask ); - o_rc = FAIL; - break; - } + o_sym1 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g1, m1 ); // TODO: RTC 157888 Check if the symbol is on a spare DRAM. diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.H b/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.H index 4b03f4cd257..cfcdc447512 100755 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.H @@ -182,14 +182,17 @@ class MemSymbol * @brief Reads the memory NCE/TCE vector trap register from hardware. * @param i_chip MCA or MBA. * @param i_rank The rank this symbol is on. - * @param o_symbol The returned symbol. - * @param i_isTce Only applies to MCA. True if the TCE symbol is wanted. False - * if the NCE symbol is wanted (default). + * @param o_sym1 The first symbol. Should always be valid for both NCE/TCE. + * @param o_sym2 The second symbol. Only valid for TCEs. + * @note For MCAs, both NCEs and TCEs report to the same error vector and only + * the latest NCE/TCE is recorded. Therefore, it is possible that PRD + * handles a TCE attention, but only one symbol is found because a NCE + * was reported afterwards, wiping out the error vector for the TCE. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ template uint32_t getMemReadSymbol( ExtensibleChip * i_chip, const MemRank & i_rank, - MemSymbol & o_symbol, bool i_isTce = false ); + MemSymbol & o_sym1, MemSymbol & o_sym2 ); } // end namespace PRDF diff --git a/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C b/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C index 452f6de8fea..59deae32c25 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C @@ -181,34 +181,18 @@ PLUGIN_FETCH_MPE_ERROR( 7 ) //------------------------------------------------------------------------------ /** - * @brief MCAECCFIR[8] - Mainline NCE. + * @brief MCAECCFIR[8:9] - Mainline NCE and/or TCE. * @param i_chip MCA chip. * @param io_sc The step code data struct. * @return SUCCESS */ -int32_t AnalyzeFetchNce( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ) -{ - MemEcc::analyzeFetchNce( i_chip, io_sc ); - return SUCCESS; // nothing to return to rule code -} -PRDF_PLUGIN_DEFINE( p9_mca, AnalyzeFetchNce ); - -//------------------------------------------------------------------------------ - -/** - * @brief MCAECCFIR[9] - Mainline TCE. - * @param i_chip MCA chip. - * @param io_sc The step code data struct. - * @return SUCCESS - */ -int32_t AnalyzeFetchTce( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ) +int32_t AnalyzeFetchNceTce( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { - MemEcc::analyzeFetchTce( i_chip, io_sc ); + MemEcc::analyzeFetchNceTce( i_chip, io_sc ); return SUCCESS; // nothing to return to rule code } -PRDF_PLUGIN_DEFINE( p9_mca, AnalyzeFetchTce ); +PRDF_PLUGIN_DEFINE( p9_mca, AnalyzeFetchNceTce ); //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca.rule index 4dd93bdb01e..8fc5d8117a3 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_mca.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_mca.rule @@ -383,12 +383,17 @@ group gMCAECCFIR filter priority( 14, 17, 37 ), # ensure UEs handled before NCEs /** MCAECCFIR[8] * Mainline read NCE */ - (rMCAECCFIR, bit(8)) ? mainline_nce_handling; + (rMCAECCFIR, bit(8)) ? mainline_nce_tce_handling; /** MCAECCFIR[9] * Mainline read TCE */ - (rMCAECCFIR, bit(9)) ? mainline_tce_handling; + (rMCAECCFIR, bit(9)) ? mainline_nce_tce_handling; + + /** MCAECCFIR[8:9] + * Mainline read NCE and TCE + */ + (rMCAECCFIR, bit(8,9)) ? mainline_nce_tce_handling; /** MCAECCFIR[10] * Mainline read SCE diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule index a0a5d6044b0..440f1ef5bb0 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule @@ -33,11 +33,8 @@ actionclass verify_chip_mark_5 { funccall("AnalyzeFetchMpe_5"); }; actionclass verify_chip_mark_6 { funccall("AnalyzeFetchMpe_6"); }; actionclass verify_chip_mark_7 { funccall("AnalyzeFetchMpe_7"); }; -/** Mainline NCE handling */ -actionclass mainline_nce_handling { funccall("AnalyzeFetchNce"); }; - -/** Mainline TCE handling */ -actionclass mainline_tce_handling { funccall("AnalyzeFetchTce"); }; +/** Mainline NCE/TCE handling */ +actionclass mainline_nce_tce_handling { funccall("AnalyzeFetchNceTce"); }; /** Mainline UE handling */ actionclass mainline_ue_handling_UERE