Skip to content

Commit

Permalink
PRD: Fetch NCE/TCE redesign
Browse files Browse the repository at this point in the history
NCEs and TCEs use the same error vector register, which stores the
latest NCE/TCE. So it is possible that PRD could be handling one
attention and the vector could be overwritten by a subsequent attention.

Change-Id: Ia6955bc1f0a258e1450c426cec8c466a56b43432
CQ: SW392312
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/42013
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/42210
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed Jun 22, 2017
1 parent f1b00fd commit 676eae6
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 193 deletions.
176 changes: 61 additions & 115 deletions src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
Expand Up @@ -513,69 +513,10 @@ uint32_t handleMemCe( ExtensibleChip * i_chip, const MemAddr & i_addr,
//------------------------------------------------------------------------------

/**
 * @brief  Handles one symbol of a fetch NCE/TCE attention: reads the failing
 *         symbol from hardware, passes it to handleMemCe(), and queues a TPS
 *         procedure if handleMemCe() requests one.
 * @param  i_chip  The chip reporting the attention.
 * @param  i_addr  The failing address. Only its rank is used for the symbol
 *                 lookup and the TPS request; the full address is passed to
 *                 handleMemCe().
 * @param  io_sc   The step code data struct.
 * @param  i_isTce Forwarded to getMemReadSymbol() to select which symbol is
 *                 read (true for the TCE symbol, false for the NCE symbol).
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. A
 *         failure from addTpsEvent() is traced and also returned.
 */
template<TARGETING::TYPE T, typename D>
uint32_t __analyzeFetchNceTce( ExtensibleChip * i_chip, const MemAddr & i_addr,
                               STEP_CODE_DATA_STRUCT & io_sc,
                               bool i_isTce = false )
{
    #define PRDF_FUNC "[MemEcc::__analyzeFetchNceTce] "

    uint32_t o_rc = SUCCESS;

    do
    {
        // Get the symbol of the failure.
        MemSymbol symbol;
        o_rc = getMemReadSymbol<T>( i_chip, i_addr.getRank(), symbol, i_isTce );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) failed",
                      i_chip->getHuid() );
            break;
        }

        // Add the symbol to the callout list and CE table.
        // Start from a known value so the check below never reads an
        // indeterminate bool if handleMemCe() returns SUCCESS without
        // writing its output parameter.
        bool doTps = false;
        o_rc = handleMemCe<T,D>( i_chip, i_addr, symbol, doTps, io_sc );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x) failed",
                      i_chip->getHuid() );
            break;
        }

        // Initiate a TPS procedure, if needed.
        if ( doTps )
        {
            #ifdef __HOSTBOOT_RUNTIME

            // If a MNFG threshold has been reached (predictive callout), we
            // will still try to start TPS just in case MNFG disables the
            // termination policy.

            o_rc = addTpsEvent<T,D>( i_chip, i_addr.getRank(), io_sc );
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "addTpsEvent(0x%08x) failed",
                          i_chip->getHuid() );
            }

            #endif
        }

    } while (0);

    return o_rc;

    #undef PRDF_FUNC
}

//------------------------------------------------------------------------------

template<TARGETING::TYPE T, typename D>
uint32_t analyzeFetchNce( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemEcc::analyzeFetchNce] "
#define PRDF_FUNC "[MemEcc::analyzeFetchNceTce] "

PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( T == i_chip->getType() );
Expand All @@ -593,72 +534,70 @@ uint32_t analyzeFetchNce( ExtensibleChip * i_chip,
i_chip->getHuid() );
break;
}
MemRank rank = addr.getRank();

// Complete analysis.
o_rc = __analyzeFetchNceTce<T,D>( i_chip, addr, io_sc );
// Get the symbols for the NCE/TCE attention.
MemSymbol sym1, sym2;
o_rc = getMemReadSymbol<T>( i_chip, rank, sym1, sym2 );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "__analyzeFetchNceTce(0x%08x) failed",
PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) failed",
i_chip->getHuid() );
break;
}

} while (0);

// Add ECC capture data for FFDC.
MemCaptureData::addEccData<T>( i_chip, io_sc );

return o_rc;

#undef PRDF_FUNC
}

// To resolve template linker errors.
template
uint32_t analyzeFetchNce<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );

//------------------------------------------------------------------------------

template<TARGETING::TYPE T, typename D>
uint32_t analyzeFetchTce( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemEcc::analyzeFetchTce] "

PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( T == i_chip->getType() );

uint32_t o_rc = SUCCESS;

do
{
// Get the address of the failure.
MemAddr addr;
o_rc = getMemReadAddr<T>( i_chip, MemAddr::READ_NCE_ADDR, addr );
if ( SUCCESS != o_rc )
// Add the first symbol to the callout list and CE table.
bool doTps = false;
if ( sym1.isValid() )
{
PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x) failed",
i_chip->getHuid() );
o_rc = handleMemCe<T,D>( i_chip, addr, sym1, doTps, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x,0x%02x,%d) failed",
i_chip->getHuid(), rank.getKey(), sym1.getSymbol() );
break;
}
}
else
{
// The first symbol should always be valid.
PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) returned an invalid "
"symbol", i_chip->getHuid() );
o_rc = FAIL;
break;
}

// Complete analysis for first symbol.
o_rc = __analyzeFetchNceTce<T,D>( i_chip, addr, io_sc );
if ( SUCCESS != o_rc )
// Add the second symbol to the callout list and CE table, if it exists.
if ( sym2.isValid() )
{
PRDF_ERR( PRDF_FUNC "first __analyzeFetchNceTce(0x%08x) failed",
i_chip->getHuid() );
break;
bool tmp;
o_rc = handleMemCe<T,D>( i_chip, addr, sym2, tmp, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x,0x%02x,%d) failed",
i_chip->getHuid(), rank.getKey(), sym2.getSymbol() );
break;
}
if ( tmp ) doTps = true;
}

// Complete analysis for second symbol.
o_rc = __analyzeFetchNceTce<T,D>( i_chip, addr, io_sc, true );
if ( SUCCESS != o_rc )
// Initiate a TPS procedure, if needed.
if ( doTps )
{
PRDF_ERR( PRDF_FUNC "second __analyzeFetchNceTce(0x%08x) failed",
i_chip->getHuid() );
break;
#ifdef __HOSTBOOT_RUNTIME

// If a MNFG threshold has been reached (predictive callout), we
// will still try to start TPS just in case MNFG disables the
// termination policy.

o_rc = addTpsEvent<T,D>( i_chip, rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "addTpsEvent(0x%08x,0x%02x) failed",
i_chip->getHuid(), rank.getKey() );
}

#endif
}

} while (0);
Expand All @@ -673,7 +612,7 @@ uint32_t analyzeFetchTce( ExtensibleChip * i_chip,

// To resolve template linker errors.
template
uint32_t analyzeFetchTce<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
uint32_t analyzeFetchNceTce<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );

//------------------------------------------------------------------------------
Expand Down Expand Up @@ -929,7 +868,7 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "Read() failed on MSR: i_chip=0x%08x",
i_chip->getHuid() );
i_chip->getHuid() );
break;
}

Expand All @@ -944,6 +883,13 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )

// get symbol and DRAM
MemSymbol symbol = MemSymbol::fromGalois( trgt, rank, galois );
if ( !symbol.isValid() )
{
PRDF_ERR( PRDF_FUNC "Galois 0x%02x from MSR is invalid: 0x%08x,"
"0x%02x", galois, i_chip->getHuid(), rank.getKey() );
o_rc = FAIL;
break;
}
uint8_t dram = symbol.getDram();

// Add the DIMM to the callout list
Expand Down
16 changes: 3 additions & 13 deletions src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H
Expand Up @@ -117,24 +117,14 @@ uint32_t analyzeFetchMpe( ExtensibleChip * i_chip, const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Analyzes a fetch NCE attention.
* @brief Analyzes a fetch NCE/TCE attention.
* @param i_chip MCA or MBA.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
template<TARGETING::TYPE T, typename D>
uint32_t analyzeFetchNce( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Analyzes a fetch TCE attention.
* @param i_chip MCA or MBA.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
template<TARGETING::TYPE T, typename D>
uint32_t analyzeFetchTce( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );
uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Analyzes a fetch UE attention.
Expand Down
51 changes: 18 additions & 33 deletions src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.C
Expand Up @@ -64,10 +64,10 @@ MemSymbol::MemSymbol( TARGETING::TargetHandle_t i_trgt, const MemRank & i_rank,
iv_trgt(i_trgt), iv_rank(i_rank), iv_symbol(i_symbol),
iv_pins(i_pins), iv_isDramSpared(false), iv_isEccSpared(false)
{
PRDF_ASSERT( NULL != i_trgt );
PRDF_ASSERT( nullptr != i_trgt );
PRDF_ASSERT( TYPE_MBA == getTargetType(i_trgt) ||
TYPE_MCA == getTargetType(i_trgt) );
PRDF_ASSERT( i_symbol < SYMBOLS_PER_RANK );
// Allowing an invalid symbol. Use isValid() to check validity.
PRDF_ASSERT( i_pins <= CEN_SYMBOL::BOTH_SYMBOL_DQS );
}

Expand All @@ -87,8 +87,6 @@ MemSymbol MemSymbol::fromGalois( TargetHandle_t i_trgt, const MemRank & i_rank,
}
}

PRDF_ASSERT( symbol < SYMBOLS_PER_RANK );

// Get pins from mask.
uint8_t pins = NO_SYMBOL_DQS;
if ( TYPE_MBA == getTargetType(i_trgt) )
Expand Down Expand Up @@ -160,7 +158,7 @@ uint8_t MemSymbol::getDramPins() const
template<>
uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,
const MemRank & i_rank,
MemSymbol & o_symbol, bool i_isTce )
MemSymbol & o_sym1, MemSymbol & o_sym2 )
{
#define PRDF_FUNC "[getMemReadSymbol<TYPE_MBA>] "

Expand All @@ -170,6 +168,8 @@ uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,

uint32_t o_rc = SUCCESS;

o_sym1 = o_sym2 = MemSymbol(); // both initially invalid

do
{
// Get the NCE/TCE galois and mask from hardware.
Expand All @@ -190,22 +190,15 @@ uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,
break;
}

uint32_t bitPos = (mcaRelMcs * 32) + (i_isTce ? 16 : 0);

uint8_t galois = reg->GetBitFieldJustified( bitPos, 8 );
uint8_t mask = reg->GetBitFieldJustified( bitPos + 8, 8 );
uint32_t bitPos = mcaRelMcs * 32;
uint8_t g1 = reg->GetBitFieldJustified( bitPos, 8 );
uint8_t m1 = reg->GetBitFieldJustified( bitPos + 8, 8 );
uint8_t g2 = reg->GetBitFieldJustified( bitPos + 16, 8 );
uint8_t m2 = reg->GetBitFieldJustified( bitPos + 24, 8 );

// Get the NCE/TCE symbol.
o_symbol = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, galois,
mask );
if ( !o_symbol.isValid() )
{
PRDF_ERR( PRDF_FUNC "fromGalois(0x%08x,m%ds%d,0x%02x,0x%02x) "
"failed", i_chip->getHuid(), i_rank.getMaster(),
i_rank.getSlave(), galois, mask );
o_rc = FAIL;
break;
}
o_sym1 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g1, m1 );
o_sym2 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g2, m2 );

// TODO: RTC 157888 Check if the symbol is on a spare DRAM.

Expand All @@ -221,17 +214,18 @@ uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,
template<>
uint32_t getMemReadSymbol<TYPE_MBA>( ExtensibleChip * i_chip,
const MemRank & i_rank,
MemSymbol & o_symbol, bool i_isTce )
MemSymbol & o_sym1, MemSymbol & o_sym2 )
{
#define PRDF_FUNC "[getMemReadSymbol<TYPE_MBA>] "

// Check parameters
PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( TYPE_MBA == i_chip->getType() );
PRDF_ASSERT( !i_isTce ); // TCEs do not exist on Centaur

uint32_t o_rc = SUCCESS;

o_sym1 = o_sym2 = MemSymbol(); // both initially invalid

do
{
// Get the NCE galois and mask from hardware.
Expand All @@ -249,20 +243,11 @@ uint32_t getMemReadSymbol<TYPE_MBA>( ExtensibleChip * i_chip,
break;
}

uint8_t galois = reg->GetBitFieldJustified( 40, 8 );
uint8_t mask = reg->GetBitFieldJustified( 32, 8 );
uint8_t g1 = reg->GetBitFieldJustified( 40, 8 );
uint8_t m1 = reg->GetBitFieldJustified( 32, 8 );

// Get the NCE symbol.
o_symbol = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, galois,
mask );
if ( !o_symbol.isValid() )
{
PRDF_ERR( PRDF_FUNC "fromGalois(0x%08x,m%ds%d,0x%02x,0x%02x) "
"failed", i_chip->getHuid(), i_rank.getMaster(),
i_rank.getSlave(), galois, mask );
o_rc = FAIL;
break;
}
o_sym1 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g1, m1 );

// TODO: RTC 157888 Check if the symbol is on a spare DRAM.

Expand Down
11 changes: 7 additions & 4 deletions src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.H
Expand Up @@ -182,14 +182,17 @@ class MemSymbol
* @brief Reads the memory NCE/TCE vector trap register from hardware.
* @param i_chip MCA or MBA.
* @param i_rank The rank this symbol is on.
* @param o_symbol The returned symbol.
* @param i_isTce Only applies to MCA. True if the TCE symbol is wanted. False
* if the NCE symbol is wanted (default).
* @param o_sym1 The first symbol. Should always be valid for both NCE/TCE.
* @param o_sym2 The second symbol. Only valid for TCEs.
* @note For MCAs, both NCEs and TCEs report to the same error vector and only
* the latest NCE/TCE is recorded. Therefore, it is possible that PRD
* handles a TCE attention, but only one symbol is found because a NCE
* was reported afterwards, wiping out the error vector for the TCE.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
template<TARGETING::TYPE T>
uint32_t getMemReadSymbol( ExtensibleChip * i_chip, const MemRank & i_rank,
MemSymbol & o_symbol, bool i_isTce = false );
MemSymbol & o_sym1, MemSymbol & o_sym2 );

} // end namespace PRDF

Expand Down

0 comments on commit 676eae6

Please sign in to comment.