Skip to content

Commit

Permalink
PRD: Consolidate memory UE handling
Browse files Browse the repository at this point in the history
Change-Id: I925c5bd3db25bcd2d78b3353e4374a625d247672
RTC: 173423
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/39781
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/39966
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed May 3, 2017
1 parent b670c4e commit 4ac944b
Show file tree
Hide file tree
Showing 7 changed files with 213 additions and 133 deletions.
15 changes: 14 additions & 1 deletion src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2016 */
/* Contributors Listed Below - COPYRIGHT 2016,2017 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -86,6 +86,19 @@ class MemAddr
template<TARGETING::TYPE T>
static MemAddr fromMaintAddr( uint64_t i_addr );

/**
 * @brief  Builds a MemAddr when only a MemRank is available.
 *
 * Occasionally a MemAddr is required, but all that is known is the rank.
 * This mostly happens when adding FFDC during Targeted Diagnostics
 * procedures. The returned address carries the given rank, while the bank,
 * row, and column fields are filled with the invalid placeholder values
 * 0xff, 0xffff, and 0xffff respectively.
 *
 * @param  i_rank A valid rank for this address.
 * @return A MemAddr containing i_rank and invalid bank, row, and column
 *         addresses.
 */
static MemAddr fromRank( const MemRank & i_rank )
{
    MemAddr o_addr( i_rank, 0xff, 0xffff, 0xffff );
    return o_addr;
}

/** @return This address's rank. */
const MemRank& getRank() const { return iv_rnk; }

Expand Down
211 changes: 135 additions & 76 deletions src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
Expand Up @@ -43,6 +43,114 @@ namespace MemEcc

//------------------------------------------------------------------------------

/**
 * @brief  Common memory UE handling used by the handleMemUe() specializations.
 *
 * Calls out the rank of the failing address, flags the error log as a
 * service call, and records the address in the chip's UE table. The dynamic
 * rank deallocation for Hostboot runtime is still disabled (see TODO below).
 *
 * @tparam T       Target type of i_chip (only referenced by the disabled
 *                 deallocation code at the moment).
 * @tparam D       Pointer type of the data bundle owned by i_chip.
 * @param  i_chip  Chip reporting the UE.
 * @param  i_addr  Failed address.
 * @param  i_type  The type of UE to record in the UE table.
 * @param  io_sc   The step code data struct.
 * @return SUCCESS. No active code path currently changes the return code.
 */
template<TARGETING::TYPE T, typename D>
uint32_t __handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr,
                        UE_TABLE::Type i_type, STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[MemEcc::__handleMemUe] "

    uint32_t o_rc = SUCCESS;

    // The rank containing the failing address is the hardware callout.
    const MemRank ueRank = i_addr.getRank();
    MemoryMru rankMru( i_chip->getTrgt(), ueRank,
                       MemoryMruData::CALLOUT_RANK );
    io_sc.service_data->SetCallout( rankMru );

    // All memory UEs should be customer viewable.
    io_sc.service_data->setServiceCall();

    // Record this UE in the chip's UE table.
    D dataBundle = static_cast<D>( i_chip->getDataBundle() );
    dataBundle->iv_ueTable.addEntry( i_type, i_addr );

    #ifdef __HOSTBOOT_RUNTIME

    /* TODO RTC 136129
    // Dynamically deallocate the rank.
    o_rc = MemDealloc::rank<T>( i_chip, ueRank );
    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( PRDF_FUNC "MemDealloc::rank<T>(0x%08x,m%ds%d) failed",
                  i_chip->getHuid(), ueRank.getMaster(), ueRank.getSlave() );
    }
    */

    #endif

    return o_rc;

    #undef PRDF_FUNC
}

/**
 * @brief  MCA specialization of memory UE handling.
 *
 * Reads DDRPHYFIR first. If any of bits 54:55 or 57:59 are set, the UE is
 * treated as a side-effect of a DDRPHY error: the MCA itself is called out
 * and the log is flagged as a service call. Otherwise the common UE handling
 * in __handleMemUe() is performed.
 *
 * @param  i_chip  MCA chip (asserted non-null and of type TYPE_MCA).
 * @param  i_addr  Failed address.
 * @param  i_type  The type of UE.
 * @param  io_sc   The step code data struct.
 * @return Non-SUCCESS if the register read or __handleMemUe() fails, SUCCESS
 *         otherwise.
 */
template<>
uint32_t handleMemUe<TYPE_MCA>( ExtensibleChip * i_chip, const MemAddr & i_addr,
                                UE_TABLE::Type i_type,
                                STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[MemEcc::handleMemUe] "

    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_MCA == i_chip->getType() );

    // The DDRPHYFIR is needed to tell a real UE from a side-effect UE.
    SCAN_COMM_REGISTER_CLASS * ddrPhyFir = i_chip->getRegister("DDRPHYFIR");
    uint32_t o_rc = ddrPhyFir->Read();
    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( PRDF_FUNC "Read() failed on DDRPHYFIR: i_chip=0x%08x",
                  i_chip->getHuid() );
        return o_rc;
    }

    // DDRPHYFIR[54:55,57:59]: bits 54:59 right-justified, masked with 0x37
    // so that bit 56 is ignored.
    const bool isSideEffect =
                    ( 0 != (ddrPhyFir->GetBitFieldJustified(54,6) & 0x37) );

    if ( !isSideEffect )
    {
        // Genuine memory UE. Do the common handling.
        o_rc = __handleMemUe<TYPE_MCA,McaDataBundle *>( i_chip, i_addr,
                                                        i_type, io_sc );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "__handleMemUe(0x%08x,%d) failed",
                      i_chip->getHuid(), i_type );
        }

        return o_rc;
    }

    // Side-effect of a DDRPHY error. Callout the MCA instead of the rank.
    PRDF_TRAC( PRDF_FUNC "Memory UE is side-effect of DDRPHY error" );
    io_sc.service_data->SetCallout( i_chip->getTrgt() );
    io_sc.service_data->setServiceCall();

    return o_rc;

    #undef PRDF_FUNC
}

/* TODO RTC 157888
template<>
uint32_t handleMemUe<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr,
UE_TABLE::Type i_type,
STEP_CODE_DATA_STRUCT & io_sc )
{
PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( TYPE_MBA == i_chip->getType() );
return __handleMemUe<TYPE_MBA,CenMbaDataBundle *>( i_chip, i_addr,
i_type, io_sc );
}
*/

//------------------------------------------------------------------------------

#ifdef __HOSTBOOT_MODULE

uint32_t maskMemPort( ExtensibleChip * i_chip )
Expand Down Expand Up @@ -127,60 +235,6 @@ uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc)

//------------------------------------------------------------------------------

/**
 * @brief  MCA specialization: makes the hardware callout for a memory UE.
 *
 * Reads DDRPHYFIR. If the read succeeds and any of bits 54:55 or 57:59 are
 * set, the UE is treated as a side-effect of a DDRPHY error and the MCA is
 * called out. In every other case, including a failed register read, the
 * rank is called out.
 *
 * @param  i_chip  MCA chip (asserted non-null and of type TYPE_MCA).
 * @param  i_rank  Target rank.
 * @param  io_sc   The step code data struct.
 */
template<>
void calloutMemUe<TYPE_MCA>( ExtensibleChip * i_chip, const MemRank & i_rank,
                             STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[MemEcc::calloutMemUe] "

    // Check for a null chip before it is dereferenced below.
    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_MCA == i_chip->getType() );

    SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister( "DDRPHYFIR" );
    int32_t l_rc = fir->Read();
    if ( SUCCESS != l_rc )
    {
        // Not fatal. The rank callout below is used as the fallback.
        PRDF_ERR( PRDF_FUNC "Read() failed on DDRPHYFIR: i_chip=0x%08x",
                  i_chip->getHuid() );
    }

    // Check DDRPHYFIR[54:55,57:59] to determine if this UE is a side-effect.
    // Bits 54:59 are right-justified and masked with 0x37 to ignore bit 56.
    if ( SUCCESS == l_rc && (0 != (fir->GetBitFieldJustified(54,6) & 0x37)) )
    {
        // Callout the MCA.
        io_sc.service_data->SetCallout( i_chip->getTrgt() );
    }
    else
    {
        // Callout the rank anyway.
        MemoryMru memmru ( i_chip->getTrgt(), i_rank,
                           MemoryMruData::CALLOUT_RANK );
        io_sc.service_data->SetCallout( memmru );
    }

    #undef PRDF_FUNC
}

/**
 * @brief  MBA specialization: makes the hardware callout for a memory UE.
 *
 * Always calls out the given rank.
 *
 * @param  i_chip  MBA chip (asserted non-null and of type TYPE_MBA).
 * @param  i_rank  Target rank.
 * @param  io_sc   The step code data struct.
 */
template<>
void calloutMemUe<TYPE_MBA>( ExtensibleChip * i_chip, const MemRank & i_rank,
                             STEP_CODE_DATA_STRUCT & io_sc )
{
    // Check for a null chip before it is dereferenced below.
    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_MBA == i_chip->getType() );

    // TODO: RTC 169933 During Memory Diagnostics we'll want to call the
    //       mssIplUeIsolation() HWP so that we can isolate to a single DIMM if
    //       possible. This may be a difficult task to do at this point in the
    //       code because it will run a maintenance command on the Centaur,
    //       which may require some cleanup of the previous command. Since there
    //       are no plans to support IS DIMMs attached to a Centaur in P9, we
    //       may be able to get rid of this requirement because the FRU will be
    //       the same regardless if one or two logical DIMMs are called out.

    MemoryMru memmru ( i_chip->getTrgt(), i_rank, MemoryMruData::CALLOUT_RANK );
    io_sc.service_data->SetCallout( memmru );
}

//------------------------------------------------------------------------------

#ifdef __HOSTBOOT_MODULE

template<TARGETING::TYPE T, typename D>
Expand Down Expand Up @@ -401,6 +455,23 @@ uint32_t handleMemCe( ExtensibleChip * i_chip, const MemAddr & i_addr,
o_doTps = ( 0 != (MemCeTable<T>::FIELD_TH_ALL & ceTableRc) );
}

#ifdef __HOSTBOOT_RUNTIME

/* TODO RTC 136129
if ( i_isHard )
{
// Dynamically deallocate the page.
o_rc = MemDealloc::page<T>( i_chip, i_addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "MemDealloc::page(0x%08x) failed",
i_chip->getHuid() );
}
}
*/

#endif

return o_rc;

#undef PRDF_FUNC
Expand Down Expand Up @@ -601,40 +672,28 @@ uint32_t analyzeFetchUe( ExtensibleChip * i_chip,
break;
}

// Add address to UE table.
D db = static_cast<D>(i_chip->getDataBundle());
db->iv_ueTable.addEntry( UE_TABLE::FETCH_UE, addr );

// Make the hardware callout.
MemRank rank = addr.getRank();
calloutMemUe<T>( i_chip, rank, io_sc );
// Do memory UE handling.
o_rc = MemEcc::handleMemUe<T>( i_chip, addr, UE_TABLE::FETCH_UE, io_sc);
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemUe<T>(0x%08x) failed",
i_chip->getHuid() );
break;
}

#ifdef __HOSTBOOT_RUNTIME

// Add a TPS request to the TD queue and ban any further TPS requests
// for this rank.
MemRank rank = addr.getRank();
o_rc = addTpsEvent<T,D>( i_chip, rank, io_sc, true );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "addTpsEvent() failed: i_chip=0x%08x "
"rank=%d,%d", i_chip->getHuid(), rank.getMaster(),
rank.getSlave() );
// NOTE: We are not adding a break here because we still want to do
// dynamic memory deallocation of the rank. Any code added
// after this will need to handle return codes judiciously.
}

/* TODO RTC 136129
// Dynamically deallocate the rank.
uint32_t dealloc_rc = MemDealloc::rank<T>( i_chip, rank );
if ( SUCCESS != dealloc_rc )
{
PRDF_ERR( PRDF_FUNC "MemDealloc::rank() failed: i_chip=0x%08x "
"rank=m%ds%d", i_chip->getHuid(), rank.getMaster(),
rank.getSlave() );
o_rc = dealloc_rc; break;
break;
}
*/

#endif // __HOSTBOOT_RUNTIME

Expand Down
24 changes: 17 additions & 7 deletions src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H
Expand Up @@ -48,8 +48,9 @@ namespace MemEcc
{

/**
* @brief Adds the given symbol to the callout list and CE table. Returns true
* if TPS is required.
* @brief Adds the memory CE to the callout list and CE table. Will also issue
* dynamic memory deallocation when appropriate. Returns true if TPS is
* required.
* @param i_chip MCA or MBA.
* @param i_addr Failed address.
* @param i_symbol Failed symbol.
Expand All @@ -64,15 +65,24 @@ uint32_t handleMemCe( ExtensibleChip * i_chip, const MemAddr & i_addr,
STEP_CODE_DATA_STRUCT & io_sc, bool i_isHard = false );

/**
* @brief Will check if the UE is a side-effect attention and make a callout
* appropriately.
* @brief Adds the memory UE to the callout list and UE table. Makes the error
* log predictive. Will also issue dynamic memory deallocation when
* appropriate.
*
* For UEs on Nimbus chips, it is possible that this UE is a side-effect of
* DDRPHY attentions. If so, the MCA will be added to the callout list instead
* of the DIMMs, the UE table will not be updated and no dynamic memory
* deallocation.
*
* @param i_chip MCA or MBA.
* @param i_rank Target rank.
* @param i_addr Failed address.
* @param i_type The type of UE.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
template<TARGETING::TYPE T>
void calloutMemUe( ExtensibleChip * i_chip, const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc );
uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr,
UE_TABLE::Type i_type, STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Analyzes a fetch MPE attention.
Expand Down
14 changes: 9 additions & 5 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
Expand Up @@ -168,11 +168,15 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintUE );
io_sc.service_data->setSignature( huid, PRDFSIG_MaintUE );

// Add the rank to the callout list.
MemEcc::calloutMemUe<T>( i_chip, rank, io_sc );

// Make the error log predictive.
io_sc.service_data->setServiceCall();
// Do memory UE handling.
o_rc = MemEcc::handleMemUe<T>( i_chip, i_addr, UE_TABLE::SCRUB_UE,
io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemUe<T>(0x%08x) failed",
i_chip->getHuid() );
break;
}
}
else if ( 0 != (eccAttns & MAINT_MPE) )
{
Expand Down
37 changes: 8 additions & 29 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
Expand Up @@ -527,16 +527,6 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
io_sc.service_data->setSignature( huid, PRDFSIG_MaintHARD_CTE );
io_sc.service_data->setServiceCall();
}

/* TODO RTC 136129
// Dynamically deallocate the page.
o_rc = MemDealloc::page<T>( i_chip, i_addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "MemDealloc::page(0x%08x) failed", huid );
break;
}
*/
}

if ( 0 != (eccAttns & MAINT_MPE) )
Expand Down Expand Up @@ -591,30 +581,19 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
// signature as well.
io_sc.service_data->setSignature( huid, PRDFSIG_MaintUE );

// Add entry to UE table.
D db = static_cast<D>(i_chip->getDataBundle());
db->iv_ueTable.addEntry( UE_TABLE::SCRUB_UE, i_addr );

// Add the rank to the callout list.
MemEcc::calloutMemUe<T>( i_chip, rank, io_sc );

// Make the error log predictive.
io_sc.service_data->setServiceCall();

// Add a TPS procedure to the queue.
TdEntry * e = new TpsEvent<T>{ i_chip, rank };
io_queue.push( e );

/* TODO RTC 136129
// Dynamically deallocate the rank.
o_rc = MemDealloc::rank<T>( i_chip, rank );
o_rc = MemEcc::handleMemUe<T>( i_chip, i_addr, UE_TABLE::SCRUB_UE,
io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "MemDealloc::rank(0x%08x, m%ds%d) failed",
huid, rank.getMaster(), rank.getSlave() );
PRDF_ERR( PRDF_FUNC "handleMemUe<T>(0x%08x) failed",
i_chip->getHuid() );
break;
}
*/

// Add a TPS procedure to the queue.
TdEntry * e = new TpsEvent<T>{ i_chip, rank };
io_queue.push( e );
}

} while (0);
Expand Down

0 comments on commit 4ac944b

Please sign in to comment.