Skip to content

Commit

Permalink
PRD: generic function for IUE attention handling
Browse files Browse the repository at this point in the history
Change-Id: I0ed418f3934aaceee0e3949ad91af45879f9004d
RTC: 173944
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/40423
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/40228
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed May 19, 2017
1 parent 513e460 commit a12b4ce
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 102 deletions.
174 changes: 89 additions & 85 deletions src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
Expand Up @@ -153,19 +153,49 @@ uint32_t handleMemUe<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr,

#ifdef __HOSTBOOT_MODULE

uint32_t maskMemPort( ExtensibleChip * i_chip )
template<>
uint32_t maskMemPort<TYPE_MCA>( ExtensibleChip * i_chip )
{
#define PRDF_FUNC "[MemEcc::maskMemPort] "
#define PRDF_FUNC "[MemEcc::maskMemPort<TYPE_MCA>] "

PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( TYPE_MCA == i_chip->getType() );

SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR");
SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR");
SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR");
uint32_t o_rc = SUCCESS;

do
{
// Mask all FIRs on the port.
SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR");
SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR");
SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR");

c->setAllBits(); d->setAllBits(); e->setAllBits();

o_rc = c->Write() | d->Write() | e->Write();
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", i_chip->getHuid() );
break;
}

#ifdef __HOSTBOOT_RUNTIME

c->setAllBits(); d->setAllBits(); e->setAllBits();
/* TODO RTC 136129
// Dynamically deallocate the port.
o_rc = MemDealloc::port<TYPE_MCA>( i_chip );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "MemDealloc::port<TYPE_MCA>(0x%08x) failed",
i_chip->getHuid() );
}
*/

#endif

return ( c->Write() | d->Write() | e->Write() );
} while (0);

return o_rc;

#undef PRDF_FUNC
}
Expand All @@ -176,10 +206,13 @@ uint32_t maskMemPort( ExtensibleChip * i_chip )

#ifdef __HOSTBOOT_RUNTIME

uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc)
template<>
uint32_t iuePortFail<TYPE_MCA>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemEcc::iuePortFail] "
#define PRDF_FUNC "[MemEcc::iuePortFail<TYPE_MCA>] "

PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( TYPE_MCA == i_chip->getType() );

uint32_t o_rc = SUCCESS;
Expand Down Expand Up @@ -714,89 +747,68 @@ uint32_t analyzeFetchUe<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,

//------------------------------------------------------------------------------

#ifdef __HOSTBOOT_MODULE

template<TARGETING::TYPE T, typename D>
uint32_t __analyzeIue( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc,
MemAddr i_addr )
uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemEcc::__analyzeIue] "
#define PRDF_FUNC "[MemEcc::handleMemIue] "

PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( T == i_chip->getType() );

uint32_t o_rc = SUCCESS;

do
{
// get data bundle from chip
D db = static_cast<D>( i_chip->getDataBundle() );
// Add the DIMM to the callout list.
MemoryMru mm { i_chip->getTrgt(), i_rank, MemoryMruData::CALLOUT_RANK };
io_sc.service_data->SetCallout( mm );

// get the rank
MemRank rank = i_addr.getRank();
#ifdef __HOSTBOOT_MODULE

TargetHandle_t trgt = i_chip->getTrgt();
do
{
// Nothing else to do if handling a system checkstop.
if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() ) break;

// Add the DIMM to the callout list
MemoryMru memmru(trgt, rank, MemoryMruData::CALLOUT_RANK);
io_sc.service_data->SetCallout( memmru );
// Get the data bundle from chip.
D db = static_cast<D>( i_chip->getDataBundle() );

uint8_t ds = rank.getDimmSlct();
// Get the DIMM select.
uint8_t ds = i_rank.getDimmSlct();

// Initialize threshold if it doesn't exist yet
// Initialize threshold if it doesn't exist yet.
if ( 0 == db->iv_iueTh.count(ds) )
{
db->iv_iueTh[ds] = TimeBasedThreshold( getIueTh() );
}

// increment the threshold - check if at threshold
// Increment the count and check if at threshold.
if ( db->iv_iueTh[ds].inc(io_sc) )
{
// Make the error log predictive
// Make the error log predictive.
io_sc.service_data->setServiceCall();

#ifdef __HOSTBOOT_RUNTIME

/* TODO RTC 136129
// Dynamically deallocate the rank.
uint32_t dealloc_rc = MemDealloc::rank<T>( i_chip, rank );
if ( SUCCESS != dealloc_rc )
{
PRDF_ERR( PRDF_FUNC "MemDealloc::rank() failed: i_chip=0x%08x "
"rank=m%ds%d", i_chip->getHuid(), rank.getMaster(),
rank.getSlave() );
o_rc = dealloc_rc; break;
}
*/

#endif // __HOSTBOOT_RUNTIME
// The port fail will be triggered in the PostAnalysis plugin after
// the error log has been committed.

// mask off the entire port to avoid collateral
o_rc = maskMemPort( i_chip );
// Mask off the entire port to avoid collateral.
o_rc = MemEcc::maskMemPort<T>( i_chip );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x",
i_chip->getHuid() );
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort<T>(0x%08x) failed",
i_chip->getHuid() );
break;
}

// Port fail will be triggered in PostAnalysis after the error log
// has been committed.
}

}while(0);
} while (0);

#endif // __HOSTBOOT_MODULE

return o_rc;

#undef PRDF_FUNC
}

// To resolve template linker errors.
template
uint32_t __analyzeIue<TYPE_MCA, McaDataBundle*>(ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc,
MemAddr i_addr );

#endif // __HOSTBOOT_MODULE

//------------------------------------------------------------------------------

template<TARGETING::TYPE T, typename D>
Expand All @@ -805,44 +817,39 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip,
{
#define PRDF_FUNC "[MemEcc::analyzeMainlineIue] "

PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( T == i_chip->getType() );
uint32_t o_rc = SUCCESS;

#ifdef __HOSTBOOT_MODULE
uint32_t o_rc = SUCCESS;

do
{

// get the address of the failure
MemAddr addr;

// Use the address in MBRCER. This address also traps IRCDs, but it is
// not likely that we will have two independent failure modes at the
// same time. So we just assume the address is correct.
MemAddr addr;
o_rc = getMemReadAddr<T>( i_chip, MemAddr::READ_RCE_ADDR, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x, READ_RCE_ADDR) failed",
i_chip->getHuid() );
i_chip->getHuid() );
break;
}
MemRank rank = addr.getRank();

o_rc = __analyzeIue<T,D>( i_chip, io_sc, addr );
o_rc = handleMemIue<T,D>( i_chip, rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: 0x%08x",
i_chip->getHuid() );
PRDF_ERR( PRDF_FUNC "handleMemIue<T,D>(0x%08x,m%ds%d) failed",
i_chip->getHuid(), rank.getMaster(), rank.getSlave() );
break;
}

}while(0);

#endif
} while (0);

return o_rc;

#undef PRDF_FUNC

}

// To resolve template linker errors.
Expand All @@ -858,40 +865,37 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip,
{
#define PRDF_FUNC "[MemEcc::analyzeMaintIue] "

PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( T == i_chip->getType() );
uint32_t o_rc = SUCCESS;

#ifdef __HOSTBOOT_MODULE
uint32_t o_rc = SUCCESS;

do
{
// Use the current address in the MCBMCAT.
MemAddr addr;

// Use the current address in the MCBMCAT
o_rc = getMemMaintAddr<T>( i_chip, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
i_chip->getHuid() );
i_chip->getHuid() );
break;
}
MemRank rank = addr.getRank();

o_rc = __analyzeIue<T,D>( i_chip, io_sc, addr );
o_rc = handleMemIue<T,D>( i_chip, rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: "
"0x%08x", i_chip->getHuid() );
PRDF_ERR( PRDF_FUNC "handleMemIue<T,D>(0x%08x,m%ds%d) failed",
i_chip->getHuid(), rank.getMaster(), rank.getSlave() );
break;
}

}while(0);

#endif
} while (0);

return o_rc;

#undef PRDF_FUNC

}

// To resolve template linker errors.
Expand Down
36 changes: 30 additions & 6 deletions src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H
Expand Up @@ -84,6 +84,27 @@ template<TARGETING::TYPE T>
uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr,
UE_TABLE::Type i_type, STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Does mainline and maintenance IUE handling.
*
* Adds the memory IUE to the callout list. At threshold, will make the error
* log predictive. When threshold is reached at runtime there is a good chance
* these IUEs are going to lead to a data integrity issue. Therefore, the port
* will be forced to fail, the entire port will be masked off, and dynamic
* memory deallocation will be applied. Note that this function will not issue
* the port failure because it is possible that it may crash the host. Instead,
* the port failure is issued in the PostAnalysis plugin after the error log has
* been committed.
*
* @param i_chip MCA chip.
* @param i_rank Rank containing the IUE.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
template<TARGETING::TYPE T, typename D>
uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Analyzes a fetch MPE attention.
* @param i_chip MCA or MBA.
Expand Down Expand Up @@ -158,22 +179,25 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc );
#ifdef __HOSTBOOT_RUNTIME

/**
* @brief Will trigger a port fail if the number of IUEs is over threshold
* @param i_chip MCA chip
* @param io_sc The step code data struct.
* @brief Will trigger a port fail if the number of IUEs is over threshold.
* @param i_chip MCA chip
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise
*/
uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc);
template<TARGETING::TYPE T>
uint32_t iuePortFail( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc );

#endif // __HOSTBOOT_RUNTIME

#ifdef __HOSTBOOT_MODULE

/**
* @brief Will mask off the entire mem port
* @param i_chip MCA chip
* @brief Will mask off an entire memory port. At runtime will issue dynamic
* memory deallocation of the port.
* @param i_chip MCA chip
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise
*/
template<TARGETING::TYPE T>
uint32_t maskMemPort( ExtensibleChip * i_chip );

template<TARGETING::TYPE T, typename D>
Expand Down
13 changes: 5 additions & 8 deletions src/usr/diag/prdf/plat/mem/prdfP9Mca.C
Expand Up @@ -69,16 +69,14 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )

#ifdef __HOSTBOOT_RUNTIME


// If the IUE threshold in our data bundle has been reached, we trigger
// a port fail. Once we trigger the port fail, the system may crash
// right away. Since PRD is running in the hypervisor, it is possible we
// may not get the error log. To better our chances, we trigger the port
// fail here after the error log has been committed.
if ( SUCCESS != MemEcc::iuePortFail(i_chip, io_sc) )
if ( SUCCESS != MemEcc::iuePortFail<TYPE_MCA>(i_chip, io_sc) )
{
PRDF_ERR( PRDF_FUNC "iuePortFail failed: i_chip=0x%08x",
i_chip->getHuid() );
PRDF_ERR( PRDF_FUNC "iuePortFail(0x%08x) failed", i_chip->getHuid() );
}

#endif // __HOSTBOOT_RUNTIME
Expand Down Expand Up @@ -197,14 +195,13 @@ int32_t MemPortFailure( ExtensibleChip * i_chip,

if ( CHECK_STOP != io_sc.service_data->getPrimaryAttnType() )
{
// The port is dead mask off the entire port.
uint32_t l_rc = MemEcc::maskMemPort( i_chip );
// The port is dead. Mask off the entire port.
uint32_t l_rc = MemEcc::maskMemPort<TYPE_MCA>( i_chip );
if ( SUCCESS != l_rc )
{
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x",
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort<TYPE_MCA>(0x%08x) failed",
i_chip->getHuid() );
}

}

return SUCCESS; // nothing to return to rule code
Expand Down

0 comments on commit a12b4ce

Please sign in to comment.