Skip to content

Commit 676eae6

Browse files
committed
PRD: Fetch NCE/TCE redesign
NCEs and TCEs use the same error vector register, which stores only the latest NCE/TCE. It is therefore possible that, while PRD is handling one attention, the vector is overwritten by a subsequent attention.
Change-Id: Ia6955bc1f0a258e1450c426cec8c466a56b43432
CQ: SW392312
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/42013
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/42210
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
1 parent f1b00fd commit 676eae6

File tree

7 files changed

+103
-193
lines changed

7 files changed

+103
-193
lines changed

src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C

Lines changed: 61 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -513,69 +513,10 @@ uint32_t handleMemCe( ExtensibleChip * i_chip, const MemAddr & i_addr,
513513
//------------------------------------------------------------------------------
514514

515515
template<TARGETING::TYPE T, typename D>
516-
uint32_t __analyzeFetchNceTce( ExtensibleChip * i_chip, const MemAddr & i_addr,
517-
STEP_CODE_DATA_STRUCT & io_sc,
518-
bool i_isTce = false )
519-
{
520-
#define PRDF_FUNC "[MemEcc::__analyzeFetchNceTce] "
521-
522-
uint32_t o_rc = SUCCESS;
523-
524-
do
525-
{
526-
// Get the symbol of the failure.
527-
MemSymbol symbol;
528-
o_rc = getMemReadSymbol<T>( i_chip, i_addr.getRank(), symbol, i_isTce );
529-
if ( SUCCESS != o_rc )
530-
{
531-
PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) failed",
532-
i_chip->getHuid() );
533-
break;
534-
}
535-
536-
// Add the symbol to the callout list and CE table.
537-
bool doTps;
538-
o_rc = handleMemCe<T,D>( i_chip, i_addr, symbol, doTps, io_sc );
539-
if ( SUCCESS != o_rc )
540-
{
541-
PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x) failed",
542-
i_chip->getHuid() );
543-
break;
544-
}
545-
546-
// Initiate a TPS procedure, if needed.
547-
if ( doTps )
548-
{
549-
#ifdef __HOSTBOOT_RUNTIME
550-
551-
// If a MNFG threshold has been reached (predictive callout), we
552-
// will still try to start TPS just in case MNFG disables the
553-
// termination policy.
554-
555-
o_rc = addTpsEvent<T,D>( i_chip, i_addr.getRank(), io_sc );
556-
if ( SUCCESS != o_rc )
557-
{
558-
PRDF_ERR( PRDF_FUNC "addTpsEvent(0x%08x) failed",
559-
i_chip->getHuid() );
560-
}
561-
562-
#endif
563-
}
564-
565-
} while (0);
566-
567-
return o_rc;
568-
569-
#undef PRDF_FUNC
570-
}
571-
572-
//------------------------------------------------------------------------------
573-
574-
template<TARGETING::TYPE T, typename D>
575-
uint32_t analyzeFetchNce( ExtensibleChip * i_chip,
576-
STEP_CODE_DATA_STRUCT & io_sc )
516+
uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip,
517+
STEP_CODE_DATA_STRUCT & io_sc )
577518
{
578-
#define PRDF_FUNC "[MemEcc::analyzeFetchNce] "
519+
#define PRDF_FUNC "[MemEcc::analyzeFetchNceTce] "
579520

580521
PRDF_ASSERT( nullptr != i_chip );
581522
PRDF_ASSERT( T == i_chip->getType() );
@@ -593,72 +534,70 @@ uint32_t analyzeFetchNce( ExtensibleChip * i_chip,
593534
i_chip->getHuid() );
594535
break;
595536
}
537+
MemRank rank = addr.getRank();
596538

597-
// Complete analysis.
598-
o_rc = __analyzeFetchNceTce<T,D>( i_chip, addr, io_sc );
539+
// Get the symbols for the NCE/TCE attention.
540+
MemSymbol sym1, sym2;
541+
o_rc = getMemReadSymbol<T>( i_chip, rank, sym1, sym2 );
599542
if ( SUCCESS != o_rc )
600543
{
601-
PRDF_ERR( PRDF_FUNC "__analyzeFetchNceTce(0x%08x) failed",
544+
PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) failed",
602545
i_chip->getHuid() );
603546
break;
604547
}
605548

606-
} while (0);
607-
608-
// Add ECC capture data for FFDC.
609-
MemCaptureData::addEccData<T>( i_chip, io_sc );
610-
611-
return o_rc;
612-
613-
#undef PRDF_FUNC
614-
}
615-
616-
// To resolve template linker errors.
617-
template
618-
uint32_t analyzeFetchNce<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
619-
STEP_CODE_DATA_STRUCT & io_sc );
620-
621-
//------------------------------------------------------------------------------
622-
623-
template<TARGETING::TYPE T, typename D>
624-
uint32_t analyzeFetchTce( ExtensibleChip * i_chip,
625-
STEP_CODE_DATA_STRUCT & io_sc )
626-
{
627-
#define PRDF_FUNC "[MemEcc::analyzeFetchTce] "
628-
629-
PRDF_ASSERT( nullptr != i_chip );
630-
PRDF_ASSERT( T == i_chip->getType() );
631-
632-
uint32_t o_rc = SUCCESS;
633-
634-
do
635-
{
636-
// Get the address of the failure.
637-
MemAddr addr;
638-
o_rc = getMemReadAddr<T>( i_chip, MemAddr::READ_NCE_ADDR, addr );
639-
if ( SUCCESS != o_rc )
549+
// Add the first symbol to the callout list and CE table.
550+
bool doTps = false;
551+
if ( sym1.isValid() )
640552
{
641-
PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x) failed",
642-
i_chip->getHuid() );
553+
o_rc = handleMemCe<T,D>( i_chip, addr, sym1, doTps, io_sc );
554+
if ( SUCCESS != o_rc )
555+
{
556+
PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x,0x%02x,%d) failed",
557+
i_chip->getHuid(), rank.getKey(), sym1.getSymbol() );
558+
break;
559+
}
560+
}
561+
else
562+
{
563+
// The first symbol should always be valid.
564+
PRDF_ERR( PRDF_FUNC "getMemReadSymbol(0x%08x) returned an invalid "
565+
"symbol", i_chip->getHuid() );
566+
o_rc = FAIL;
643567
break;
644568
}
645569

646-
// Complete analysis for first symbol.
647-
o_rc = __analyzeFetchNceTce<T,D>( i_chip, addr, io_sc );
648-
if ( SUCCESS != o_rc )
570+
// Add the second symbol to the callout list and CE table, if it exists.
571+
if ( sym2.isValid() )
649572
{
650-
PRDF_ERR( PRDF_FUNC "first __analyzeFetchNceTce(0x%08x) failed",
651-
i_chip->getHuid() );
652-
break;
573+
bool tmp;
574+
o_rc = handleMemCe<T,D>( i_chip, addr, sym2, tmp, io_sc );
575+
if ( SUCCESS != o_rc )
576+
{
577+
PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x,0x%02x,%d) failed",
578+
i_chip->getHuid(), rank.getKey(), sym2.getSymbol() );
579+
break;
580+
}
581+
if ( tmp ) doTps = true;
653582
}
654583

655-
// Complete analysis for second symbol.
656-
o_rc = __analyzeFetchNceTce<T,D>( i_chip, addr, io_sc, true );
657-
if ( SUCCESS != o_rc )
584+
// Initiate a TPS procedure, if needed.
585+
if ( doTps )
658586
{
659-
PRDF_ERR( PRDF_FUNC "second __analyzeFetchNceTce(0x%08x) failed",
660-
i_chip->getHuid() );
661-
break;
587+
#ifdef __HOSTBOOT_RUNTIME
588+
589+
// If a MNFG threshold has been reached (predictive callout), we
590+
// will still try to start TPS just in case MNFG disables the
591+
// termination policy.
592+
593+
o_rc = addTpsEvent<T,D>( i_chip, rank, io_sc );
594+
if ( SUCCESS != o_rc )
595+
{
596+
PRDF_ERR( PRDF_FUNC "addTpsEvent(0x%08x,0x%02x) failed",
597+
i_chip->getHuid(), rank.getKey() );
598+
}
599+
600+
#endif
662601
}
663602

664603
} while (0);
@@ -673,7 +612,7 @@ uint32_t analyzeFetchTce( ExtensibleChip * i_chip,
673612

674613
// To resolve template linker errors.
675614
template
676-
uint32_t analyzeFetchTce<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
615+
uint32_t analyzeFetchNceTce<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
677616
STEP_CODE_DATA_STRUCT & io_sc );
678617

679618
//------------------------------------------------------------------------------
@@ -929,7 +868,7 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
929868
if ( SUCCESS != o_rc )
930869
{
931870
PRDF_ERR( PRDF_FUNC "Read() failed on MSR: i_chip=0x%08x",
932-
i_chip->getHuid() );
871+
i_chip->getHuid() );
933872
break;
934873
}
935874

@@ -944,6 +883,13 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
944883

945884
// get symbol and DRAM
946885
MemSymbol symbol = MemSymbol::fromGalois( trgt, rank, galois );
886+
if ( !symbol.isValid() )
887+
{
888+
PRDF_ERR( PRDF_FUNC "Galois 0x%02x from MSR is invalid: 0x%08x,"
889+
"0x%02x", galois, i_chip->getHuid(), rank.getKey() );
890+
o_rc = FAIL;
891+
break;
892+
}
947893
uint8_t dram = symbol.getDram();
948894

949895
// Add the DIMM to the callout list

src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -117,24 +117,14 @@ uint32_t analyzeFetchMpe( ExtensibleChip * i_chip, const MemRank & i_rank,
117117
STEP_CODE_DATA_STRUCT & io_sc );
118118

119119
/**
120-
* @brief Analyzes a fetch NCE attention.
120+
* @brief Analyzes a fetch NCE/TCE attention.
121121
* @param i_chip MCA or MBA.
122122
* @param io_sc The step code data struct.
123123
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
124124
*/
125125
template<TARGETING::TYPE T, typename D>
126-
uint32_t analyzeFetchNce( ExtensibleChip * i_chip,
127-
STEP_CODE_DATA_STRUCT & io_sc );
128-
129-
/**
130-
* @brief Analyzes a fetch TCE attention.
131-
* @param i_chip MCA or MBA.
132-
* @param io_sc The step code data struct.
133-
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
134-
*/
135-
template<TARGETING::TYPE T, typename D>
136-
uint32_t analyzeFetchTce( ExtensibleChip * i_chip,
137-
STEP_CODE_DATA_STRUCT & io_sc );
126+
uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip,
127+
STEP_CODE_DATA_STRUCT & io_sc );
138128

139129
/**
140130
* @brief Analyzes a fetch UE attention.

src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.C

Lines changed: 18 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ MemSymbol::MemSymbol( TARGETING::TargetHandle_t i_trgt, const MemRank & i_rank,
6464
iv_trgt(i_trgt), iv_rank(i_rank), iv_symbol(i_symbol),
6565
iv_pins(i_pins), iv_isDramSpared(false), iv_isEccSpared(false)
6666
{
67-
PRDF_ASSERT( NULL != i_trgt );
67+
PRDF_ASSERT( nullptr != i_trgt );
6868
PRDF_ASSERT( TYPE_MBA == getTargetType(i_trgt) ||
6969
TYPE_MCA == getTargetType(i_trgt) );
70-
PRDF_ASSERT( i_symbol < SYMBOLS_PER_RANK );
70+
// Allowing an invalid symbol. Use isValid() to check validity.
7171
PRDF_ASSERT( i_pins <= CEN_SYMBOL::BOTH_SYMBOL_DQS );
7272
}
7373

@@ -87,8 +87,6 @@ MemSymbol MemSymbol::fromGalois( TargetHandle_t i_trgt, const MemRank & i_rank,
8787
}
8888
}
8989

90-
PRDF_ASSERT( symbol < SYMBOLS_PER_RANK );
91-
9290
// Get pins from mask.
9391
uint8_t pins = NO_SYMBOL_DQS;
9492
if ( TYPE_MBA == getTargetType(i_trgt) )
@@ -160,7 +158,7 @@ uint8_t MemSymbol::getDramPins() const
160158
template<>
161159
uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,
162160
const MemRank & i_rank,
163-
MemSymbol & o_symbol, bool i_isTce )
161+
MemSymbol & o_sym1, MemSymbol & o_sym2 )
164162
{
165163
#define PRDF_FUNC "[getMemReadSymbol<TYPE_MBA>] "
166164

@@ -170,6 +168,8 @@ uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,
170168

171169
uint32_t o_rc = SUCCESS;
172170

171+
o_sym1 = o_sym2 = MemSymbol(); // both initially invalid
172+
173173
do
174174
{
175175
// Get the NCE/TCE galois and mask from hardware.
@@ -190,22 +190,15 @@ uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,
190190
break;
191191
}
192192

193-
uint32_t bitPos = (mcaRelMcs * 32) + (i_isTce ? 16 : 0);
194-
195-
uint8_t galois = reg->GetBitFieldJustified( bitPos, 8 );
196-
uint8_t mask = reg->GetBitFieldJustified( bitPos + 8, 8 );
193+
uint32_t bitPos = mcaRelMcs * 32;
194+
uint8_t g1 = reg->GetBitFieldJustified( bitPos, 8 );
195+
uint8_t m1 = reg->GetBitFieldJustified( bitPos + 8, 8 );
196+
uint8_t g2 = reg->GetBitFieldJustified( bitPos + 16, 8 );
197+
uint8_t m2 = reg->GetBitFieldJustified( bitPos + 24, 8 );
197198

198199
// Get the NCE/TCE symbol.
199-
o_symbol = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, galois,
200-
mask );
201-
if ( !o_symbol.isValid() )
202-
{
203-
PRDF_ERR( PRDF_FUNC "fromGalois(0x%08x,m%ds%d,0x%02x,0x%02x) "
204-
"failed", i_chip->getHuid(), i_rank.getMaster(),
205-
i_rank.getSlave(), galois, mask );
206-
o_rc = FAIL;
207-
break;
208-
}
200+
o_sym1 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g1, m1 );
201+
o_sym2 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g2, m2 );
209202

210203
// TODO: RTC 157888 Check if the symbol is on a spare DRAM.
211204

@@ -221,17 +214,18 @@ uint32_t getMemReadSymbol<TYPE_MCA>( ExtensibleChip * i_chip,
221214
template<>
222215
uint32_t getMemReadSymbol<TYPE_MBA>( ExtensibleChip * i_chip,
223216
const MemRank & i_rank,
224-
MemSymbol & o_symbol, bool i_isTce )
217+
MemSymbol & o_sym1, MemSymbol & o_sym2 )
225218
{
226219
#define PRDF_FUNC "[getMemReadSymbol<TYPE_MBA>] "
227220

228221
// Check parameters
229222
PRDF_ASSERT( nullptr != i_chip );
230223
PRDF_ASSERT( TYPE_MBA == i_chip->getType() );
231-
PRDF_ASSERT( !i_isTce ); // TCEs do not exist on Centaur
232224

233225
uint32_t o_rc = SUCCESS;
234226

227+
o_sym1 = o_sym2 = MemSymbol(); // both initially invalid
228+
235229
do
236230
{
237231
// Get the NCE galois and mask from hardware.
@@ -249,20 +243,11 @@ uint32_t getMemReadSymbol<TYPE_MBA>( ExtensibleChip * i_chip,
249243
break;
250244
}
251245

252-
uint8_t galois = reg->GetBitFieldJustified( 40, 8 );
253-
uint8_t mask = reg->GetBitFieldJustified( 32, 8 );
246+
uint8_t g1 = reg->GetBitFieldJustified( 40, 8 );
247+
uint8_t m1 = reg->GetBitFieldJustified( 32, 8 );
254248

255249
// Get the NCE symbol.
256-
o_symbol = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, galois,
257-
mask );
258-
if ( !o_symbol.isValid() )
259-
{
260-
PRDF_ERR( PRDF_FUNC "fromGalois(0x%08x,m%ds%d,0x%02x,0x%02x) "
261-
"failed", i_chip->getHuid(), i_rank.getMaster(),
262-
i_rank.getSlave(), galois, mask );
263-
o_rc = FAIL;
264-
break;
265-
}
250+
o_sym1 = MemSymbol::fromGalois( i_chip->getTrgt(), i_rank, g1, m1 );
266251

267252
// TODO: RTC 157888 Check if the symbol is on a spare DRAM.
268253

src/usr/diag/prdf/common/plat/mem/prdfMemSymbol.H

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,14 +182,17 @@ class MemSymbol
182182
* @brief Reads the memory NCE/TCE vector trap register from hardware.
183183
* @param i_chip MCA or MBA.
184184
* @param i_rank The rank this symbol is on.
185-
* @param o_symbol The returned symbol.
186-
* @param i_isTce Only applies to MCA. True if the TCE symbol is wanted. False
187-
* if the NCE symbol is wanted (default).
185+
* @param o_sym1 The first symbol. Should always be valid for both NCE/TCE.
186+
* @param o_sym2 The second symbol. Only valid for TCEs.
187+
* @note For MCAs, both NCEs and TCEs report to the same error vector and only
188+
* the latest NCE/TCE is recorded. Therefore, it is possible that PRD
189+
* handles a TCE attention, but only one symbol is found because a NCE
190+
* was reported afterwards, wiping out the error vector for the TCE.
188191
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
189192
*/
190193
template<TARGETING::TYPE T>
191194
uint32_t getMemReadSymbol( ExtensibleChip * i_chip, const MemRank & i_rank,
192-
MemSymbol & o_symbol, bool i_isTce = false );
195+
MemSymbol & o_sym1, MemSymbol & o_sym2 );
193196

194197
} // end namespace PRDF
195198

0 commit comments

Comments
 (0)