Skip to content

Commit a12b4ce

Browse files
committed
PRD: generic function for IUE attention handling
Change-Id: I0ed418f3934aaceee0e3949ad91af45879f9004d RTC: 173944 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/40423 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com> Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com> Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com> Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com> Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/40228 Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
1 parent 513e460 commit a12b4ce

File tree

4 files changed

+127
-102
lines changed

4 files changed

+127
-102
lines changed

src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C

Lines changed: 89 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -153,19 +153,49 @@ uint32_t handleMemUe<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr,
153153

154154
#ifdef __HOSTBOOT_MODULE
155155

156-
uint32_t maskMemPort( ExtensibleChip * i_chip )
156+
template<>
157+
uint32_t maskMemPort<TYPE_MCA>( ExtensibleChip * i_chip )
157158
{
158-
#define PRDF_FUNC "[MemEcc::maskMemPort] "
159+
#define PRDF_FUNC "[MemEcc::maskMemPort<TYPE_MCA>] "
159160

161+
PRDF_ASSERT( nullptr != i_chip );
160162
PRDF_ASSERT( TYPE_MCA == i_chip->getType() );
161163

162-
SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR");
163-
SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR");
164-
SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR");
164+
uint32_t o_rc = SUCCESS;
165+
166+
do
167+
{
168+
// Mask all FIRs on the port.
169+
SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR");
170+
SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR");
171+
SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR");
172+
173+
c->setAllBits(); d->setAllBits(); e->setAllBits();
174+
175+
o_rc = c->Write() | d->Write() | e->Write();
176+
if ( SUCCESS != o_rc )
177+
{
178+
PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", i_chip->getHuid() );
179+
break;
180+
}
181+
182+
#ifdef __HOSTBOOT_RUNTIME
165183

166-
c->setAllBits(); d->setAllBits(); e->setAllBits();
184+
/* TODO RTC 136129
185+
// Dynamically deallocate the port.
186+
o_rc = MemDealloc::port<TYPE_MCA>( i_chip );
187+
if ( SUCCESS != o_rc )
188+
{
189+
PRDF_ERR( PRDF_FUNC "MemDealloc::port<TYPE_MCA>(0x%08x) failed",
190+
i_chip->getHuid() );
191+
}
192+
*/
193+
194+
#endif
167195

168-
return ( c->Write() | d->Write() | e->Write() );
196+
} while (0);
197+
198+
return o_rc;
169199

170200
#undef PRDF_FUNC
171201
}
@@ -176,10 +206,13 @@ uint32_t maskMemPort( ExtensibleChip * i_chip )
176206

177207
#ifdef __HOSTBOOT_RUNTIME
178208

179-
uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc)
209+
template<>
210+
uint32_t iuePortFail<TYPE_MCA>( ExtensibleChip * i_chip,
211+
STEP_CODE_DATA_STRUCT & io_sc )
180212
{
181-
#define PRDF_FUNC "[MemEcc::iuePortFail] "
213+
#define PRDF_FUNC "[MemEcc::iuePortFail<TYPE_MCA>] "
182214

215+
PRDF_ASSERT( nullptr != i_chip );
183216
PRDF_ASSERT( TYPE_MCA == i_chip->getType() );
184217

185218
uint32_t o_rc = SUCCESS;
@@ -714,89 +747,68 @@ uint32_t analyzeFetchUe<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
714747

715748
//------------------------------------------------------------------------------
716749

717-
#ifdef __HOSTBOOT_MODULE
718-
719750
template<TARGETING::TYPE T, typename D>
720-
uint32_t __analyzeIue( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc,
721-
MemAddr i_addr )
751+
uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank,
752+
STEP_CODE_DATA_STRUCT & io_sc )
722753
{
723-
#define PRDF_FUNC "[MemEcc::__analyzeIue] "
754+
#define PRDF_FUNC "[MemEcc::handleMemIue] "
724755

756+
PRDF_ASSERT( nullptr != i_chip );
725757
PRDF_ASSERT( T == i_chip->getType() );
758+
726759
uint32_t o_rc = SUCCESS;
727760

728-
do
729-
{
730-
// get data bundle from chip
731-
D db = static_cast<D>( i_chip->getDataBundle() );
761+
// Add the DIMM to the callout list.
762+
MemoryMru mm { i_chip->getTrgt(), i_rank, MemoryMruData::CALLOUT_RANK };
763+
io_sc.service_data->SetCallout( mm );
732764

733-
// get the rank
734-
MemRank rank = i_addr.getRank();
765+
#ifdef __HOSTBOOT_MODULE
735766

736-
TargetHandle_t trgt = i_chip->getTrgt();
767+
do
768+
{
769+
// Nothing else to do if handling a system checkstop.
770+
if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() ) break;
737771

738-
// Add the DIMM to the callout list
739-
MemoryMru memmru(trgt, rank, MemoryMruData::CALLOUT_RANK);
740-
io_sc.service_data->SetCallout( memmru );
772+
// Get the data bundle from chip.
773+
D db = static_cast<D>( i_chip->getDataBundle() );
741774

742-
uint8_t ds = rank.getDimmSlct();
775+
// Get the DIMM select.
776+
uint8_t ds = i_rank.getDimmSlct();
743777

744-
// Initialize threshold if it doesn't exist yet
778+
// Initialize threshold if it doesn't exist yet.
745779
if ( 0 == db->iv_iueTh.count(ds) )
746780
{
747781
db->iv_iueTh[ds] = TimeBasedThreshold( getIueTh() );
748782
}
749783

750-
// increment the threshold - check if at threshold
784+
// Increment the count and check if at threshold.
751785
if ( db->iv_iueTh[ds].inc(io_sc) )
752786
{
753-
// Make the error log predictive
787+
// Make the error log predictive.
754788
io_sc.service_data->setServiceCall();
755789

756-
#ifdef __HOSTBOOT_RUNTIME
757-
758-
/* TODO RTC 136129
759-
// Dynamically deallocate the rank.
760-
uint32_t dealloc_rc = MemDealloc::rank<T>( i_chip, rank );
761-
if ( SUCCESS != dealloc_rc )
762-
{
763-
PRDF_ERR( PRDF_FUNC "MemDealloc::rank() failed: i_chip=0x%08x "
764-
"rank=m%ds%d", i_chip->getHuid(), rank.getMaster(),
765-
rank.getSlave() );
766-
o_rc = dealloc_rc; break;
767-
}
768-
*/
769-
770-
#endif // __HOSTBOOT_RUNTIME
790+
// The port fail will be triggered in the PostAnalysis plugin after
791+
// the error log has been committed.
771792

772-
// mask off the entire port to avoid collateral
773-
o_rc = maskMemPort( i_chip );
793+
// Mask off the entire port to avoid collateral.
794+
o_rc = MemEcc::maskMemPort<T>( i_chip );
774795
if ( SUCCESS != o_rc )
775796
{
776-
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x",
777-
i_chip->getHuid() );
797+
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort<T>(0x%08x) failed",
798+
i_chip->getHuid() );
778799
break;
779800
}
780-
781-
// Port fail will be triggered in PostAnalysis after the error log
782-
// has been committed.
783801
}
784802

785-
}while(0);
803+
} while (0);
804+
805+
#endif // __HOSTBOOT_MODULE
786806

787807
return o_rc;
788808

789809
#undef PRDF_FUNC
790810
}
791811

792-
// To resolve template linker errors.
793-
template
794-
uint32_t __analyzeIue<TYPE_MCA, McaDataBundle*>(ExtensibleChip * i_chip,
795-
STEP_CODE_DATA_STRUCT & io_sc,
796-
MemAddr i_addr );
797-
798-
#endif // __HOSTBOOT_MODULE
799-
800812
//------------------------------------------------------------------------------
801813

802814
template<TARGETING::TYPE T, typename D>
@@ -805,44 +817,39 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip,
805817
{
806818
#define PRDF_FUNC "[MemEcc::analyzeMainlineIue] "
807819

820+
PRDF_ASSERT( nullptr != i_chip );
808821
PRDF_ASSERT( T == i_chip->getType() );
809-
uint32_t o_rc = SUCCESS;
810822

811-
#ifdef __HOSTBOOT_MODULE
823+
uint32_t o_rc = SUCCESS;
812824

813825
do
814826
{
815-
816-
// get the address of the failure
817-
MemAddr addr;
818-
819827
// Use the address in MBRCER. This address also traps IRCDs, but it is
820828
// not likely that we will have two independent failure modes at the
821829
// same time. So we just assume the address is correct.
830+
MemAddr addr;
822831
o_rc = getMemReadAddr<T>( i_chip, MemAddr::READ_RCE_ADDR, addr );
823832
if ( SUCCESS != o_rc )
824833
{
825834
PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x, READ_RCE_ADDR) failed",
826-
i_chip->getHuid() );
835+
i_chip->getHuid() );
827836
break;
828837
}
838+
MemRank rank = addr.getRank();
829839

830-
o_rc = __analyzeIue<T,D>( i_chip, io_sc, addr );
840+
o_rc = handleMemIue<T,D>( i_chip, rank, io_sc );
831841
if ( SUCCESS != o_rc )
832842
{
833-
PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: 0x%08x",
834-
i_chip->getHuid() );
843+
PRDF_ERR( PRDF_FUNC "handleMemIue<T,D>(0x%08x,m%ds%d) failed",
844+
i_chip->getHuid(), rank.getMaster(), rank.getSlave() );
835845
break;
836846
}
837847

838-
}while(0);
839-
840-
#endif
848+
} while (0);
841849

842850
return o_rc;
843851

844852
#undef PRDF_FUNC
845-
846853
}
847854

848855
// To resolve template linker errors.
@@ -858,40 +865,37 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip,
858865
{
859866
#define PRDF_FUNC "[MemEcc::analyzeMaintIue] "
860867

868+
PRDF_ASSERT( nullptr != i_chip );
861869
PRDF_ASSERT( T == i_chip->getType() );
862-
uint32_t o_rc = SUCCESS;
863870

864-
#ifdef __HOSTBOOT_MODULE
871+
uint32_t o_rc = SUCCESS;
865872

866873
do
867874
{
875+
// Use the current address in the MCBMCAT.
868876
MemAddr addr;
869-
870-
// Use the current address in the MCBMCAT
871877
o_rc = getMemMaintAddr<T>( i_chip, addr );
872878
if ( SUCCESS != o_rc )
873879
{
874880
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
875-
i_chip->getHuid() );
881+
i_chip->getHuid() );
876882
break;
877883
}
884+
MemRank rank = addr.getRank();
878885

879-
o_rc = __analyzeIue<T,D>( i_chip, io_sc, addr );
886+
o_rc = handleMemIue<T,D>( i_chip, rank, io_sc );
880887
if ( SUCCESS != o_rc )
881888
{
882-
PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: "
883-
"0x%08x", i_chip->getHuid() );
889+
PRDF_ERR( PRDF_FUNC "handleMemIue<T,D>(0x%08x,m%ds%d) failed",
890+
i_chip->getHuid(), rank.getMaster(), rank.getSlave() );
884891
break;
885892
}
886893

887-
}while(0);
888-
889-
#endif
894+
} while (0);
890895

891896
return o_rc;
892897

893898
#undef PRDF_FUNC
894-
895899
}
896900

897901
// To resolve template linker errors.

src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,27 @@ template<TARGETING::TYPE T>
8484
uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr,
8585
UE_TABLE::Type i_type, STEP_CODE_DATA_STRUCT & io_sc );
8686

87+
/**
88+
* @brief Does mainline and maintenance IUE handling.
89+
*
90+
* Adds the memory IUE to the callout list. At threshold, will make the error
91+
* log predictive. When threshold is reached at runtime there is a good chance
92+
* these IUEs are going to lead to a data integrity issue. Therefore, the port
93+
* will be forced to fail, the entire port will be masked off, and dynamic
94+
* memory deallocation will be applied. Note that this function will not issue
95+
* the port failure because it is possible that it may crash the host. Instead,
96+
* the port failure is issued in the PostAnalysis plugin after the error log has
97+
* been committed.
98+
*
99+
* @param i_chip MCA chip.
100+
* @param i_rank Rank containing the IUE.
101+
* @param io_sc The step code data struct.
102+
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
103+
*/
104+
template<TARGETING::TYPE T, typename D>
105+
uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank,
106+
STEP_CODE_DATA_STRUCT & io_sc );
107+
87108
/**
88109
* @brief Analyzes a fetch MPE attention.
89110
* @param i_chip MCA or MBA.
@@ -158,22 +179,25 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc );
158179
#ifdef __HOSTBOOT_RUNTIME
159180

160181
/**
161-
* @brief Will trigger a port fail if the number of IUEs is over threshold
162-
* @param i_chip MCA chip
163-
* @param io_sc The step code data struct.
182+
* @brief Will trigger a port fail if the number of IUEs is over threshold.
183+
* @param i_chip MCA chip
184+
* @param io_sc The step code data struct.
164185
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise
165186
*/
166-
uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc);
187+
template<TARGETING::TYPE T>
188+
uint32_t iuePortFail( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc );
167189

168190
#endif // __HOSTBOOT_RUNTIME
169191

170192
#ifdef __HOSTBOOT_MODULE
171193

172194
/**
173-
* @brief Will mask off the entire mem port
174-
* @param i_chip MCA chip
195+
* @brief Will mask off an entire memory port. At runtime will issue dynamic
196+
* memory deallocation of the port.
197+
* @param i_chip MCA chip
175198
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise
176199
*/
200+
template<TARGETING::TYPE T>
177201
uint32_t maskMemPort( ExtensibleChip * i_chip );
178202

179203
template<TARGETING::TYPE T, typename D>

src/usr/diag/prdf/plat/mem/prdfP9Mca.C

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,14 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
6969

7070
#ifdef __HOSTBOOT_RUNTIME
7171

72-
7372
// If the IUE threshold in our data bundle has been reached, we trigger
7473
// a port fail. Once we trigger the port fail, the system may crash
7574
// right away. Since PRD is running in the hypervisor, it is possible we
7675
// may not get the error log. To better our chances, we trigger the port
7776
// fail here after the error log has been committed.
78-
if ( SUCCESS != MemEcc::iuePortFail(i_chip, io_sc) )
77+
if ( SUCCESS != MemEcc::iuePortFail<TYPE_MCA>(i_chip, io_sc) )
7978
{
80-
PRDF_ERR( PRDF_FUNC "iuePortFail failed: i_chip=0x%08x",
81-
i_chip->getHuid() );
79+
PRDF_ERR( PRDF_FUNC "iuePortFail(0x%08x) failed", i_chip->getHuid() );
8280
}
8381

8482
#endif // __HOSTBOOT_RUNTIME
@@ -197,14 +195,13 @@ int32_t MemPortFailure( ExtensibleChip * i_chip,
197195

198196
if ( CHECK_STOP != io_sc.service_data->getPrimaryAttnType() )
199197
{
200-
// The port is dead mask off the entire port.
201-
uint32_t l_rc = MemEcc::maskMemPort( i_chip );
198+
// The port is dead. Mask off the entire port.
199+
uint32_t l_rc = MemEcc::maskMemPort<TYPE_MCA>( i_chip );
202200
if ( SUCCESS != l_rc )
203201
{
204-
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x",
202+
PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort<TYPE_MCA>(0x%08x) failed",
205203
i_chip->getHuid() );
206204
}
207-
208205
}
209206

210207
return SUCCESS; // nothing to return to rule code

0 commit comments

Comments
 (0)