Skip to content

Commit 2536137

Browse files
committed
1. implement decoupling log full (UNIV_LIBOBJ_WAL_ELR, 2. skip log_write_up_to() in logcheckpoint(), and flush log buffer when its size reach 50% capacity
1 parent fa34845 commit 2536137

File tree

5 files changed

+266
-12
lines changed

5 files changed

+266
-12
lines changed

build_mysql.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@ IS_DEBUG=0
1515
### PB-NVM + PL-NVM
1616
#BUILD_NAME="-DUNIV_PMEMOBJ_PART_PL_STAT -DUNIV_PMEMOBJ_PAGE_LOG -DUNIV_PMEMOBJ_PART_PL -DUNIV_PMEMOBJ_PL -DUNIV_OPENMP -DUNIV_PMEMOBJ_BLOOM -DUNIV_PMEMOBJ_BUF -DUNIV_PMEMOBJ_BUF_PARTITION -DUNIV_PMEMOBJ_BUF_FLUSHER -DUNIV_PMEMOBJ_BUF_RECOVERY -DUNIV_PMEMOBJ_BUF_STAT -DUNIV_TRACE_FLUSH_TIME"
1717

18-
BUILD_NAME="-DUNIV_TRACE_FLUSH_TIME"
18+
#BUILD_NAME="-DUNIV_TRACE_FLUSH_TIME"
1919
#BUILD_NAME="-DUNIV_TRACE_FLUSH_TIME -DUNIV_SKIPLOG"
2020
#BUILD_NAME="-DUNIV_PMEMOBJ_WAL -DUNIV_TRACE_FLUSH_TIME"
21+
BUILD_NAME="-DUNIV_PMEMOBJ_WAL -DUNIV_PMEMOBJ_WAL_ELR -DUNIV_TRACE_FLUSH_TIME"
2122
#BUILD_NAME="-DUNIV_PMEMOBJ_PPL_STAT -DUNIV_PMEMOBJ_PAGE_LOG -DUNIV_PMEMOBJ_PART_PL -DUNIV_PMEMOBJ_PL -DUNIV_TRACE_FLUSH_TIME"
2223

2324

storage/innobase/log/log0log.cc

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -421,8 +421,13 @@ log_reserve_and_open(
421421

422422
len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
423423
+ (5 * len) / 4;
424-
424+
#if defined (UNIV_PMEMOBJ_WAL)
425+
//in PMEMOBJ_WAL, because we do not flush log at commit, we shouldn't keep the log buffer
426+
//too large, we flush when its size is x% (25, 50, 75) capacity
427+
if (log_sys->buf_free + len_upper_limit > log_sys->buf_size / 2) {
428+
#else //original
425429
if (log_sys->buf_free + len_upper_limit > log_sys->buf_size) {
430+
#endif
426431
log_mutex_exit();
427432

428433
DEBUG_SYNC_C("log_buf_size_exceeded");
@@ -438,7 +443,6 @@ log_reserve_and_open(
438443
log_mutex_enter();
439444
goto loop;
440445
}
441-
442446
return(log_sys->lsn);
443447
}
444448

@@ -478,11 +482,15 @@ log_write_low(
478482

479483
#if defined (UNIV_PMEMOBJ_LOG) || defined (UNIV_PMEMOBJ_WAL)
480484
//copy the trasaction's log records to persistent memory
485+
//The mfence per copy version
481486
TX_BEGIN(gb_pmw->pop) {
482487
TX_MEMCPY(log->buf + log->buf_free, str, len);
483488
} TX_ONABORT {
484489
} TX_END
485-
//pmemobj_memcpy_persist(gb_pmw->pop, log->buf + log->buf_free, str, len);
490+
491+
//The unsafe version
492+
//ut_memcpy(log->buf + log->buf_free, str, len);
493+
486494
//set the flag here
487495
gb_pmw->plogbuf->need_recv = true;
488496
#else //original
@@ -2007,9 +2015,11 @@ log_checkpoint(
20072015
}
20082016

20092017
log_mutex_exit();
2010-
2018+
#if defined (UNIV_PMEMOBJ_WAL)
2019+
//do nothing
2020+
#else //original
20112021
log_write_up_to(flush_lsn, true);
2012-
2022+
#endif
20132023
DBUG_EXECUTE_IF(
20142024
"using_wa_checkpoint_middle",
20152025
if (write_always) {

storage/innobase/mtr/mtr0mtr.cc

Lines changed: 230 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,10 @@ Created 11/26/1995 Heikki Tuuri
4141

4242
#if defined (UNIV_TRACE_FLUSH_TIME)
4343
extern volatile int64 gb_write_log_time;
44+
extern volatile int64 gb_n_write_log;
4445
#endif
4546

46-
#if defined (UNIV_PMEMOBJ_PART_PL)
47+
#if defined (UNIV_PMEMOBJ_PART_PL) || defined (UNIV_PMEMOBJ_WAL_ELR)
4748
#include "my_pmemobj.h"
4849
extern PMEM_WRAPPER* gb_pmw;
4950
#endif /* UNIV_PMEMOBJ_PART_PL */
@@ -1109,6 +1110,7 @@ mtr_t::Command::execute()
11091110
ulint exec_time = end_time - start_time;
11101111
//my_atomic_add64(&gb_write_log_time, exec_time);
11111112
__sync_fetch_and_add(&gb_write_log_time, exec_time);
1113+
__sync_fetch_and_add(&gb_n_write_log, 1);
11121114
#endif
11131115
}
11141116
#else //old method
@@ -1152,6 +1154,232 @@ mtr_t::Command::execute()
11521154
release_resources();
11531155
}
11541156
#endif //UNIV_PMEMOBJ_PART_PL
1157+
#elif defined (UNIV_PMEMOBJ_WAL) && defined (UNIV_PMEMOBJ_WAL_ELR)
1158+
//Early lock release
1159+
void
1160+
mtr_t::Command::execute()
1161+
{
1162+
ulint len;
1163+
ulint str_len;
1164+
ulint len_tem;
1165+
ulint data_len;
1166+
ulint n_recs;
1167+
1168+
byte* start_cpy;
1169+
ulint len_cpy;
1170+
1171+
log_t* log = log_sys;
1172+
byte* log_block;
1173+
1174+
fil_space_t* space;
1175+
1176+
const mtr_buf_t::block_t* front = m_impl->m_log.front();
1177+
byte* start_log_ptr = (byte*) front->begin();
1178+
byte* str = start_log_ptr;
1179+
1180+
ut_ad(m_impl->m_log_mode != MTR_LOG_NONE);
1181+
1182+
////Simulate prepare_write()
1183+
switch (m_impl->m_log_mode) {
1184+
case MTR_LOG_SHORT_INSERTS:
1185+
ut_ad(0);
1186+
/* fall through (write no redo log) */
1187+
case MTR_LOG_NO_REDO:
1188+
case MTR_LOG_NONE:
1189+
ut_ad(m_impl->m_log.size() == 0);
1190+
log_mutex_enter();
1191+
m_end_lsn = m_start_lsn = log_sys->lsn;
1192+
//return(0);
1193+
len = 0;
1194+
break;
1195+
case MTR_LOG_ALL:
1196+
break;
1197+
}
1198+
1199+
if (m_impl->m_log_mode == MTR_LOG_ALL) {
1200+
1201+
len = m_impl->m_log.size();
1202+
n_recs = m_impl->m_n_log_recs;
1203+
ut_ad(len > 0);
1204+
ut_ad(n_recs > 0);
1205+
1206+
if (len > log_sys->buf_size / 2) {
1207+
log_buffer_extend((len + 1) * 2);
1208+
}
1209+
1210+
ut_ad(m_impl->m_n_log_recs == n_recs);
1211+
1212+
space = m_impl->m_user_space;
1213+
1214+
if (space != NULL && is_system_or_undo_tablespace(space->id)) {
1215+
/* Omit MLOG_FILE_NAME for predefined tablespaces. */
1216+
space = NULL;
1217+
}
1218+
1219+
log_mutex_enter();
1220+
1221+
if (fil_names_write_if_was_clean(space, m_impl->m_mtr)) {
1222+
/* This mini-transaction was the first one to modify
1223+
this tablespace since the latest checkpoint, so
1224+
some MLOG_FILE_NAME records were appended to m_log. */
1225+
ut_ad(m_impl->m_n_log_recs > n_recs);
1226+
mlog_catenate_ulint(
1227+
&m_impl->m_log, MLOG_MULTI_REC_END, MLOG_1BYTE);
1228+
len = m_impl->m_log.size();
1229+
} else {
1230+
/* This was not the first time of dirtying a
1231+
tablespace since the latest checkpoint. */
1232+
1233+
ut_ad(n_recs == m_impl->m_n_log_recs);
1234+
1235+
if (n_recs <= 1) {
1236+
ut_ad(n_recs == 1);
1237+
1238+
/* Flag the single log record as the
1239+
only record in this mini-transaction. */
1240+
*m_impl->m_log.front()->begin()
1241+
|= MLOG_SINGLE_REC_FLAG;
1242+
} else {
1243+
/* Because this mini-transaction comprises
1244+
multiple log records, append MLOG_MULTI_REC_END
1245+
at the end. */
1246+
1247+
mlog_catenate_ulint(
1248+
&m_impl->m_log, MLOG_MULTI_REC_END,
1249+
MLOG_1BYTE);
1250+
len++;
1251+
}
1252+
}
1253+
1254+
/* check and attempt a checkpoint if exceeding capacity */
1255+
log_margin_checkpoint_age(len);
1256+
} // end if (m_impl->m_log_mode == MTR_LOG_ALL)
1257+
1258+
////End simulate prepare_write()
1259+
1260+
if (len > 0){
1261+
//simulate finish_write()
1262+
ut_ad(m_impl->m_log_mode == MTR_LOG_ALL);
1263+
ut_ad(log_mutex_own());
1264+
ut_ad(m_impl->m_log.size() == len);
1265+
ut_ad(len > 0);
1266+
1267+
if (m_impl->m_log.is_small()) {
1268+
const mtr_buf_t::block_t* front = m_impl->m_log.front();
1269+
ut_ad(len <= front->used());
1270+
1271+
m_end_lsn = log_reserve_and_write_fast(
1272+
front->begin(), len, &m_start_lsn);
1273+
1274+
if (m_end_lsn > 0) {
1275+
goto skip_write;
1276+
}
1277+
}
1278+
/* Open the database log for log_write_low
1279+
* This function also check for flush log buffer if lsn + len > log buffer capacity
1280+
*
1281+
* */
1282+
m_start_lsn = log_reserve_and_open(len);
1283+
1284+
//simulate log_write_low(), we do the same as log_write_low() except skip memcpy()
1285+
1286+
//assign as parameter pass
1287+
str_len = len;
1288+
str = start_log_ptr;
1289+
1290+
part_loop:
1291+
data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
1292+
if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
1293+
1294+
/* The string fits within the current log block */
1295+
1296+
len_tem = str_len;
1297+
} else {
1298+
data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
1299+
1300+
len_tem = OS_FILE_LOG_BLOCK_SIZE
1301+
- (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
1302+
- LOG_BLOCK_TRL_SIZE;
1303+
}
1304+
1305+
//save the pointer and the len to be copied later
1306+
start_cpy = log->buf + log->buf_free;
1307+
len_cpy = len_tem;
1308+
1309+
str_len -= len_tem;
1310+
str = str + len_tem;
1311+
1312+
log_block = static_cast<byte*>(
1313+
ut_align_down(
1314+
log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
1315+
1316+
log_block_set_data_len(log_block, data_len);
1317+
1318+
if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
1319+
/* This block became full */
1320+
log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
1321+
log_block_set_checkpoint_no(log_block,
1322+
log_sys->next_checkpoint_no);
1323+
len_tem += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
1324+
1325+
log->lsn += len_tem;
1326+
1327+
/* Initialize the next block header */
1328+
log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
1329+
} else {
1330+
log->lsn += len_tem;
1331+
}
1332+
1333+
log->buf_free += len_tem;
1334+
1335+
ut_ad(log->buf_free <= log->buf_size);
1336+
1337+
if (str_len > 0) {
1338+
goto part_loop;
1339+
}
1340+
#if defined(UNIV_PMEMOBJ_LOG) || defined(UNIV_PMEMOBJ_WAL)
1341+
// update the lsn and buf_free
1342+
gb_pmw->plogbuf->lsn = log->lsn;
1343+
gb_pmw->plogbuf->buf_free = log->buf_free;
1344+
#endif /*UNIV_PMEMOBJ_LOG */
1345+
srv_stats.log_write_requests.inc();
1346+
1347+
//end simulate log_write_low() ///
1348+
1349+
m_end_lsn = log_close();
1350+
1351+
} //end if len > 0
1352+
skip_write:
1353+
////// end simulate finish_write() ////////////
1354+
1355+
if (m_impl->m_made_dirty) {
1356+
log_flush_order_mutex_enter();
1357+
}
1358+
1359+
/* It is now safe to release the log mutex because the
1360+
flush_order mutex will ensure that we are the first one
1361+
to insert into the flush list. */
1362+
log_mutex_exit();
1363+
1364+
//now we do the memcpy
1365+
TX_BEGIN(gb_pmw->pop) {
1366+
TX_MEMCPY(start_cpy, start_log_ptr, len_cpy);
1367+
} TX_ONABORT {
1368+
} TX_END
1369+
gb_pmw->plogbuf->need_recv = true;
1370+
1371+
m_impl->m_mtr->m_commit_lsn = m_end_lsn;
1372+
1373+
release_blocks();
1374+
1375+
if (m_impl->m_made_dirty) {
1376+
log_flush_order_mutex_exit();
1377+
}
1378+
1379+
release_latches();
1380+
1381+
release_resources();
1382+
}
11551383
#else //original
11561384
void
11571385
mtr_t::Command::execute()
@@ -1191,6 +1419,7 @@ mtr_t::Command::execute()
11911419
ulint exec_time = end_time - start_time;
11921420
//my_atomic_add64(&gb_write_log_time, exec_time);
11931421
__sync_fetch_and_add(&gb_write_log_time, exec_time);
1422+
__sync_fetch_and_add(&gb_n_write_log, 1);
11941423
#endif
11951424
}
11961425
#endif // UNIV_PMEMOBJ_PL

storage/innobase/pmem/pmem0log.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2844,12 +2844,12 @@ __print_lock_overhead(FILE* f,
28442844
avg_log_write_lock_wait_time = avg_log_write_lock_wait_time / n;
28452845
avg_log_flush_lock_wait_time = avg_log_flush_lock_wait_time / n;
28462846

2847-
fprintf(f, "max_log_write_wait(us) max_log_flush_wait(us) avg_log_write_wait avg_log_flush_wait \t %zu \t %f \t %zu \t %f\n",
2847+
fprintf(f, "max_log_write_wait(us) avg_log_write_wait(us) max_log_flush_wait(us) avg_log_flush_wait \t %zu \t %f \t %zu \t %f\n",
28482848
max_log_write_lock_wait_time,
28492849
avg_log_write_lock_wait_time,
28502850
max_log_flush_lock_wait_time,
28512851
avg_log_flush_lock_wait_time);
2852-
printf("max_log_write_wait(us) max_log_flush_wait(us) avg_log_write_wait avg_log_flush_wait \t %zu \t %f \t %zu \t %f\n",
2852+
printf("max_log_write_wait(us) avg_log_write_wait(us) max_log_flush_wait(us) avg_log_flush_wait \t %zu \t %f \t %zu \t %f\n",
28532853
max_log_write_lock_wait_time,
28542854
avg_log_write_lock_wait_time,
28552855
max_log_flush_lock_wait_time,

storage/innobase/srv/srv0start.cc

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,9 @@ static pfs_os_file_t files[1000];
176176
#if defined (UNIV_TRACE_FLUSH_TIME)
177177
ulint gb_flush_time = 0;
178178
volatile int64 gb_write_log_time = 0;
179+
volatile int64 gb_n_write_log = 0;
179180
volatile int64 gb_flush_log_time = 0;
181+
volatile int64 gb_n_flush_log = 0;
180182
FILE* gb_trace_file = fopen("trace_flush.txt","a");
181183
#endif
182184

@@ -2927,7 +2929,11 @@ innobase_shutdown_for_mysql(void)
29272929
#endif
29282930

29292931
#if defined (UNIV_TRACE_FLUSH_TIME)
2930-
printf("===>TRACE TIME: flush_time (ms) = %zu write_log (ms)%zu flush_log (ms) %zu\n", gb_flush_time, gb_write_log_time/1000, gb_flush_log_time/1000);
2932+
printf("===>TRACE TIME: flush_time (ms) = %zu write_log (ms)%zu avg_write_log (ms/write) %f flush_log (ms) %zu\n",
2933+
gb_flush_time,
2934+
gb_write_log_time/1000,
2935+
(gb_write_log_time * 1.0)/1000/gb_n_write_log,
2936+
gb_flush_log_time/1000);
29312937
//Add the method name in the trace out
29322938
#if defined (UNIV_PMEMOBJ_DBW)
29332939
fprintf(gb_trace_file, "\n===>TRACE_FLUSH_TIME_DBW: total flush time (ms) = %zu", gb_flush_time);
@@ -2942,9 +2948,17 @@ innobase_shutdown_for_mysql(void)
29422948
#elif defined (UNIV_PMEMOBJ_WAL)
29432949
fprintf(gb_trace_file, "\n===>TRACE TIME WAL_NVM: flush_time (ms) = %zu write_log (ms)%zu flush_log (ms) %zu", gb_flush_time, gb_write_log_time/1000, gb_flush_log_time/1000);
29442950
#elif defined (UNIV_PMEMOBJ_PART_PL)
2945-
fprintf(gb_trace_file, "\n===>TRACE TIME PPL: flush_time (ms) = %zu write_log (ms)%zu flush_log (ms) %zu", gb_flush_time, gb_write_log_time/1000, gb_flush_log_time/1000);
2951+
fprintf(gb_trace_file, "===>TRACE TIME_PPL: flush_time (ms) = %zu write_log (ms)%zu avg_write_log (ms/write) %f flush_log (ms) %zu\n",
2952+
gb_flush_time,
2953+
gb_write_log_time/1000,
2954+
(gb_write_log_time * 1.0)/1000/gb_n_write_log,
2955+
gb_flush_log_time/1000);
29462956
#else
2947-
fprintf(gb_trace_file, "\n===>TRACE TIME ORI: flush_time (ms) = %zu write_log (ms)%zu flush_log (ms) %zu", gb_flush_time, gb_write_log_time/1000, gb_flush_log_time/1000);
2957+
fprintf(gb_trace_file, "===>TRACE TIME_ORI: flush_time (ms) = %zu write_log (ms)%zu avg_write_log (ms/write) %f flush_log (ms) %zu\n",
2958+
gb_flush_time,
2959+
gb_write_log_time/1000,
2960+
(gb_write_log_time * 1.0)/1000/gb_n_write_log,
2961+
gb_flush_log_time/1000);
29482962
#endif //defined (UNIV_PMEMOBJ_DBW)
29492963
fclose(gb_trace_file);
29502964
#endif //defined (UNIV_TRACE_FLUSH_TIME)

0 commit comments

Comments
 (0)