-
Notifications
You must be signed in to change notification settings - Fork 993
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5cdff76
commit 6ff3ae6
Showing
102 changed files
with
2,244 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
# Pile (weight 50.0) and the seven SlimPajama subsets (weights summing to ~50.0) are sampled with equal likelihood: | ||
"train-data-paths": [ | ||
"data/pile/train/pile_train", | ||
'data/slim_pajama/train_300B/ArXiv/ArXiv', | ||
'data/slim_pajama/train_300B/Book/Book', | ||
'data/slim_pajama/train_300B/C4/C4', | ||
'data/slim_pajama/train_300B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/train_300B/Github/Github', | ||
'data/slim_pajama/train_300B/StackExchange/StackExchange', | ||
'data/slim_pajama/train_300B/CommonCrawl/CommonCrawl',], | ||
"train-data-weights": [ | ||
50.0, | ||
2.2140923205, | ||
2.101565663, | ||
13.344249736, | ||
1.9986465625, | ||
2.612070528, | ||
1.6855393625, | ||
26.0438358255 | ||
], | ||
"train-dataset-name": 'pile+slim_pajama_300B_each', | ||
"train-iters": 264732, | ||
"lr-decay-iters": 264732, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
{ | ||
"train-data-paths": [ | ||
"data/pile/shard_0/shard_0_text_document", | ||
], | ||
"train-data-weights": [ | ||
1., | ||
], | ||
"train-dataset-name": 'pile_shard0', | ||
"train-iters": 1000, | ||
"lr-decay-iters": 1000, | ||
"is_replay_enabled": true, | ||
"replay_config": { | ||
"enabled": true, | ||
# The index-map filename prefixes from the original pretraining run must be specified here, because those files | ||
# record the number of iterations and the seen indices. This assumes the same (non-replay) seed as during pretraining. | ||
"replay_idx_paths_prefixes": [ | ||
"data/pile/shard_0/shard_0_text_document_train_0_indexmap_32160ns_2048sl_1234s", | ||
], | ||
"replay_data_weights":[ | ||
1.00, | ||
], | ||
"replay_idx_offsets": [ | ||
1, | ||
], | ||
# Fraction of samples coming from the replay buffer, between 0 and 1. | ||
"replay_fraction": 0.5, | ||
# replay_seed and replay_reshuffle_idx go hand in hand. Set replay_reshuffle_idx to false to see the replay data | ||
# in the same order as it was originally seen; if you reshuffle instead, replay_seed is the seed used | ||
# to reshuffle the seen data. | ||
"replay_seed": 1234, | ||
"replay_reshuffle_idx": false, | ||
}, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
{ | ||
"train-data-paths": [ | ||
"data/pile/train/pile_train", | ||
], | ||
"train-data-weights": [ | ||
1., | ||
], | ||
"train-dataset-name": 'pile_train', | ||
"train-iters": 132366, | ||
"lr-decay-iters": 132366, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/arxiv/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/book/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/c4/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/wikipedia/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/github/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/stackexchange/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2019-30/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2020-05/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2021-04/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2022-05/folder_train/tokenized_text_document", | ||
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2023-06/folder_train/tokenized_text_document", | ||
], | ||
"train-data-weights": [ | ||
2.5, | ||
4.5, | ||
15.0, | ||
4.5, | ||
4.5, | ||
2.0, | ||
13.4, | ||
13.4, | ||
13.4, | ||
13.4, | ||
13.4 | ||
], | ||
"train-dataset-name": 'rp', | ||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/tokenized_train_0-100B/ArXiv/ArXiv', | ||
'data/slim_pajama/tokenized_train_0-100B/Book/Book', | ||
'data/slim_pajama/tokenized_train_0-100B/C4/C4', | ||
'data/slim_pajama/tokenized_train_0-100B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/tokenized_train_0-100B/Github/Github', | ||
'data/slim_pajama/tokenized_train_0-100B/StackExchange/StackExchange', | ||
'data/slim_pajama/tokenized_train_0-100B/CommonCrawl/CommonCrawl', | ||
], | ||
"train-data-weights": [ | ||
3.4703977435152775, | ||
3.904381603212791, | ||
25.641950653802013, | ||
3.804228253591696, | ||
4.9994643949282045, | ||
3.1815838172641993, | ||
49.99799353368582, | ||
], | ||
"train-iters": 44229, | ||
"lr-decay-iters": 44229, | ||
"train-dataset-name": 'slim_pajama_100B_1', | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/tokenized_train_0-100B/ArXiv/ArXiv', | ||
'data/slim_pajama/tokenized_train_0-100B/Book/Book', | ||
'data/slim_pajama/tokenized_train_0-100B/C4/C4', | ||
'data/slim_pajama/tokenized_train_0-100B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/tokenized_train_0-100B/Github/Github', | ||
'data/slim_pajama/tokenized_train_0-100B/StackExchange/StackExchange', | ||
'data/slim_pajama/tokenized_train_0-100B/CommonCrawl/CommonCrawl', | ||
|
||
'data/pile_replay_shards/replay_10B_1/splits', | ||
], | ||
"train-data-weights": [ | ||
3.4703977435152775, | ||
3.904381603212791, | ||
25.641950653802013, | ||
3.804228253591696, | ||
4.9994643949282045, | ||
3.1815838172641993, | ||
49.99799353368582, | ||
|
||
5.0 | ||
], | ||
"train-iters": 44229, | ||
"lr-decay-iters": 44229, | ||
"train-dataset-name": 'slim_pajama_100B_1_replay5', | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/tokenized_train_100B-200B/ArXiv/ArXiv', | ||
'data/slim_pajama/tokenized_train_100B-200B/Book/Book', | ||
'data/slim_pajama/tokenized_train_100B-200B/C4/C4', | ||
'data/slim_pajama/tokenized_train_100B-200B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/tokenized_train_100B-200B/Github/Github', | ||
'data/slim_pajama/tokenized_train_100B-200B/StackExchange/StackExchange', | ||
'data/slim_pajama/tokenized_train_100B-200B/CommonCrawl/CommonCrawl', | ||
], | ||
"train-data-weights": [ | ||
4.03666599074094, | ||
3.927523855378127, | ||
25.467175464208918, | ||
3.7984379710376293, | ||
4.990226864678155, | ||
3.1957646326079723, | ||
49.58420522134826, | ||
], | ||
"train-iters": 44229, | ||
"lr-decay-iters": 44229, | ||
"train-dataset-name": 'slim_pajama_100B_2', | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/tokenized_train_100B-200B/ArXiv/ArXiv', | ||
'data/slim_pajama/tokenized_train_100B-200B/Book/Book', | ||
'data/slim_pajama/tokenized_train_100B-200B/C4/C4', | ||
'data/slim_pajama/tokenized_train_100B-200B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/tokenized_train_100B-200B/Github/Github', | ||
'data/slim_pajama/tokenized_train_100B-200B/StackExchange/StackExchange', | ||
'data/slim_pajama/tokenized_train_100B-200B/CommonCrawl/CommonCrawl', | ||
|
||
'data/pile_replay_shards/replay_10B_2/splits', | ||
|
||
'data/sp_replay_shards/100B_1_shard1/ArXiv/ArXiv', | ||
'data/sp_replay_shards/100B_1_shard1/Book/Book', | ||
'data/sp_replay_shards/100B_1_shard1/C4/C4', | ||
'data/sp_replay_shards/100B_1_shard1/Wikipedia/Wikipedia', | ||
'data/sp_replay_shards/100B_1_shard1/Github/Github', | ||
'data/sp_replay_shards/100B_1_shard1/StackExchange/StackExchange', | ||
'data/sp_replay_shards/100B_1_shard1/CommonCrawl/CommonCrawl', | ||
], | ||
"train-data-weights": [ | ||
4.03666599074094, | ||
3.927523855378127, | ||
25.467175464208918, | ||
3.7984379710376293, | ||
4.990226864678155, | ||
3.1957646326079723, | ||
49.58420522134826, | ||
|
||
3.8125, | ||
|
||
# Weights for the seven 100B_1_shard1 replay paths; total: 1.1875 | ||
0.04337997179394097, | ||
0.04880477004015989, | ||
0.3205243831725252, | ||
0.0475528531698962, | ||
0.06249330493660256, | ||
0.03976979771580249, | ||
0.6249749191710727, | ||
], | ||
"train-iters": 44229, | ||
"lr-decay-iters": 44229, | ||
"train-dataset-name": 'slim_pajama_100B_2_replay5', | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/tokenized_train_200B-300B/ArXiv/ArXiv', | ||
'data/slim_pajama/tokenized_train_200B-300B/Book/Book', | ||
'data/slim_pajama/tokenized_train_200B-300B/C4/C4', | ||
'data/slim_pajama/tokenized_train_200B-300B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/tokenized_train_200B-300B/Github/Github', | ||
'data/slim_pajama/tokenized_train_200B-300B/StackExchange/StackExchange', | ||
'data/slim_pajama/tokenized_train_200B-300B/CommonCrawl/CommonCrawl', | ||
], | ||
"train-data-weights": [ | ||
3.491756366873565, | ||
4.084283062119696, | ||
25.524317038754475, | ||
3.8109321899190314, | ||
4.89534056131328, | ||
3.254459546224121, | ||
49.93891123479581, | ||
], | ||
"train-iters": 44229, | ||
"lr-decay-iters": 44229, | ||
"train-dataset-name": 'slim_pajama_100B_3', | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/tokenized_train_200B-300B/ArXiv/ArXiv', | ||
'data/slim_pajama/tokenized_train_200B-300B/Book/Book', | ||
'data/slim_pajama/tokenized_train_200B-300B/C4/C4', | ||
'data/slim_pajama/tokenized_train_200B-300B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/tokenized_train_200B-300B/Github/Github', | ||
'data/slim_pajama/tokenized_train_200B-300B/StackExchange/StackExchange', | ||
'data/slim_pajama/tokenized_train_200B-300B/CommonCrawl/CommonCrawl', | ||
|
||
'data/pile_replay_shards/replay_10B_3/splits', | ||
|
||
'data/sp_replay_shards/100B_1_shard2/ArXiv/ArXiv', | ||
'data/sp_replay_shards/100B_1_shard2/Book/Book', | ||
'data/sp_replay_shards/100B_1_shard2/C4/C4', | ||
'data/sp_replay_shards/100B_1_shard2/Wikipedia/Wikipedia', | ||
'data/sp_replay_shards/100B_1_shard2/Github/Github', | ||
'data/sp_replay_shards/100B_1_shard2/StackExchange/StackExchange', | ||
'data/sp_replay_shards/100B_1_shard2/CommonCrawl/CommonCrawl', | ||
|
||
'data/sp_replay_shards/100B_2_shard1/ArXiv/ArXiv', | ||
'data/sp_replay_shards/100B_2_shard1/Book/Book', | ||
'data/sp_replay_shards/100B_2_shard1/C4/C4', | ||
'data/sp_replay_shards/100B_2_shard1/Wikipedia/Wikipedia', | ||
'data/sp_replay_shards/100B_2_shard1/Github/Github', | ||
'data/sp_replay_shards/100B_2_shard1/StackExchange/StackExchange', | ||
'data/sp_replay_shards/100B_2_shard1/CommonCrawl/CommonCrawl', | ||
], | ||
"train-data-weights": [3.491756366873565, | ||
4.084283062119696, | ||
25.524317038754475, | ||
3.8109321899190314, | ||
4.89534056131328, | ||
3.254459546224121, | ||
49.93891123479581, | ||
|
||
3.088125, | ||
|
||
# Weights for the seven 100B_1_shard2 replay paths; total: 0.961875 | ||
0.03513777715309219, | ||
0.03953186373252951, | ||
0.2596247503697454, | ||
0.03851781106761592, | ||
0.05061957699864807, | ||
0.03221353614980002, | ||
0.506229684528569, | ||
|
||
# Weights for the seven 100B_2_shard1 replay paths; total: 0.95 | ||
0.0403666599074094, | ||
0.03927523855378127, | ||
0.25467175464208913, | ||
0.03798437971037629, | ||
0.049902268646781545, | ||
0.03195764632607972, | ||
0.4958420522134826, | ||
], | ||
"train-iters": 44229, | ||
"lr-decay-iters": 44229, | ||
"train-dataset-name": 'slim_pajama_100B_3_replay5', | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/train_150B/ArXiv/ArXiv', | ||
'data/slim_pajama/train_150B/Book/Book', | ||
'data/slim_pajama/train_150B/C4/C4', | ||
'data/slim_pajama/train_150B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/train_150B/Github/Github', | ||
'data/slim_pajama/train_150B/StackExchange/StackExchange', | ||
'data/slim_pajama/train_150B/CommonCrawl/CommonCrawl',], | ||
"train-data-weights": [ | ||
4.576447650075095, | ||
4.198505982426652, | ||
26.62982374026485, | ||
3.9945183507095225, | ||
5.218824282422116, | ||
3.372167199706489, | ||
52.00971279439528 | ||
], | ||
"train-dataset-name": 'slim_pajama_150B', | ||
"train-iters": 66342, | ||
"lr-decay-iters": 66342, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
# Weighted datasets: train-data-paths and train-data-weights are parallel lists (one weight per path). | ||
"train-data-paths": [ | ||
'data/slim_pajama/tokenized_train_0-200B/ArXiv/ArXiv', | ||
'data/slim_pajama/tokenized_train_0-200B/Book/Book', | ||
'data/slim_pajama/tokenized_train_0-200B/C4/C4', | ||
'data/slim_pajama/tokenized_train_0-200B/Wikipedia/Wikipedia', | ||
'data/slim_pajama/tokenized_train_0-200B/Github/Github', | ||
'data/slim_pajama/tokenized_train_0-200B/StackExchange/StackExchange', | ||
'data/slim_pajama/tokenized_train_0-200B/CommonCrawl/CommonCrawl', | ||
], | ||
"train-data-weights": [ | ||
3.4703977435152775, | ||
3.904381603212791, | ||
25.641950653802013, | ||
3.804228253591696, | ||
4.9994643949282045, | ||
3.1815838172641993, | ||
49.99799353368582, | ||
], | ||
"train-iters": 88457, | ||
"lr-decay-iters": 88457, | ||
"train-dataset-name": 'slim_pajama_200B_1', | ||
} |
Oops, something went wrong.