-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
remove news summary dataset from repo, download using script in sample #2250
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# download-dataset.py
#
# Download a fraction of a Hugging Face summarization dataset (default:
# cnn_dailymail 3.0.0) as JSON-lines files, derive train/validation/test
# splits, write 20% "small_" subsets for quick experiments, and emit a
# one-record sample_score.json payload for endpoint scoring.

import argparse
import json
import os

import pandas as pd
from datasets import get_dataset_split_names, load_dataset

parser = argparse.ArgumentParser()
# dataset name to download from the Hugging Face hub
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# config name (dataset version) of the dataset
parser.add_argument(
    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# fraction of each split to keep (the full dataset is large)
parser.add_argument(
    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
)
# directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
os.makedirs(args.download_dir, exist_ok=True)

# download each available split, keeping only the requested fraction
for split in get_dataset_split_names(args.dataset, config_name=args.config_name):
    print(f"Loading {split} split of {args.dataset} dataset...")
    dataset = load_dataset(args.dataset, args.config_name, split=split)
    # save the head of the split to the download directory as a JSON-lines file
    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
        os.path.join(args.download_dir, f"{split}.jsonl")
    )

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)
# this dataset doesn't have test data, so split validation 50/50 into test/validation
test_df = validation_df.sample(frac=0.5, random_state=42)
validation_df.drop(test_df.index, inplace=True)
# drop the id column as it is not needed for fine-tuning
for df in (train_df, validation_df, test_df):
    df.drop(columns=["id"], inplace=True)

# save 20% of the rows from each dataframe into files with a small_ prefix
train_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)

# generate sample scoring data: re-read the small test file and take 1 random row
test_df = pd.read_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)
test_df = test_df.sample(n=1)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# rename the highlights column to ground_truth_summary
test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
# json payload: key "inputs" maps to the list of article texts to summarize
test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
# save the payload as sample_score.json in the download directory
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,9 +3,9 @@ set -x | |
# the data files are available in the same folder as the above notebook | ||
|
||
# script inputs — fill in with your own Azure details before running.
# These MUST stay as placeholders in the repository: committed real values
# leak workspace identifiers, and the pre-requisite check below refuses to
# run until the placeholders are replaced.
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
registry_name="azureml"
|
||
compute_cluster="gpu-cluster-big" | ||
|
@@ -27,15 +27,15 @@ deployment_sku="Standard_DS3_v2" | |
|
||
|
||
# training data | ||
train_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_train.jsonl" | ||
train_data="./news-summary-dataset/small_train.jsonl" | ||
# validation data | ||
validation_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_validation.jsonl" | ||
validation_data="./news-summary-dataset/small_validation.jsonl" | ||
# test data | ||
test_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_test.jsonl" | ||
test_data="./news-summary-dataset/small_test.jsonl" | ||
# evaluation config | ||
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json" | ||
evaluation_config="./summarization-config.json" | ||
# scoring_file | ||
scoring_file="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/sample_score.json" | ||
scoring_file="./news-summary-dataset/sample_score.json" | ||
|
||
# finetuning job parameters | ||
finetuning_pipeline_component="summarization_pipeline" | ||
|
@@ -49,6 +49,8 @@ number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs avail | |
num_train_epochs=3 | ||
learning_rate=2e-5 | ||
|
||
|
||
|
||
# 1. Setup pre-requisites | ||
|
||
if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \ | ||
|
@@ -73,6 +75,13 @@ else | |
} | ||
fi | ||
|
||
# download the dataset | ||
|
||
python ./download-dataset.py || { | ||
echo "Failed to download dataset" | ||
exit 1 | ||
} | ||
|
||
# 2. Check if the model exists in the registry | ||
# need to confirm model show command works for registries outside the tenant (aka system registry) | ||
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"], | ||
"aggregator": true, | ||
"stemmer": true | ||
} |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
already imported on line 1