remove news summary dataset from repo, download using script in sample #2250

Merged · 4 commits · May 10, 2023 · Changes shown from 2 commits
@@ -0,0 +1,83 @@
# import library to parse command line arguments
import argparse, os
import pandas as pd
import os
Member commented: already imported on line 1
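A minimal sketch of the cleanup this comment points at (not something committed in this PR): keep the single import from line 1 and drop the duplicate below, so the header reads

# import libraries once at the top of the script
import argparse, os
import pandas as pd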


parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset, config_name=args.config_name):
    print(f"Loading {split} split of {args.dataset} dataset...")
    # load the split of the dataset
    dataset = load_dataset(args.dataset, args.config_name, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
        os.path.join(args.download_dir, f"{split}.jsonl")
    )

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(os.path.join(args.download_dir, "validation.jsonl"), lines=True)
# this dataset doesn't have test data, so split the validation_df into test_df and validation_df
test_df = validation_df.sample(frac=0.5, random_state=42)
validation_df.drop(test_df.index, inplace=True)
# drop the id column as it is not needed for fine tuning
train_df.drop(columns=["id"], inplace=True)
validation_df.drop(columns=["id"], inplace=True)
test_df.drop(columns=["id"], inplace=True)


# save 20% of the rows from the dataframes into files with small_ prefix in the ./news-summary-dataset folder
train_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"), orient="records", lines=True
)
test_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)


# generate sample scoring data
# read ./news-summary-dataset/small_test.jsonl into a pandas dataframe
import pandas as pd
Member commented: already imported way up top
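Along the same lines, a consolidated import header for the whole script (a sketch, not part of this PR) would gather every module the file uses in one place:

# everything the script imports, collected at the top of the file
import argparse
import json
import os

import pandas as pd
from datasets import load_dataset, get_dataset_split_names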

import json

test_df = pd.read_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)
# take 1 random sample
test_df = test_df.sample(n=1)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# rename the highlights column to ground_truth_summary
test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
# create a json object with the key as "inputs" and value as a list of values from the article column of the test dataframe
test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
# save the json object to a file named sample_score.json in the ./news-summary-dataset folder
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
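As a quick illustration (not part of the PR), the generated sample_score.json can be sanity-checked by reading it back and confirming the expected shape: a dict whose "inputs" key wraps a list of article strings under "input_string".

# illustrative check of the generated scoring file (assumes the default download_dir)
import json
import os

with open(os.path.join("./news-summary-dataset", "sample_score.json")) as f:
    payload = json.load(f)
assert isinstance(payload["inputs"]["input_string"], list)
print(f"sample_score.json holds {len(payload['inputs']['input_string'])} article(s)")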
@@ -3,9 +3,9 @@ set -x
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
subscription_id="21d8f407-c4c4-452e-87a4-e609bfb86248" #"<SUBSCRIPTION_ID>"
Member commented: What ID is this?

Member commented: Looks like there are test values left here, we need to remove these

resource_group_name="rg-contoso-819prod" #"<RESOURCE_GROUP>",
workspace_name="mlw-contoso-819prod" #"WORKSPACE_NAME>",
registry_name="azureml"

compute_cluster="gpu-cluster-big"
@@ -27,15 +27,15 @@ deployment_sku="Standard_DS3_v2"


# training data
train_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_train.jsonl"
train_data="./news-summary-dataset/small_train.jsonl"
# validation data
validation_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_validation.jsonl"
validation_data="./news-summary-dataset/small_validation.jsonl"
# test data
test_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_test.jsonl"
test_data="./news-summary-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json"
evaluation_config="./summarization-config.json"
# scoring_file
scoring_file="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/sample_score.json"
scoring_file="./news-summary-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="summarization_pipeline"
@@ -49,6 +49,8 @@ number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs avail
num_train_epochs=3
learning_rate=2e-5



# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
@@ -73,6 +75,13 @@ else
}
fi

# download the dataset

python ./download-dataset.py || {
    echo "Failed to download dataset"
    exit 1
}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
@@ -0,0 +1,5 @@
{
    "metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "aggregator": true,
    "stemmer": true
}
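For illustration only, these keys map naturally onto the Hugging Face evaluate ROUGE metric; the sketch below is an assumption about how such a config is typically consumed, not code from this PR.

# illustrative sketch: computing the configured ROUGE metrics with the evaluate library
import json

import evaluate

with open("./summarization-config.json") as f:
    cfg = json.load(f)

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
    rouge_types=cfg["metrics"],
    use_aggregator=cfg["aggregator"],
    use_stemmer=cfg["stemmer"],
)
print(scores)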
@@ -3,10 +3,10 @@

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
-    "--config_name", type=str, default="plain_text", help="config name of the dataset"
+    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
@@ -16,7 +16,7 @@
parser.add_argument(
    "--download_dir",
    type=str,
-    default="data",
+    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

This file was deleted.
