
Commit

remove news summary dataset from repo, download using script in sample (#2250)

* remove news summary dataset from repo, download using script in sample

* fix to cli sample to remove dataset from repo

* typo fix

* formatting
ManojBableshwar committed May 10, 2023
1 parent 0abe0b5 commit 0b96822
Showing 12 changed files with 134 additions and 11,249 deletions.
@@ -0,0 +1,87 @@
# import library to parse command line arguments
import argparse
import os

import pandas as pd

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset, config_name=args.config_name):
    print(f"Loading {split} split of {args.dataset} dataset...")
    # load the split of the dataset
    dataset = load_dataset(args.dataset, args.config_name, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
        os.path.join(args.download_dir, f"{split}.jsonl")
    )

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)
# hold out half of the validation data as test data for this sample
test_df = validation_df.sample(frac=0.5, random_state=42)
validation_df.drop(test_df.index, inplace=True)
# drop the id column as it is not needed for fine tuning
train_df.drop(columns=["id"], inplace=True)
validation_df.drop(columns=["id"], inplace=True)
test_df.drop(columns=["id"], inplace=True)


# save 20% of the rows from the dataframes into files with small_ prefix in the ./news-summary-dataset folder
train_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)


# generate sample scoring data
# read small_test.jsonl from the download directory into a pandas dataframe
import json

test_df = pd.read_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)
# take 1 random sample
test_df = test_df.sample(n=1)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# rename the highlights column to ground_truth_summary
test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
# create a json object with the key as "inputs" and value as a list of values from the article column of the test dataframe
test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
# save the json object to a file named sample_score.json in the download directory
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
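
For reference, a minimal sanity check of the files this script produces (a sketch only, assuming the default --download_dir of ./news-summary-dataset; the helper below is illustrative and not part of the sample):

# sanity-check the artifacts written by download-dataset.py (illustrative helper)
import json
import os

import pandas as pd

download_dir = "./news-summary-dataset"  # assumes the default --download_dir
for name in ["small_train.jsonl", "small_validation.jsonl", "small_test.jsonl"]:
    df = pd.read_json(os.path.join(download_dir, name), orient="records", lines=True)
    # after the id column is dropped, each record should keep the article and highlights fields
    print(name, len(df), list(df.columns))

with open(os.path.join(download_dir, "sample_score.json")) as f:
    payload = json.load(f)
# the scoring request wraps the article text in {"inputs": {"input_string": [...]}}
print(len(payload["inputs"]["input_string"]), "article(s) in sample_score.json")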

@@ -3,9 +3,9 @@ set -x
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
subscription_id="21d8f407-c4c4-452e-87a4-e609bfb86248" #"<SUBSCRIPTION_ID>"
resource_group_name="rg-contoso-819prod" #"<RESOURCE_GROUP>",
workspace_name="mlw-contoso-819prod" #"WORKSPACE_NAME>",
registry_name="azureml"

compute_cluster="gpu-cluster-big"
@@ -27,15 +27,15 @@ deployment_sku="Standard_DS3_v2"


# training data
train_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_train.jsonl"
train_data="./news-summary-dataset/small_train.jsonl"
# validation data
validation_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_validation.jsonl"
validation_data="./news-summary-dataset/small_validation.jsonl"
# test data
test_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_test.jsonl"
test_data="./news-summary-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json"
evaluation_config="./summarization-config.json"
# scoring_file
scoring_file="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/sample_score.json"
scoring_file="./news-summary-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="summarization_pipeline"
@@ -49,6 +49,8 @@ number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs avail
num_train_epochs=3
learning_rate=2e-5



# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
@@ -73,6 +75,13 @@ else
}
fi

+# download the dataset

+python ./download-dataset.py || {
+    echo "Failed to download dataset"
+    exit 1
+}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name

@@ -0,0 +1,5 @@
{
"metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
"aggregator": true,
"stemmer": true
}
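
These keys mirror the options of the Hugging Face evaluate ROUGE metric; as an illustration only (the evaluation pipeline may wire this config up differently), the equivalent standalone computation would be:

# illustrative only: compute the ROUGE variants this config enables
import evaluate

rouge = evaluate.load("rouge")
scores = rouge.compute(
    predictions=["the court rejected the appeal on tuesday"],  # model summaries
    references=["on tuesday the court rejected the appeal"],   # ground-truth summaries
    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    use_aggregator=True,  # corresponds to "aggregator": true
    use_stemmer=True,     # corresponds to "stemmer": true
)
print(scores)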

@@ -3,10 +3,10 @@

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
"--config_name", type=str, default="plain_text", help="config name of the dataset"
"--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
Expand All @@ -16,7 +16,7 @@
parser.add_argument(
    "--download_dir",
    type=str,
-    default="data",
+    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

This file was deleted.
