Skip to content

Commit

Permalink
Model Evaluation sample notebooks - Enabling camembert-base for Fill Mask task (#2358)
Browse files Browse the repository at this point in the history

Co-authored-by: Sarthak Singhal <sarsinghal@microsoft.com>
  • Loading branch information
sarthaks95 and Sarthak Singhal committed Jun 12, 2023
1 parent 1c7d296 commit 2ed10b2
Showing 1 changed file with 16 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,13 @@
" {\"name\": \"bert-base-uncased\"},\n",
" {\"name\": \"bert-large-cased\"},\n",
" {\"name\": \"bert-large-uncased\"},\n",
" # {\"name\": \"camembert-base\"},\n",
" {\"name\": \"camembert-base\"},\n",
" {\"name\": \"distilbert-base-cased\"},\n",
" {\"name\": \"distilbert-base-uncased\"},\n",
" {\"name\": \"distilroberta-base\"},\n",
" {\"name\": \"microsoft-deberta-base\", \"pretrained\": \"microsoft/deberta-base\"},\n",
" {\"name\": \"microsoft-deberta-large\", \"pretrained\": \"microsoft/deberta-large\"},\n",
" {\"name\": \"microsoft-deberta-xlarge\", \"pretrained\": \"microsoft/deberta-xlarge\"},\n",
" {\"name\": \"microsoft-deberta-base\", \"hf_id\": \"microsoft/deberta-base\"},\n",
" {\"name\": \"microsoft-deberta-large\", \"hf_id\": \"microsoft/deberta-large\"},\n",
" {\"name\": \"microsoft-deberta-xlarge\", \"hf_id\": \"microsoft/deberta-xlarge\"},\n",
" {\"name\": \"roberta-base\"},\n",
" {\"name\": \"roberta-large\"},\n",
"]"
Expand Down Expand Up @@ -310,19 +310,26 @@
"from transformers import AutoTokenizer\n",
"\n",
"for model in models:\n",
" tokenizer = AutoTokenizer.from_pretrained(model.get(\"pretrained\", model[\"name\"]))\n",
" tokenizer = AutoTokenizer.from_pretrained(model.get(\"hf_id\", model[\"name\"]))\n",
" test_data_df[\"input_string\"] = test_data_df[\"texts\"].apply(\n",
" lambda x: tokenizer.decode(\n",
" tokenizer.encode(\n",
" x.replace(\"<mask>\", tokenizer.mask_token),\n",
" max_length=512,\n",
" max_length=500,\n",
" truncation=True,\n",
" )[:500]\n",
" )\n",
" )\n",
" )\n",
" test_data_fil_df = test_data_df[\n",
" test_data_df[\"input_string\"].str.contains(tokenizer.mask_token)\n",
" ].reset_index(drop=True)\n",
" print(\"{} - {}\".format(model[\"name\"], test_data_fil_df.shape))\n",
" test_data_file_name = \"small-test-{}.jsonl\".format(model[\"name\"])\n",
" test_data_df.to_json(test_data_file_name, lines=True, orient=\"records\")"
]
" test_data_fil_df.to_json(test_data_file_name, lines=True, orient=\"records\")"
],
"metadata": {
"collapsed": false
}
},
{
"attachments": {},
Expand Down

0 comments on commit 2ed10b2

Please sign in to comment.