diff --git a/CHANGELOG.md b/CHANGELOG.md index 012a6921..e2fda163 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ Unreleased - Scraped Kashmiri (`kas`). (\#431) - Added Malayalam (`mal`) scrape. (\#434) - Configuration and initial scrape for Dhivehi (`div`, Maldivian). (\#437) +- Gujarati scrape (`guj`). (\#445) #### Changed diff --git a/data/scrape/README.md b/data/scrape/README.md index 35e4ee53..12f83079 100644 --- a/data/scrape/README.md +++ b/data/scrape/README.md @@ -119,6 +119,7 @@ | [TSV](tsv/gre_grek_broad_filtered.tsv) | gre | Modern Greek (1453-) | Greek | Greek | | True | Broad | True | 10,547 | | [TSV](tsv/gre_grek_narrow.tsv) | gre | Modern Greek (1453-) | Greek | Greek | | False | Narrow | True | 408 | | [TSV](tsv/gsw_latn_broad.tsv) | gsw | Swiss German | Alemannic German | Latin | | False | Broad | True | 307 | +| [TSV](tsv/guj_gujr_broad.tsv) | guj | Gujarati | Gujarati | Gujarati | | False | Broad | False | 120 | | [TSV](tsv/haw_latn_broad.tsv) | haw | Hawaiian | Hawaiian | Latin | | False | Broad | True | 493 | | [TSV](tsv/haw_latn_narrow.tsv) | haw | Hawaiian | Hawaiian | Latin | | False | Narrow | True | 536 | | [TSV](tsv/hbs_cyrl_broad.tsv) | hbs | Serbo-Croatian | Serbo-Croatian | Cyrillic | | False | Broad | True | 22,735 | @@ -198,7 +199,7 @@ | [TSV](tsv/mah_latn_broad.tsv) | mah | Marshallese | Marshallese | Latin | | False | Broad | True | 900 | | [TSV](tsv/mah_latn_narrow.tsv) | mah | Marshallese | Marshallese | Latin | | False | Narrow | True | 1,502 | | [TSV](tsv/mak_latn_narrow.tsv) | mak | Makasar | Makasar | Latin | | False | Narrow | True | 405 | -| [TSV](tsv/mal_mlym_narrow.tsv) | mal | Malayalam | Malayalam | Malayalam | | False | Narrow | None | 141 | +| [TSV](tsv/mal_mlym_narrow.tsv) | mal | Malayalam | Malayalam | Malayalam | | False | Narrow | False | 141 | | [TSV](tsv/mar_deva_broad.tsv) | mar | Marathi | Marathi | Devanagari | | False | Broad | False | 588 | | [TSV](tsv/mar_deva_narrow.tsv) | mar | Marathi | Marathi | Devanagari | | False | Narrow | False | 118 | | [TSV](tsv/may_arab_ara_broad.tsv) | may | Malay (macrolanguage) | Malay | Arabic | | False | Broad | True | 628 | diff --git a/data/scrape/lib/languages.json b/data/scrape/lib/languages.json index 13976f12..715df443 100644 --- a/data/scrape/lib/languages.json +++ b/data/scrape/lib/languages.json @@ -659,7 +659,11 @@ "iso639_name": "Gujarati", "wiktionary_name": "Gujarati", "wiktionary_code": "gu", - "casefold": false + "casefold": false, + "script": { + "gujr": "Gujarati", + "sinh": "Sinhala" + } }, "afb": { "iso639_name": "Gulf Arabic", diff --git a/data/scrape/tsv/guj_gujr_broad.tsv b/data/scrape/tsv/guj_gujr_broad.tsv new file mode 100644 index 00000000..656be108 --- /dev/null +++ b/data/scrape/tsv/guj_gujr_broad.tsv @@ -0,0 +1,120 @@ +અં ə̃ː +અંક ə ŋ k ə +અંકગણિત ə ŋ k ə ɡ ɳ ɪ t̪ +અંગૂઠો ə ŋ ɡ u ʈʰ ɔ +અંગ્રેજ ə ŋ ɡ ɾ eː d͡ʒ +અઃ ə ɨ +અગિયાર ə ɡ i j ɑ ɾ +અછબડા ə t͡ʃʰ ə b ɖ ɑ +અઢાર ə ɖʱ ɑ ɾ +અતંત્રતા ə t̪ ə n t̪ ɾ ə t̪ ɑ +અધુના ə d̪ʱ u n ɑ +અર્થ ə ɾ t̪ʰ +અહીં ə ɦ ĩ +અહીંતહીં ə ɦ ĩ t̪ ə ɦ ĩ +આ ɑ̈ +આઠ ɑ ʈʰ ə +ઇ ɪ +ઇમ્ફાલ ɪ m pʰ ɑ l +ઈ iː +ઉ u +ઊ uː +ઋ ɾ u +એ e +એક e k ə +એકવીસ e k ə ʋ i s +ઐ ə j +ઓ o +ઓગણીસ o ɡ ə ɳ i s +ઓણ o ɳ ə +ઔ ə ʋ +ક k ə +કરચોરી k ə ɾ t͡ʃ o ɾ i +ક્રીડા k ɾ i ɖ ɑ +ક્ષ k ʃ ə +ખ kʰ ə +ગ ɡ ə +ગાંધીનગર ɡ ɑ̃ d̪ʱ i n ə ɡ ə ɾ +ગુજરાતી ɡ u d ʒ ə ɾ ɑː t̪ i +ગોતું ɡ o t̪ ũ +ઘ ɡʱ ə +ઘર ɡʰ ə r +ઘોડું ɡʱ o ɖ ũ +ઘોડો ɡʱ o ɽ o +ઙ ŋ ə +ચ t͡ʃ ə +ચા t͡ʃ ɑ +ચાર t͡ʃ ə ɾ +ચીકણું t͡ʃ i k ɳ ũ +ચૌદ t͡ʃ ə ʋ d +છ t ʃʰ ə +છોકરમત t ʃʰ o k ə r m ə t +છોકરી t ʃʰ o k r i +છોકરું t ʃʰ o k r ũ +છોકરો t ʃʰ o k r o +જ d͡ʒ ə +જાત d͡ʒ ɑ t̪ +જ્ઞ ɡ n ə +ઝ d͡ʒʱ ə +ઞ ɲ ə +ટ ʈ ə +ઠ ʈʰ ə +ઠોકર ʈʰ o k ə ɾ +ડ ɖ ə +ઢ ɖʱ ə +ણ ɳ ə +ત t ə +તડકો t̪ ə ɖ k o +તેર t e ɾ +ત્રણ t̪ ɾ ɔ ɳ +ત્વરિત t̪ ʋ ə ɾ ɪ t̪ +થ tʰ ə +દ d ə +દશ d ə ʃ +દસ d ə s +ધ dʱ ə +ન n ə +નંબર n ə m b ə ɾ +નવ n ə ʋ +નાન્યતર n ɑ n j ə t̪ ə ɾ +પ p ə +પંદર p ə n d ə ɾ +પાંચ p ɑ̃ː t͡ʃ +પ્રાચી p ɾ ɑ t͡ʃ i +પ્રાચીન p ɾ ɑ t͡ʃ i n +ફ pʰ ə +ફ્રાંસીસી pʰ ɾ ɑ̃ː s i s i +બ b ə +બાર b ɑ ɾ +બે b e +ભ bʱ ə +ભારત bʱ ɑ̈ ɾ ə t +મ m ə +માટે m ɑ ʈ e +ય j ə +ર ɾ ə +રામાયણ ɾ ɑ m ɑ j ə ɳ +લ l ə +લિંગ l ɪ ŋ ɡ +લુપ્ત l u p t̪ +લોપ l o p +ળ ɭ ə +વ v ə +વધારે ʋ ə d̪ʱ ɑ ɾ e +વસ્તુ ʋ ə s t̪ u +વાઘણ ʋ ɑ ɡʱ ə ɳ +વિક્ષનરી ʋ i k ʃ ə n ə ɾ i +વિહોણું ʋ ɪ ɦ o ɳ ũ +વીસ ʋ iː s +શ ʃ ə +શાને ʃ ɑ n eː +શૂન્ય ʃ u n j ə +શેકવું ʃ eː k ʋ ũ +ષ ʂ ə +સ s ə +સત્તર s ə t t ə r +સાત s ɑ t +સિંહ s ĩ ɦ +સિંહ ʃ ĩ ɦ +સોળ s ɔ ɭ +હ ɦ ə diff --git a/data/scrape/tsv_summary.tsv b/data/scrape/tsv_summary.tsv index 5173c386..55b52ecb 100644 --- a/data/scrape/tsv_summary.tsv +++ b/data/scrape/tsv_summary.tsv @@ -117,6 +117,7 @@ gre_grek_broad.tsv gre Modern Greek (1453-) Greek Greek False Broad True 10626 gre_grek_broad_filtered.tsv gre Modern Greek (1453-) Greek Greek True Broad True 10547 gre_grek_narrow.tsv gre Modern Greek (1453-) Greek Greek False Narrow True 408 gsw_latn_broad.tsv gsw Swiss German Alemannic German Latin False Broad True 307 +guj_gujr_broad.tsv guj Gujarati Gujarati Gujarati False Broad False 120 haw_latn_broad.tsv haw Hawaiian Hawaiian Latin False Broad True 493 haw_latn_narrow.tsv haw Hawaiian Hawaiian Latin False Narrow True 536 hbs_cyrl_broad.tsv hbs Serbo-Croatian Serbo-Croatian Cyrillic False Broad True 22735 @@ -196,7 +197,7 @@ mac_cyrl_narrow.tsv mac Macedonian Macedonian Cyrillic False Narrow True 6878 mah_latn_broad.tsv mah Marshallese Marshallese Latin False Broad True 900 mah_latn_narrow.tsv mah Marshallese Marshallese Latin False Narrow True 1502 mak_latn_narrow.tsv mak Makasar Makasar Latin False Narrow True 405 -mal_mlym_narrow.tsv mal Malayalam Malayalam Malayalam False Narrow 141 +mal_mlym_narrow.tsv mal Malayalam Malayalam Malayalam False Narrow False 141 mar_deva_broad.tsv mar Marathi Marathi Devanagari False Broad False 588 mar_deva_narrow.tsv mar Marathi Marathi Devanagari False Narrow False 118 may_arab_ara_broad.tsv may Malay (macrolanguage) Malay Arabic False Broad True 628