Skip to content

Commit

Permalink
Remove the old 0715, 0122 and 0604 data sets for Chrome.
Browse files Browse the repository at this point in the history
Adjust scripts accordingly and eliminate old unittest file as well.
As of this revision, the default for compilation in all the scripts
EXCEPT for compile_full.sh is the 20141015 dataset. The compile_full.sh
script still builds the most recent pre-20141015 data set, and should
also be updated soon.

The suffix (0122, 0715, 0604, or 20141015) has simply been deleted from
all affected filenames, e.g.:
  cld2_generated_deltaoctachrome0122.cc -> cld2_generated_deltaoctachrome.cc

Moving forward, the intention is not to keep the old data files around. Repository
history can be used to check out old versions of data files as necessary.

The clean.sh script has also been updated with the names of the new executables
produced in the compilation and testing process along with the svn:ignore
propset.

In the next commit, the compile20141015 script will become compile.sh.



git-svn-id: https://cld2.googlecode.com/svn/trunk@190 b252ecd4-b096-bf77-eb8e-91563289f87e
  • Loading branch information
andrewhayden@google.com committed Nov 6, 2014
1 parent 077be12 commit b2c2d34
Show file tree
Hide file tree
Showing 21 changed files with 89 additions and 294,433 deletions.
File renamed without changes.
4,621 changes: 0 additions & 4,621 deletions internal/cld2_generated_deltaoctachrome0122.cc

This file was deleted.

4,547 changes: 0 additions & 4,547 deletions internal/cld2_generated_deltaoctachrome0614.cc

This file was deleted.

2,228 changes: 0 additions & 2,228 deletions internal/cld2_generated_distinctoctachrome0122.cc

This file was deleted.

2,188 changes: 0 additions & 2,188 deletions internal/cld2_generated_distinctoctachrome0604.cc

This file was deleted.

52,761 changes: 0 additions & 52,761 deletions internal/cld2_generated_quadchrome0122_16.cc

This file was deleted.

63,704 changes: 0 additions & 63,704 deletions internal/cld2_generated_quadchrome0122_19.cc

This file was deleted.

81,713 changes: 0 additions & 81,713 deletions internal/cld2_generated_quadchrome0122_2.cc

This file was deleted.

82,295 changes: 0 additions & 82,295 deletions internal/cld2_generated_quadchrome0715.cc

This file was deleted.

File renamed without changes.
File renamed without changes.
58 changes: 47 additions & 11 deletions internal/cld2_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,26 +145,46 @@ static const TestPair kTestPair[] = {
{YIDDISH, kTeststr_yi_Hebr},

// Added 2013.08.31 so-Latn ig-Latn ha-Latn yo-Latn zu-Latn
{SOMALI, kTeststr_so_Latn},
{IGBO, kTeststr_ig_Latn},
{HAUSA, kTeststr_ha_Latn},
{YORUBA, kTeststr_yo_Latn},
{ZULU, kTeststr_zu_Latn},
// Deleted 2014.10.15 so-Latn ig-Latn ha-Latn yo-Latn zu-Latn
//{SOMALI, kTeststr_so_Latn},
//{IGBO, kTeststr_ig_Latn},
//{HAUSA, kTeststr_ha_Latn},
//{YORUBA, kTeststr_yo_Latn},
//{ZULU, kTeststr_zu_Latn},

// Added 2014.01.22 bs-Latn
{BOSNIAN, kTeststr_bs_Latn},

// 2 statistically-close languages
// Added 2014.10.15
{KAZAKH, kTeststr_kk_Cyrl},
{KURDISH, kTeststr_ku_Latn}, // aka kmr
{KYRGYZ, kTeststr_ky_Cyrl},
{MALAGASY, kTeststr_mg_Latn},
{MALAYALAM, kTeststr_ml_Mlym},
{BURMESE, kTeststr_my_Mymr},
{NYANJA, kTeststr_ny_Latn},
{SINHALESE, kTeststr_si_Sinh}, // aka SINHALA
{SESOTHO, kTeststr_st_Latn},
{SUNDANESE, kTeststr_su_Latn},
{TAJIK, kTeststr_tg_Cyrl},
{UZBEK, kTeststr_uz_Latn},
{UZBEK, kTeststr_uz_Cyrl},

// 2 statistically-close languages
{INDONESIAN, kTeststr_id_close},
{MALAY, kTeststr_ms_close},

// Simple intermixed French/English text
{FRENCH, kTeststr_fr_en_Latn},

// Simple English with bad UTF-8
{UNKNOWN_LANGUAGE, kTeststr_en_Latn_bad_UTF8},

// Cross-check the main quadgram table build date
// Change the expected language each time it is rebuilt
//{WELSH, kTeststr_version}, // 2013.07.15
{AZERBAIJANI, kTeststr_version}, // 2014.01.31

// {WELSH, kTeststr_version}, // 2013.07.15
// {AZERBAIJANI, kTeststr_version}, // 2014.01.31
{TURKISH, kTeststr_version}, // 2014.10.16

{UNKNOWN_LANGUAGE, NULL}, // Must be last
};
Expand All @@ -183,8 +203,9 @@ bool OneTest(int flags, bool get_vector,
ResultChunkVector resultchunkvector;
int text_bytes;
bool is_reliable;
int valid_prefix_bytes;

Language lang_detected = ExtDetectLanguageSummary(
Language lang_detected = ExtDetectLanguageSummaryCheckUTF8(
buffer,
buffer_length,
is_plain_text,
Expand All @@ -195,10 +216,17 @@ bool OneTest(int flags, bool get_vector,
normalized_score3,
get_vector ? &resultchunkvector : NULL,
&text_bytes,
&is_reliable);
&is_reliable,
&valid_prefix_bytes);
// expose DumpExtLang DumpLanguages
bool good_utf8 = (valid_prefix_bytes == buffer_length);
if (!good_utf8) {
fprintf(stderr, "*** Bad UTF-8 after %d bytes<br>\n", valid_prefix_bytes);
fprintf(stdout, "*** Bad UTF-8 after %d bytes\n", valid_prefix_bytes);
}

bool ok = (lang_detected == lang_expected);
ok &= good_utf8;

if (!ok) {
if ((flags & kCLDFlagHtml) != 0) {
Expand Down Expand Up @@ -305,6 +333,10 @@ int RunTests (int flags, bool get_vector) {
const char* buffer = kTestPair[i].text;
int buffer_length = strlen(buffer);
bool ok = OneTest(flags, get_vector, lang_expected, buffer, buffer_length);
if (kTestPair[i].text == kTeststr_en_Latn_bad_UTF8) {
// We expect this one to fail, so flip the value of ok
ok = !ok;
}
any_fail |= (!ok);
++i;
}
Expand Down Expand Up @@ -350,6 +382,10 @@ int RunTests (int flags, bool get_vector) {
const char* buffer = kTestPair[i].text;
int buffer_length = strlen(buffer);
bool ok = OneTest(flags, get_vector, lang_expected, buffer, buffer_length);
if (kTestPair[i].text == kTeststr_en_Latn_bad_UTF8) {
// We expect this one to fail, so flip the value of ok
ok = !ok;
}
any_fail |= (!ok);
++i;
}
Expand Down
Loading

0 comments on commit b2c2d34

Please sign in to comment.