Skip to content

Commit

Permalink
Fixed issue #2. No more hard-coded tablesize in char distribution ana…
Browse files Browse the repository at this point in the history
…lysers
  • Loading branch information
rudi.pettazzi@gmail.com committed May 14, 2010
1 parent 5bafb7e commit d17ab72
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 29 deletions.
36 changes: 9 additions & 27 deletions src/Library/Ude.Core/CharDistributionAnalyser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,29 +47,24 @@ namespace Ude.Core
/// </summary>
public abstract class CharDistributionAnalyser
{

protected const float SURE_YES = 0.99f;
protected const float SURE_NO = 0.01f;
protected const int MINIMUM_DATA_THRESHOLD = 4;
protected const int ENOUGH_DATA_THRESHOLD = 1024;

//If this flag is set to PR_TRUE, detection is done and conclusion has been made
// If this flag is set to true, detection is done and conclusion has been made
protected bool done;

// The number of characters whose frequency order is less than 512
protected int freqChars;

//Total character encounted.
// Total characters encountered.
protected int totalChars;

// Mapping table to get frequency order from char order (as returned by GetOrder())
protected int[] charToFreqOrder;

// Size of above table
protected int tableSize;

//This is a constant value varies from language to language, it is used
// in calculating confidence.
// This constant value varies from language to language. It is used in calculating confidence.
protected float typicalDistributionRatio;

public CharDistributionAnalyser()
Expand Down Expand Up @@ -104,7 +99,7 @@ public void HandleOneChar(byte[] buf, int offset, int charLen)
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
if (order >= 0) {
totalChars++;
if (order < tableSize) { // order is valid
if (order < charToFreqOrder.Length) { // order is valid
if (512 > charToFreqOrder[order])
freqChars++;
}
Expand Down Expand Up @@ -165,8 +160,6 @@ public class GB18030DistributionAnalyser : CharDistributionAnalyser

private static float GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9f;

private static int GB2312_TABLE_SIZE = 3760;

private static int[] GB2312_CHAR2FREQ_ORDER = {
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
Expand Down Expand Up @@ -602,7 +595,6 @@ public class GB18030DistributionAnalyser : CharDistributionAnalyser
public GB18030DistributionAnalyser() : base()
{
charToFreqOrder = GB2312_CHAR2FREQ_ORDER;
tableSize = GB2312_TABLE_SIZE;
typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
}

Expand Down Expand Up @@ -643,8 +635,6 @@ public class EUCTWDistributionAnalyser : CharDistributionAnalyser

private static float EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75f;

private static int EUCTW_TABLE_SIZE = 8102;

private static int[] EUCTW_CHAR2FREQ_ORDER = {
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, // 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, // 2758
Expand Down Expand Up @@ -1033,7 +1023,6 @@ public class EUCTWDistributionAnalyser : CharDistributionAnalyser
public EUCTWDistributionAnalyser()
{
charToFreqOrder = EUCTW_CHAR2FREQ_ORDER;
tableSize = EUCTW_TABLE_SIZE;
typicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO;
}

Expand All @@ -1053,7 +1042,8 @@ public override int GetOrder(byte[] buf, int offset)

public class EUCKRDistributionAnalyser : CharDistributionAnalyser
{
// Sampling from about 20M text materials include literature and computer technology
// Sampled from about 20M of text materials, including literature and computer technology

/*
* 128 --> 0.79
* 256 --> 0.92
Expand All @@ -1067,8 +1057,6 @@ public class EUCKRDistributionAnalyser : CharDistributionAnalyser

public const float EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0f;

public const int EUCKR_TABLE_SIZE = 2352;

// Char to FreqOrder table
public static int[] EUCKR_CHAR2FREQ_ORDER = {
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
Expand Down Expand Up @@ -1627,7 +1615,6 @@ public class EUCKRDistributionAnalyser : CharDistributionAnalyser
public EUCKRDistributionAnalyser()
{
charToFreqOrder = EUCKR_CHAR2FREQ_ORDER;
tableSize = EUCKR_TABLE_SIZE;
typicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO;
}

Expand Down Expand Up @@ -1665,8 +1652,6 @@ public class BIG5DistributionAnalyser : CharDistributionAnalyser

private static float BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75f;

private static int BIG5_TABLE_SIZE = 5376;

private static int[] BIG5_CHAR2FREQ_ORDER = {
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, // 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, // 32
Expand Down Expand Up @@ -2553,7 +2538,6 @@ public class BIG5DistributionAnalyser : CharDistributionAnalyser
public BIG5DistributionAnalyser()
{
charToFreqOrder = BIG5_CHAR2FREQ_ORDER;
tableSize = BIG5_TABLE_SIZE;
typicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO;
}

Expand All @@ -2577,9 +2561,9 @@ public override int GetOrder(byte[] buf, int offset)

public class SJISDistributionAnalyser : CharDistributionAnalyser
{
//Sampling from about 20M text materials include literature and computer technology
// Sampled from about 20M of text materials, including literature and computer technology
// Japanese frequency table, applied to both S-JIS and EUC-JP
//They are sorted in order.
// They are sorted in order.

/******************************************************************************
* 128 --> 0.77094
Expand All @@ -2596,8 +2580,6 @@ public class SJISDistributionAnalyser : CharDistributionAnalyser

protected static float SJIS_TYPICAL_DISTRIBUTION_RATIO = 3.0f;

protected static int SJIS_TABLE_SIZE = 4368;

protected static int[] SJIS_CHAR2FREQ_ORDER = {
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, // 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, // 32
Expand Down Expand Up @@ -3123,10 +3105,10 @@ public class SJISDistributionAnalyser : CharDistributionAnalyser
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271, // 8272
****************************************************************************************/
};

public SJISDistributionAnalyser()
{
charToFreqOrder = SJIS_CHAR2FREQ_ORDER;
tableSize = SJIS_TABLE_SIZE;
typicalDistributionRatio = SJIS_TYPICAL_DISTRIBUTION_RATIO;
}

Expand Down
2 changes: 1 addition & 1 deletion src/Library/Ude.Core/EUCTWProber.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public override ProbingState HandleData(byte[] buf, int offset, int len)

public override string GetCharsetName()
{
return "x-euc-tw";
return "EUC-TW";
}

public override void Reset()
Expand Down
14 changes: 14 additions & 0 deletions src/Tests/Data/euctw/euc-tw1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
������̨��Wikipedia��ϯ�����ҳ����ŲĶϡ��ƾ���ơ�ܸĩʵ�����������ɢϯ����������������ľ��

̨����Ͳ��ȡIJĨ��Žġ��ů����ȡĨ��IJĦ��ȭ�������Ĩ����IJ�����Ĥ��������ĨIJ��̺��Ĭȩ��������̺ǩ�Ĩ���ܡ�ھ��ĥŲĶȴʵ�����ȩ��ȴ��ɷ�桢�����������ɢ����̨���

����ġ⤡������������ס��������⦡��ˡ����ġ���ľ�ס�����ơ��ˡ����ס�����ľ�ס�̧ĩ���桢̽����������ľ�������꾡�������ۡ�ǩ����ľ��

��̨ڵͱ���������������̡����ۡ�����Ŵ�С���硣��ջ᫡�̧̺����ƫ����Ժ���������¡���ܸ����Դڵ����꼡���̱��̧��̧��̧ĩ����Ь̨������ҡ���ɭ������

Ļ������ᬡ�����ϯ������������������ƫ̶������ƫ�������ŲĶ�֡�

��������������ȧȡĤ�ʡ�����ڵ��Ĩ��ġ��ĬIJĦ��

commons:̺��
���������̦�����������ꡫ������̨��
3 changes: 2 additions & 1 deletion src/Tests/Ude.Tests.mdp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project name="Ude.Tests" fileversion="2.0" DefaultNamespace="Ude.Tests" language="C#" clr-version="Net_2_0" ctype="DotNetProject">
<Project name="Ude.Tests" fileversion="2.0" language="C#" DefaultNamespace="Ude.Tests" clr-version="Net_2_0" targetFramework="2.0" ctype="DotNetProject">
<Configurations active="Debug|Any CPU">
<Configuration name="Debug|Any CPU" ctype="DotNetProjectConfiguration">
<Output directory="bin/Debug/" assemblyKeyFile="." assembly="Ude.Tests" />
Expand Down Expand Up @@ -60,6 +60,7 @@
<File name="Data/windows1255/he1.txt" subtype="Code" buildaction="Nothing" />
<File name="Data/windows1255/he2.txt" subtype="Code" buildaction="Nothing" />
<File name="Data/windows1255/he3.txt" subtype="Code" buildaction="Nothing" />
<File name="Data/euctw/euc-tw1.txt" subtype="Code" buildaction="Nothing" />
</Contents>
<References>
<ProjectReference type="Gac" localcopy="True" refto="System, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" />
Expand Down
1 change: 1 addition & 0 deletions src/Tests/Ude.Tests/CharsetDetectorTestBatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public void TestCJK()
Process(Charsets.SHIFT_JIS, "shiftjis");
Process(Charsets.EUCJP, "eucjp");
Process(Charsets.EUCKR, "euckr");
Process(Charsets.EUCTW, "euctw");
Process(Charsets.ISO2022_JP, "iso2022jp");
Process(Charsets.ISO2022_KR, "iso2022kr");
}
Expand Down

0 comments on commit d17ab72

Please sign in to comment.