Cleanup of c# syntax
Alexandre Point committed Jun 25, 2014
1 parent 5bcab4d commit 605b03e
Showing 12 changed files with 207 additions and 238 deletions.
2 changes: 1 addition & 1 deletion ModelConverter/Converter.cs
@@ -40,7 +40,7 @@ private static bool ConvertFolder(string folder)
{
try
{
BinaryGisModelWriter writer = new BinaryGisModelWriter();
var writer = new BinaryGisModelWriter();

foreach (string file in Directory.GetFiles(folder))
{
@@ -38,17 +38,16 @@
namespace OpenNLP.Tools.SentenceDetect
{
/// <summary>
/// A sentence detector which uses a model trained on English data (Wall Street
/// Journal text).
/// A sentence detector which uses a model trained on English data
/// (Wall Street Journal text).
/// </summary>
public class EnglishMaximumEntropySentenceDetector : MaximumEntropySentenceDetector
{
/// <summary>
/// Constructor which loads the English sentence detection model
/// transparently.
/// </summary>
public EnglishMaximumEntropySentenceDetector(string name) : base(new SharpEntropy.GisModel(new SharpEntropy.IO.BinaryGisModelReader(name)))
{
}
public EnglishMaximumEntropySentenceDetector(string name):
base(new SharpEntropy.GisModel(new SharpEntropy.IO.BinaryGisModelReader(name))){}
}
}
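
A minimal usage sketch for the detector class above. It assumes the base MaximumEntropySentenceDetector exposes a SentenceDetect(string) method, as in the original SharpNLP sentence detector interface, and that a trained model file is available; the file name and path are illustrative only and not taken from this diff.

using OpenNLP.Tools.SentenceDetect;

class SentenceDetectExample
{
    static void Main()
    {
        // Path to a binary GIS sentence-detection model (hypothetical location and file name).
        var modelPath = @"Resources\Models\EnglishSD.nbin";

        // The constructor shown in the diff loads the model through a BinaryGisModelReader.
        var detector = new EnglishMaximumEntropySentenceDetector(modelPath);

        // Assumed base-class API: split raw text into sentences.
        string[] sentences = detector.SentenceDetect("Hello world. This is a test.");
        foreach (var sentence in sentences)
        {
            System.Console.WriteLine(sentence);
        }
    }
}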
55 changes: 12 additions & 43 deletions OpenNLP/Tools/Tokenize/MaximumEntropyTokenizer.cs
@@ -49,7 +49,7 @@ namespace OpenNLP.Tools.Tokenize
/// </summary>
public class MaximumEntropyTokenizer : ITokenizer
{
internal static Regex AlphaNumeric = new Regex("^[A-Za-z0-9]+$");
internal static Regex AlphaNumeric = new Regex("^[A-Za-z0-9]+$", RegexOptions.Compiled);

/// <summary>
/// the maximum entropy model to use to evaluate contexts.
@@ -61,31 +61,16 @@ public class MaximumEntropyTokenizer : ITokenizer
/// </summary>
private readonly SharpEntropy.IContextGenerator<Tuple<string, int>> _contextGenerator;

/// <summary>
/// Optimization flag to skip alpha numeric tokens for further tokenization
/// </summary>
private bool _mAlphaNumericOptimization;

/// <summary>
/// List of probabilities for each token returned from call to Tokenize()
/// </summary>
private readonly List<double> _tokenProbabilities;
private readonly List<Util.Span> _newTokens;

/// <summary>
/// Used to have the tokenizer ignore tokens which only contain alpha-numeric characters.
/// Optimization flag to skip alpha numeric tokens for further tokenization.
/// </summary>
virtual public bool AlphaNumericOptimization
{
get
{
return _mAlphaNumericOptimization;
}
set
{
_mAlphaNumericOptimization = value;
}
}
virtual public bool AlphaNumericOptimization { get; set; }

/// <summary>
/// Class constructor which takes the string locations of the
@@ -94,21 +79,15 @@ virtual public bool AlphaNumericOptimization
public MaximumEntropyTokenizer(SharpEntropy.IMaximumEntropyModel model)
{
_contextGenerator = new TokenContextGenerator();
_mAlphaNumericOptimization = false;
AlphaNumericOptimization = false;
this._model = model;
_newTokens = new List<Util.Span>();
_tokenProbabilities = new List<double>(50);
}

/// <summary>
/// Tokenizes the string.
/// </summary>
/// <param name="input">
/// The string to be tokenized.
/// </param>
/// <returns>
/// A span array containing individual tokens as elements.
/// </returns>
/// <summary>Tokenizes the string</summary>
/// <param name="input">The string to be tokenized</param>
/// <returns>A span array containing individual tokens as elements</returns>
public virtual Util.Span[] TokenizePositions(string input)
{
Util.Span[] tokens = Split(input);
@@ -159,15 +138,9 @@ public virtual Util.Span[] TokenizePositions(string input)
return _newTokens.ToArray();
}

/// <summary>
/// Tokenize a string.
/// </summary>
/// <param name="input">
/// The string to be tokenized.
/// </param>
/// <returns>
/// A string array containing individual tokens as elements.
/// </returns>
/// <summary>Tokenize a string</summary>
/// <param name="input">The string to be tokenized</param>
/// <returns>A string array containing individual tokens as elements</returns>
public virtual string[] Tokenize(string input)
{
Util.Span[] tokenSpans = TokenizePositions(input);
@@ -184,12 +157,8 @@ public virtual string[] Tokenize(string input)
/// delimited token. Token strings can be constructed from these
/// spans as follows: input.Substring(span.Start, span.End);
/// </summary>
/// <param name="input">
/// string to tokenize.
/// </param>
/// <returns>
/// Array of spans.
/// </returns>
/// <param name="input">string to tokenize</param>
/// <returns>Array of spans</returns>
internal static Util.Span[] Split(string input)
{
int tokenStart = - 1;
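
For context, a short sketch of how the tokenizer refactored above might be driven. The constructor, the AlphaNumericOptimization property, and the Tokenize/TokenizePositions methods are the ones shown in this diff; the model file name and path are an assumption for illustration, following the loading pattern used by the sentence detector earlier in the commit.

using OpenNLP.Tools.Tokenize;
using SharpEntropy;
using SharpEntropy.IO;

class TokenizerExample
{
    static void Main()
    {
        // Load a maximum entropy model the same way the sentence detector does (hypothetical path).
        IMaximumEntropyModel model = new GisModel(new BinaryGisModelReader(@"Resources\Models\EnglishTok.nbin"));

        var tokenizer = new MaximumEntropyTokenizer(model)
        {
            // Skip purely alpha-numeric tokens instead of re-tokenizing them.
            AlphaNumericOptimization = true
        };

        string input = "Mr. Smith isn't here.";

        // Token strings, one element per token.
        string[] tokens = tokenizer.Tokenize(input);

        // The same tokens as spans over the original input string.
        var spans = tokenizer.TokenizePositions(input);
    }
}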
53 changes: 39 additions & 14 deletions OpenNLP/Tools/Tokenize/TokenContextGenerator.cs
@@ -43,7 +43,13 @@ namespace OpenNLP.Tools.Tokenize
/// </summary>
public class TokenContextGenerator : SharpEntropy.IContextGenerator<Tuple<string, int>>
{
/// <summary>
/// Split the string
/// </summary>
public const string SplitIndicator = "T";
/// <summary>
/// Don't split the string
/// </summary>
public const string NoSplitIndicator = "F";

/// <summary>
@@ -53,41 +59,52 @@ public class TokenContextGenerator : SharpEntropy.IContextGenerator<Tuple<string
/// </summary>
public virtual string[] GetContext(Tuple<string, int> pair)
{
string data = pair.Item1;
string token = pair.Item1;
int index = pair.Item2;

var predicates = new List<string>();
predicates.Add("p=" + data.Substring(0, (index) - (0)));
predicates.Add("s=" + data.Substring(index));
if (index > 0)
// add strings before and after the index in the token
var predicates = new List<string>
{
"p=" + token.Substring(0, index),
"s=" + token.Substring(index)
};
if (index > 0)
{
AddCharPredicates("p1", data[index - 1], predicates);
// add predicates for character just before the current index
AddCharPredicates("p1", token[index - 1], predicates);
predicates.Add("p1f1=" + token[index - 1] + token[index]);
if (index > 1)
{
AddCharPredicates("p2", data[index - 2], predicates);
predicates.Add("p21=" + data[index - 2] + data[index - 1]);
// add predicates for the character 2 positions before the current index
AddCharPredicates("p2", token[index - 2], predicates);
predicates.Add("p21=" + token[index - 2] + token[index - 1]);
}
else
{
predicates.Add("p2=bok");
}
predicates.Add("p1f1=" + data[index - 1] + data[index]);
}
else
{
predicates.Add("p1=bok");
}
AddCharPredicates("f1", data[index], predicates);
if (index + 1 < data.Length)

// add predicates for char at the current index
AddCharPredicates("f1", token[index], predicates);

// add predicates for the char just after
if (index + 1 < token.Length)
{
AddCharPredicates("f2", data[index + 1], predicates);
predicates.Add("f12=" + data[index] + data[index + 1]);
AddCharPredicates("f2", token[index + 1], predicates);
predicates.Add("f12=" + token[index] + token[index + 1]);
}
else
{
predicates.Add("f2=bok");
}
if (data[0] == '&' && data[data.Length - 1] == ';')

// test if token starts by '&' or ends by ';'
if (token[0] == '&' && token[token.Length - 1] == ';')
{
predicates.Add("cc"); //character code
}
@@ -103,36 +120,44 @@ private void AddCharPredicates(string key, char c, List<string> predicates)
predicates.Add(key + "=" + c);
if (char.IsLetter(c))
{
// whether it's a letter
predicates.Add(key + "_alpha");
if (char.IsUpper(c))
{
// whether it's upper case
predicates.Add(key + "_caps");
}
}
else if (char.IsDigit(c))
{
// whether it's a digit
predicates.Add(key + "_num");
}
else if (char.IsWhiteSpace(c))
{
// whether it's whitespace
predicates.Add(key + "_ws");
}
else
{
if (c == '.' || c == '?' || c == '!')
{
// whether it's an end of sentence
predicates.Add(key + "_eos");
}
else if (c == '`' || c == '"' || c == '\'')
{
// whether it's a quote
predicates.Add(key + "_quote");
}
else if (c == '[' || c == '{' || c == '(')
{
// whether it's a left parenthesis
predicates.Add(key + "_lp");
}
else if (c == ']' || c == '}' || c == ')')
{
// whether it's a right parenthesis
predicates.Add(key + "_rp");
}
}
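
To make the predicate scheme above concrete, here is a hand-worked call to GetContext for a sample token. The token "can't" and index 3 are chosen for illustration, and the predicate list in the comment is traced by hand from the branches shown above (the method's return statement is not visible in this hunk), so treat it as approximate rather than authoritative.

using System;
using OpenNLP.Tools.Tokenize;

class ContextGeneratorExample
{
    static void Main()
    {
        var generator = new TokenContextGenerator();

        // Context of the character at index 3 (the apostrophe) in "can't".
        string[] predicates = generator.GetContext(Tuple.Create("can't", 3));

        // Following the code above, this should yield roughly:
        //   p=can, s='t            (prefix / suffix around the index)
        //   p1=n, p1_alpha         (character just before the index)
        //   p1f1=n'                (bigram ending at the index)
        //   p2=a, p2_alpha, p21=an (two positions before the index)
        //   f1=', f1_quote         (character at the index)
        //   f2=t, f2_alpha, f12='t (character just after the index)
        foreach (var predicate in predicates)
        {
            Console.WriteLine(predicate);
        }
    }
}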
63 changes: 28 additions & 35 deletions OpenNLP/Tools/Tokenize/TokenEventReader.cs
@@ -45,18 +45,18 @@ namespace OpenNLP.Tools.Tokenize
/// </summary>
public class TokenEventReader : SharpEntropy.ITrainingEventReader
{
private static readonly SharpEntropy.IContextGenerator<Tuple<string, int>> mContextGenerator = new TokenContextGenerator();
private StreamReader mStreamReader;
private List<SharpEntropy.TrainingEvent> mEventList = new List<SharpEntropy.TrainingEvent>();
private int mCurrentEvent = 0;
private static readonly SharpEntropy.IContextGenerator<Tuple<string, int>> ContextGenerator = new TokenContextGenerator();
private readonly StreamReader _streamReader;
private readonly List<SharpEntropy.TrainingEvent> _eventList = new List<SharpEntropy.TrainingEvent>();
private int _currentEvent = 0;

/// <summary>
/// Class constructor.
/// </summary>
public TokenEventReader(StreamReader dataReader)
{
mStreamReader = dataReader;
string nextLine = mStreamReader.ReadLine();
_streamReader = dataReader;
string nextLine = _streamReader.ReadLine();
if (nextLine != null)
{
AddEvents(nextLine);
@@ -65,43 +65,36 @@ public TokenEventReader(StreamReader dataReader)

private void AddEvents(string line)
{
string[] spacedTokens = line.Split(' ');
for (int currentToken = 0; currentToken < spacedTokens.Length; currentToken++)
{
string buffer = spacedTokens[currentToken];
if (MaximumEntropyTokenizer.AlphaNumeric.IsMatch(buffer))
{
int lastIndex = buffer.Length - 1;
for (int index = 0; index < buffer.Length; index++)
{
string[] context = mContextGenerator.GetContext(new Tuple<string, int>(buffer, index));
if (index == lastIndex)
{
mEventList.Add(new SharpEntropy.TrainingEvent("T", context));
}
else
{
mEventList.Add(new SharpEntropy.TrainingEvent("F", context));
}
}
}
}
string[] spacedTokens = line.Split(' ');
foreach (string buffer in spacedTokens)
{
if (MaximumEntropyTokenizer.AlphaNumeric.IsMatch(buffer))
{
int lastIndex = buffer.Length - 1;
for (int index = 0; index < buffer.Length; index++)
{
string[] context = ContextGenerator.GetContext(new Tuple<string, int>(buffer, index));
var trainingEvent = new SharpEntropy.TrainingEvent(index == lastIndex ? "T" : "F", context);
_eventList.Add(trainingEvent);
}
}
}
}

public virtual bool HasNext()
public virtual bool HasNext()
{
return (mCurrentEvent < mEventList.Count);
return (_currentEvent < _eventList.Count);
}

public virtual SharpEntropy.TrainingEvent ReadNextEvent()
{
SharpEntropy.TrainingEvent trainingEvent = mEventList[mCurrentEvent];
mCurrentEvent++;
if (mEventList.Count == mCurrentEvent)
SharpEntropy.TrainingEvent trainingEvent = _eventList[_currentEvent];
_currentEvent++;
if (_eventList.Count == _currentEvent)
{
mCurrentEvent = 0;
mEventList.Clear();
string nextLine = mStreamReader.ReadLine();
_currentEvent = 0;
_eventList.Clear();
string nextLine = _streamReader.ReadLine();
if (nextLine != null)
{
AddEvents(nextLine);
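
A brief sketch of how the event reader above might be consumed. The constructor, HasNext(), and ReadNextEvent() are the members shown in this diff; the training-file path is hypothetical, and the file is assumed to contain whitespace-separated tokens, one line at a time, as AddEvents expects.

using System.IO;
using OpenNLP.Tools.Tokenize;

class EventReaderExample
{
    static void Main()
    {
        // Hypothetical whitespace-separated training file; the path is illustrative only.
        using (var reader = new StreamReader(@"Resources\Training\tokens.train"))
        {
            var eventReader = new TokenEventReader(reader);

            int count = 0;
            while (eventReader.HasNext())
            {
                // A real caller would hand each event to a maximum entropy trainer here.
                SharpEntropy.TrainingEvent trainingEvent = eventReader.ReadNextEvent();
                count++;
            }
            System.Console.WriteLine(count + " training events read");
        }
    }
}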
2 changes: 1 addition & 1 deletion ParseTree/App.config
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<appSettings>
<add key="MaximumEntropyModelDirectory" value="C:\Users\Alex\Documents\GitHub\sharpnlp\OpenNLP\Resources\Models\"/>
<add key="MaximumEntropyModelDirectory" value="C:\Users\Alexandre\Documents\GitHub\sharpnlp\OpenNLP\Resources\Models\"/>
</appSettings>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5"/></startup></configuration>
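
A small sketch of how the setting updated above is typically read at runtime, using the standard System.Configuration API; this call is not part of the diff, it only illustrates how the key reaches the application code (the project needs a reference to System.Configuration).

using System.Configuration;

class ModelPathExample
{
    static void Main()
    {
        // Reads the model directory configured in App.config under the key shown above.
        string modelDirectory = ConfigurationManager.AppSettings["MaximumEntropyModelDirectory"];
        System.Console.WriteLine(modelDirectory);
    }
}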