In [1]:
from src import strtree

# One-class classification

In [2]:
# Training strings for the one-class example: five Samsung model names
# followed by six Nokia model names. The order must stay aligned with the
# `labels` list defined in the next cell (one label per string).
strings = [
    # Samsung models
    'Samsung X-500',
    'Samsung SM-10',
    'Samsung X-1100',
    'Samsung F-10',
    'Samsung X-2200',
    # Nokia models
    'AB Nokia 1',
    'DG Nokia 2',
    'THGF Nokia 3',
    'SFSD Nokia 4',
    'Nokia XG',
    'Nokia YO',
]

In [3]:
# One label per entry of `strings`, in the same order.
# 1 marks a positive example, 0 a negative one (7 positives out of 11).
labels = [
    1, 0, 1, 0, 1,       # the five Samsung strings
    1, 1, 1, 1, 0, 0,    # the six Nokia strings
]

In [4]:
# Build the pattern tree from the labelled strings.
# verbose=True prints the search log that follows this cell.
# NOTE(review): the exact semantics of min_precision / min_token_length are
# defined in src.strtree, not here -- confirm there before tuning them.
tree = strtree.StringTree()
tree.build(
    strings,
    labels,
    min_precision=0.9,
    min_token_length=1,
    verbose=True,
)

Total: 11 strings with 7 positive labels

Start processing another 11 of strings with 7 positive labels.
Current pattern="". N matches: 11, Precision=0.6363636363636364, Recall=1.0
Best pattern=".+\d$". N matches: 9, Precision=0.7777777777777778, Recall=1.0
Best pattern=".+a.+\d$". N matches: 9, Precision=0.7777777777777778, Recall=1.0
Best pattern=".+ .+a.+\d$". N matches: 4, Precision=1.0, Recall=0.5714285714285714
Last pattern was saved

Start processing another 7 of strings with 3 positive labels.
Current pattern="". N matches: 7, Precision=0.42857142857142855, Recall=1.0
Best pattern=".+0.+". N matches: 3, Precision=1.0, Recall=1.0
Last pattern was saved

Finished


All found patterns:

In [5]:
# Each PatternNode repr shows the saved pattern together with n_strings,
# n_matches, precision and recall, matching the figures in the build log.
tree.leaves

[PatternNode(".+ .+a.+\d$", right=None, left=PatternNode(.+0.+), n_strings=11, n_matches=4, precision=1.0, recall=0.5714285714285714),
 PatternNode(".+0.+", right=None, left=None, n_strings=7, n_matches=3, precision=1.0, recall=1.0)]

Filter out strings not matching the tree:

In [6]:
# Returns the subset of the input list that matches the tree; here only
# 'Nokia A-100' is kept and the second string is dropped.
tree.filter(['Nokia A-100', 'String Outside Of Dataset'])

['Nokia A-100']

Get the matching flags for each string:

In [7]:
# One flag per input string, in input order: 1 = matched, 0 = no match.
tree.match(['Nokia A-100', 'String Outside Of Dataset'])

[1, 0]

Get the precision score for given strings and labels:

In [8]:
# Scored on the very strings/labels the tree was built from, so this is a
# training-set figure, not a held-out estimate.
tree.precision_score(strings, labels)

1.0

Get the recall score for given strings and labels:

In [9]:
# Scored on the very strings/labels the tree was built from, so this is a
# training-set figure, not a held-out estimate.
tree.recall_score(strings, labels)

1.0

Predict labels for given strings:

In [10]:
# A string that matches no pattern gets None instead of a label
# (second element of the output below).
tree.predict_label(['Nokia A-100', 'String Outside Of Dataset'])

[1, None]

Find all pattern nodes (each holding a regular expression) for a given label:

In [12]:
# The result is an array of PatternNode objects (dtype=object), not raw
# regex strings; both saved patterns belong to label 1.
tree.get_nodes_by_label(1)

array([PatternNode(".+ .+a.+\d$", right=None, left=PatternNode(.+0.+), n_strings=11, n_matches=4, precision=1.0, recall=0.5714285714285714),
       PatternNode(".+0.+", right=None, left=None, n_strings=7, n_matches=3, precision=1.0, recall=1.0)],
      dtype=object)

# Multi-class classification

In [13]:
# Multi-class example data: three words per class, grouped by first letter.
# NOTE: this rebinds `strings` and `labels` from the one-class section above,
# so every later cell sees the multi-class data.
strings = [
    'Admiral', 'Apple', 'Age',        # class 0
    'Bee', 'Bubble', 'Butter',        # class 1
    'Color', 'Climate', 'CPU',        # class 2
]

# One label per string, in the same order: three 0s, three 1s, three 2s.
labels = [0] * 3 + [1] * 3 + [2] * 3

In [14]:
# Rebinds `tree`: the one-class tree built earlier in the notebook is replaced
# by a new tree fitted on the three-class data from the previous cell.
# verbose=True prints the search log that follows this cell.
tree = strtree.StringTree()
tree.build(
    strings,
    labels,
    min_precision=0.9,
    min_token_length=1,
    verbose=True,
)

Total: 9 strings with 9 positive labels

Start processing another 9 of strings with 3 classes.
Current pattern="". N matches: 9, Precision=[0.3333333333333333, 0.3333333333333333, 0.3333333333333333], Recall=[1.0, 1.0, 1.0]
Best pattern="^A.+". N matches: 3, Precision=[1.0, 0.0, 0.0], Recall=[1.0, 0.0, 0.0]
Last pattern was saved

Start processing another 6 of strings with 3 classes.
Current pattern="". N matches: 6, Precision=[0.0, 0.5, 0.5], Recall=[0.0, 1.0, 1.0]
Best pattern="^B.+". N matches: 3, Precision=[0.0, 1.0, 0.0], Recall=[0.0, 1.0, 0.0]
Last pattern was saved

Start processing another 3 of strings with 3 classes.
Current pattern="". N matches: 3, Precision=[0.0, 0.0, 1.0], Recall=[0.0, 0.0, 1.0]
Best pattern="^C.+". N matches: 3, Precision=[0.0, 0.0, 1.0], Recall=[0.0, 0.0, 1.0]
Last pattern was saved

Finished


All found patterns:

In [15]:
# Three saved patterns, one per class; in the multi-class case precision and
# recall are shown as per-class lists instead of single numbers.
tree.leaves

[PatternNode("^A.+", right=None, left=PatternNode(^B.+), n_strings=9, n_matches=3, precision=[1.0, 0.0, 0.0], recall=[1.0, 0.0, 0.0]),
 PatternNode("^B.+", right=None, left=PatternNode(^C.+), n_strings=6, n_matches=3, precision=[0.0, 1.0, 0.0], recall=[0.0, 1.0, 0.0]),
 PatternNode("^C.+", right=None, left=None, n_strings=3, n_matches=3, precision=[0.0, 0.0, 1.0], recall=[0.0, 0.0, 1.0])]

Filter out strings not matching the tree:

In [16]:
# Returns the subset of the input list that matches the tree; here only
# 'Ananas' is kept and 'Zeta' is dropped.
tree.filter(['Ananas', 'Zeta'])

['Ananas']

Get the matching flags for each string (with nodes where a match was found):

In [17]:
# With return_nodes=True the result is a (flags, nodes) pair: nodes holds the
# matching PatternNode for each string, or None where nothing matched.
tree.match(['Ananas', 'Zeta'], return_nodes=True)

([1, 0],
 [PatternNode("^A.+", right=None, left=PatternNode(^B.+), n_strings=9, n_matches=3, precision=[1.0, 0.0, 0.0], recall=[1.0, 0.0, 0.0]),
  None])

Predict labels for given strings:

In [18]:
# With return_nodes=True the result is a (labels, nodes) pair, so each
# predicted label comes with the PatternNode that produced it.
tree.predict_label(['Ananas'], return_nodes=True)

([0],
 [PatternNode("^A.+", right=None, left=PatternNode(^B.+), n_strings=9, n_matches=3, precision=[1.0, 0.0, 0.0], recall=[1.0, 0.0, 0.0])])

Find all pattern nodes (each holding a regular expression) for a given label:

In [19]:
# The result is an array of PatternNode objects (dtype=object), not raw
# regex strings; only the "^A.+" node belongs to label 0.
tree.get_nodes_by_label(0)

array([PatternNode("^A.+", right=None, left=PatternNode(^B.+), n_strings=9, n_matches=3, precision=[1.0, 0.0, 0.0], recall=[1.0, 0.0, 0.0])],
      dtype=object)