Skip to content

Commit

Permalink
feat(classification): classify known object schema detections (#85)
Browse files Browse the repository at this point in the history
* refactor: pull out shared classify functionality

* refactor: clean up classify helper methods

* feat: add all data classification patterns

* feat: add identity regexp to db known person objects

* feat: add schema classification for known objects

* feat: check properties for stop words
  • Loading branch information
elsapet committed Oct 28, 2022
1 parent 0b64899 commit 00a3271
Show file tree
Hide file tree
Showing 206 changed files with 1,102 additions and 203 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ require (
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.4.2 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/tangzero/inflector v1.0.0 // indirect
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 // indirect
golang.org/x/term v0.1.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ github.com/struCoder/pidusage v0.2.1 h1:dFiEgUDkubeIj0XA1NpQ6+8LQmKrLi7NiIQl86E6
github.com/struCoder/pidusage v0.2.1/go.mod h1:bewtP2KUA1TBUyza5+/PCpSQ6sc/H6jJbIKAzqW86BA=
github.com/subosito/gotenv v1.4.1 h1:jyEFiXpy21Wm81FBN71l9VoMMV8H8jG+qIK3GCpY6Qs=
github.com/subosito/gotenv v1.4.1/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0=
github.com/tangzero/inflector v1.0.0 h1:933dvPwRUUOAl98hyeeXuzFix3HwDt5j+45lleu8oh0=
github.com/tangzero/inflector v1.0.0/go.mod h1:OknKjAyDPCDzcWt0yOh2I7hqTukEdyyApcX3/KOhuXc=
github.com/weppos/publicsuffix-go v0.12.0/go.mod h1:z3LCPQ38eedDQSwmsSRW4Y7t2L8Ln16JPQ02lHAdn5k=
github.com/weppos/publicsuffix-go v0.20.0 h1:59ypvSUbW3Dunc6zVm+v+MmXf2Q6cGiNDkxgRIzEnaA=
github.com/weppos/publicsuffix-go v0.20.0/go.mod h1:5ZC/Uv3fIEUE0eP6o9+Yg4+5+W8V0/BieMi05feGXVA=
Expand Down
11 changes: 10 additions & 1 deletion pkg/classification/classification.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ type Classifier struct {
config Config

Interfaces *interfaces.Classifier
Schema schema.Classifier
Schema *schema.Classifier
Dependencies *dependencies.Classifier
}

Expand All @@ -36,6 +36,14 @@ func NewClassifier(config *Config) (*Classifier, error) {
return nil, err
}

schemaClassifier := schema.New(
schema.Config{
DataTypes: db.Default().DataTypes,
DataTypeClassificationPatterns: db.Default().DataTypeClassificationPatterns,
KnownPersonObjectPatterns: db.Default().KnownPersonObjectPatterns,
},
)

dependenciesClassifier := dependencies.New(
dependencies.Config{
Recipes: db.Default().Recipes,
Expand All @@ -46,5 +54,6 @@ func NewClassifier(config *Config) (*Classifier, error) {
config: *config,
Dependencies: dependenciesClassifier,
Interfaces: interfacesClassifier,
Schema: schemaClassifier,
}, nil
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 135,
"data_type_uuid": "8358b541-500f-4321-8ee9-89ce61e9459e",
"exclude_regexp": "\\b.*error|status|id.*\\b",
"exclude_types": ["boolean"],
"friendly_name": "Message",
"health_context_data_type_uuid": null,
"include_regexp": "\\b.*message.*\\b",
"include_types": ["string"],
"match_column": false,
"match_object": true,
"object_type": ["known_data_object"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 140,
"data_type_uuid": "8358b541-500f-4321-8ee9-89ce61e9459e",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Comment",
"health_context_data_type_uuid": null,
"include_regexp": "\\b.*comment.*\\b",
"include_types": ["string"],
"match_column": false,
"match_object": true,
"object_type": ["associated", "known_data_object"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 1,
"data_type_uuid": "22e24c62-82d3-4b72-827c-e261533331bd",
"exclude_regexp": "\\b(.*(notification|config(uration)?|template|enabled|token|reminder|subject|body|handover|sent.by|settings?|accept|label|id|voice|type).*)\\b",
"exclude_types": ["boolean", "date", "bool"],
"friendly_name": "Email Address",
"health_context_data_type_uuid": null,
"include_regexp": "\\b.*email.*\\b",
"include_types": ["object", "string"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 4,
"data_type_uuid": "5a80a938-6fb2-4d9a-9c47-8d377e036506",
"exclude_regexp": "\\b(.*email|(zip.*file)|ip|domain|cart|id|mac|submit|viewed|list|dialog|voice.*)\\b",
"exclude_types": ["boolean"],
"friendly_name": "Physical Address",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(city|street|address|region|province)|((billing|shipping).*address)|((zip\\s?postal)|(zip|postal)\\s?code)\\b",
"include_types": ["string", "object"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 59,
"data_type_uuid": "adfd29eb-8abb-41ea-bec6-9c0ffdfd9206",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Links clicked",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(links?|url)\\s?clicked|clicks?\\s?tracking\\b",
"include_types": ["string"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended", "unknown"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"metadata": {
"version": "1.0"
},
"id": 60,
"data_type_uuid": "da48dfad-7322-411b-988d-5bde0d7bc659",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Date of birth",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(dob|bday|date\\s?of\\s?birth)|birth\\s?(date|year|day|month)\\z",
"include_types": ["string", "date"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 62,
"data_type_uuid": "c2f07a15-6cc1-451b-b8b4-7598bc0117e8",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Spoken Languages",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(spoken.*languages?|language)\\z",
"include_types": ["string"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 65,
"data_type_uuid": "e4049ef1-da74-46dc-ac4e-f6b4aa0672ea",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Race",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(race|racial)\\b",
"include_types": ["string"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 83,
"data_type_uuid": "84eb0f89-05f7-47a3-a8fb-67cc40e603c6",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Mac address",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(mac\\s?addr(ess)?)\\b",
"include_types": ["string"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 85,
"data_type_uuid": "380c8cde-ca2e-44ed-82db-2ab1e7c255c7",
"exclude_regexp": null,
"exclude_types": ["boolean", "number"],
"friendly_name": "Firstname",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(first|middle|given)\\s?names?\\b",
"include_types": ["string"],
"match_column": true,
"match_object": false,
"object_type": ["unknown"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 87,
"data_type_uuid": "6574a267-e3bf-4ab5-832e-15b8863206df",
"exclude_regexp": null,
"exclude_types": ["boolean", "number"],
"friendly_name": "Lastname",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(last|sur|family)\\s?name\\b",
"include_types": ["string"],
"match_column": true,
"match_object": false,
"object_type": ["unknown"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 90,
"data_type_uuid": "1617291b-bc22-4267-ad5e-8054b2505958",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Fullname",
"health_context_data_type_uuid": null,
"include_regexp": "\\A(\\S?(full|cardholder|display|person|customer|client|seller|doctor|patient|player|candidate|mentor|captain|winner|author|sender|recipient)\\s?name|name|(card\\s?holder))\\b",
"include_types": ["string"],
"match_column": true,
"match_object": false,
"object_type": ["known", "unknown_extended"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"metadata": { "version": "1.0" },
"id": 92,
"data_type_uuid": "8421f13c-0b9a-422b-9080-55b78e7c07cd",
"exclude_regexp": null,
"exclude_types": ["boolean"],
"friendly_name": "Physical and mental health",
"health_context_data_type_uuid": null,
"include_regexp": "\\b(physical|mental)\\s?health\\b",
"include_types": ["object", "string"],
"match_column": true,
"match_object": false,
"object_type": ["unknown"]
}
29 changes: 21 additions & 8 deletions pkg/classification/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import (
"encoding/json"
"log"
"regexp"
"strings"

"github.com/google/uuid"
"github.com/tangzero/inflector"
)

//go:embed recipes
Expand Down Expand Up @@ -55,7 +57,7 @@ type DataType struct {

type DataTypeClassificationPattern struct {
Id int `json:"id"`
DataTypeUUID uuid.UUID `json:"data_type_uuid,omitempty"`
DataTypeUUID *uuid.UUID `json:"data_type_uuid,omitempty"`
IncludeRegexp string `json:"include_regexp"`
IncludeRegexpMatcher *regexp.Regexp `json:"include_regexp_matcher"`
ExcludeRegexp string `json:"exclude_regexp,omitempty"`
Expand All @@ -71,13 +73,14 @@ type DataTypeClassificationPattern struct {
}

type KnownPersonObjectPattern struct {
Id int `json:"id"`
IncludeRegexp string `json:"include_regexp"`
IncludeRegexpMatcher *regexp.Regexp `json:"include_regexp_matcher"`
ExcludeRegexp string `json:"exclude_regexp,omitempty"`
ExcludeRegexpMatcher *regexp.Regexp `json:"exclude_regexp_matcher"`
Category string `json:"category"`
ActAsIdentifier bool `json:"act_as_identifier"`
Id int `json:"id"`
IncludeRegexp string `json:"include_regexp"`
IncludeRegexpMatcher *regexp.Regexp `json:"include_regexp_matcher"`
ExcludeRegexp string `json:"exclude_regexp,omitempty"`
ExcludeRegexpMatcher *regexp.Regexp `json:"exclude_regexp_matcher"`
Category string `json:"category"`
ActAsIdentifier bool `json:"act_as_identifier"`
IdentifierRegexpMatcher *regexp.Regexp `json:"identifier_regexp_matcher"`
}

func Default() DefaultDB {
Expand Down Expand Up @@ -224,6 +227,16 @@ func defaultKnownPersonObjectPatterns() []KnownPersonObjectPattern {
handleError(err)
}
}
if knownPersonObjectPattern.ActAsIdentifier {
category := strings.ToLower(knownPersonObjectPattern.Category)
pluralCategory := inflector.Pluralize(category)

knownPersonObjectPattern.IdentifierRegexpMatcher, err = regexp.Compile("(?i)^[\\S]*(" + category + "|" + pluralCategory + ")\\s?(uu)?id")

if err != nil {
handleError(err)
}
}

knownPersonObjectPatterns = append(knownPersonObjectPatterns, knownPersonObjectPattern)
}
Expand Down
30 changes: 13 additions & 17 deletions pkg/classification/interfaces/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/bearer/curio/pkg/classification/db"
"github.com/bearer/curio/pkg/report/detections"
"github.com/bearer/curio/pkg/report/interfaces"
"github.com/bearer/curio/pkg/util/classify"
"github.com/bearer/curio/pkg/util/url"
)

Expand All @@ -16,16 +17,11 @@ type ClassifiedInterface struct {
Classification *Classification `json:"classification"`
}

type ClassificationDecision struct {
State url.ValidationState `json:"state"`
Reason string `json:"reason"`
}

type Classification struct {
URL string `json:"url"`
RecipeMatch bool `json:"recipe_match"`
RecipeName string `json:"recipe_name,omitempty"`
Decision ClassificationDecision `json:"decision"`
URL string `json:"url"`
RecipeMatch bool `json:"recipe_match"`
RecipeName string `json:"recipe_name,omitempty"`
Decision classify.ClassificationDecision `json:"decision"`
}

type Classifier struct {
Expand Down Expand Up @@ -127,12 +123,12 @@ func (classifier *Classifier) Classify(data detections.Detection) (*ClassifiedIn
if err != nil {
return nil, err
}
if formatValidityCheck.State == url.Invalid {
if formatValidityCheck.State == classify.Invalid {
return &ClassifiedInterface{
Detection: &data,
Classification: &Classification{
URL: value,
Decision: ClassificationDecision{
Decision: classify.ClassificationDecision{
State: formatValidityCheck.State,
Reason: formatValidityCheck.Reason,
},
Expand All @@ -159,7 +155,7 @@ func (classifier *Classifier) Classify(data detections.Detection) (*ClassifiedIn
Detection: &data,
Classification: &Classification{
URL: value,
Decision: ClassificationDecision{
Decision: classify.ClassificationDecision{
State: internalValidityCheck.State,
Reason: internalValidityCheck.Reason,
},
Expand All @@ -182,15 +178,15 @@ func (classifier *Classifier) Classify(data detections.Detection) (*ClassifiedIn
},
}
if strings.Contains(recipeMatch.DetectionURLPart, "*") {
classifiedInterface.Classification.Decision = ClassificationDecision{
State: url.Potential,
classifiedInterface.Classification.Decision = classify.ClassificationDecision{
State: classify.Potential,
Reason: "recipe_match_with_wildcard",
}
return classifiedInterface, nil
}

classifiedInterface.Classification.Decision = ClassificationDecision{
State: url.Valid,
classifiedInterface.Classification.Decision = classify.ClassificationDecision{
State: classify.Valid,
Reason: "recipe_match",
}
return classifiedInterface, nil
Expand All @@ -206,7 +202,7 @@ func (classifier *Classifier) Classify(data detections.Detection) (*ClassifiedIn
Detection: &data,
Classification: &Classification{
URL: value,
Decision: ClassificationDecision{
Decision: classify.ClassificationDecision{
State: validityCheck.State,
Reason: validityCheck.Reason,
},
Expand Down
Loading

0 comments on commit 00a3271

Please sign in to comment.