Contains XPath, Regexp, and Tag (XPath) support, making it easy to extract structured data.

一个非常容易提取html结构数据的包, 可以利用golang tag减少代码量, 而且做了非常多的适配.

支持自定义函数.

exp 标识表达式 exp:"//div" xpath表达式
method(mth) 方法名 Text 相当于执行exp后的结果调用Node.Text() AttrValue,class 相当于调用 AttributeValue("class")
index 如果变量为非Slice, 则会从执行Method后得到的所有值中按索引选择一个
mindex 自定义函数返回多值的时候, 需要选择一个索引值返回. 会调用这个tag

Example

eg:

// tagObject1 shows the long-form tag keys: exp and method.
type tagObject1 struct {
	// exp is the XPath expression. method names the method called on the XPath
	// result (an htmlquery node); the text after the comma ("class") is passed
	// as its argument, i.e. the attribute value of "class" is stored.
	Color string `exp:".//div[2]" method:"AttributeValue,class"`
	Herf  string `exp:".//div[2]/a" method:"AttributeValue,href"`
}

// tagObject2 shows the short tag key mth, which is an alias of method.
type tagObject2 struct {
	Color string `exp:"self::div" mth:"AttrValue,class"` // mth is equivalent to method
	Herf  string `exp:".//a" mth:"AttributeValue,href"`
}

// tagObject3 shows fields that carry only an exp tag and no method tag.
type tagObject3 struct {
	Color string `exp:"self::div/@class"` // no method given: defaults to Text, same as mth:"Text"
	Herf  string `exp:".//a"`
}

// TestTag1 exercises the three tag flavours declared above against one small
// HTML document: the long-form method tag (tagObject1), the mth alias applied
// to every node of an XPath result (tagObject2), and the default Text method
// (tagObject3).
func TestTag1(t *testing.T) {
	const page = `<html>
		<head></head>
		<body>
			<div class="red">
				<a href="https://www.baidu.com"></a>
			</div>
			<div class="blue">
				<a href="https://www.google.com"></a>
			</div>

			<div class="black"> 
				<span>
					good你好
				</span>
			</div>
		</body>
	</html>`
	etor := extractor.ExtractHtmlString(page)

	// Whole-document extraction: div[2] is the "blue" block.
	first := &tagObject1{}
	etor.GetObjectByTag(first)
	if first.Color != "blue" {
		t.Error(first)
	}
	if first.Herf != "https://www.google.com" {
		t.Error(first)
	}

	// Per-node extraction: every <div> that directly contains an <a>.
	parents, err := etor.XPaths("//div/a/..")
	if err == nil {
		items := parents.ForEachTag(tagObject2{})
		wants := []tagObject2{
			{Color: "red", Herf: "https://www.baidu.com"},
			{Color: "blue", Herf: "https://www.google.com"},
		}
		for i, want := range wants {
			got := items[i].(*tagObject2)
			if got.Herf != want.Herf || got.Color != want.Color {
				t.Error(got)
			}
		}
	} else {
		t.Error(err)
	}

	// Same node set again, this time relying on the default Text method.
	parents, err = etor.XPaths("//div/a/..")
	if err == nil {
		const wantDump = `[<*>{<class>red</class> <a href="https://www.baidu.com"></a>} <*>{<class>blue</class> <a href="https://www.google.com"></a>}]`
		if dump := spew.Sprint(parents.ForEachTag(tagObject3{})); dump != wantDump {
			t.Error(dump)
		}
	} else {
		t.Error(err)
	}
}

// tagObject4 shows one expression/method pair being stored into fields of
// different Go types.
type tagObject4 struct {
	Num    int     `exp:"//div[@num]" mth:"AttrValue,num" ` // the value's type is determined automatically
	Num321 float64 `exp:"//div[@num]" mth:"AttrValue,num" index:"1"` // index picks one value out of all matches (TestType expects "1" to yield 321, the second div)
	Numstr string  `exp:"//div[@num]" mth:"AttrValue,num"`
	Nums   []int32 `exp:"//div[@num]" mth:"AttrValue,num"`
}

// TestType checks that values pulled from the num attribute are stored into
// int, float64, string and []int32 fields of tagObject4.
func TestType(t *testing.T) {
	// Shared prefix for every failure reported below.
	const failMsg = "tag parse errror"

	const page = `<html>
	<head></head>
	<body>
		<div class="red" num="123">
			<a href="https://www.baidu.com"></a>
		</div>
		<div class="blue" num="321">
			<a href="https://www.google.com"></a>
		</div>

		<div class="black" num="456"> 
			<span>
				good你好
			</span>
		</div>
	</body>
</html>`
	etor := extractor.ExtractHtmlString(page)

	parsed := &tagObject4{}
	etor.GetObjectByTag(parsed)

	if parsed.Num == 0 {
		t.Error(failMsg, parsed.Num)
	}
	if len(parsed.Nums) != 3 {
		t.Error(failMsg, parsed.Nums)
	}
	if parsed.Numstr != "123" {
		t.Error(failMsg, parsed.Numstr)
	}
	if parsed.Num321 != 321 {
		t.Error(failMsg, parsed.Num321)
	}
}

eg:

// TestHtml demonstrates the lower-level XPath helpers on a document read from
// disk: querying a node set, collecting an attribute from every result, and
// running a relative XPath on every result.
func TestHtml(t *testing.T) {
	f, err := os.Open("./testfile/test1.html")
	if err != nil {
		// Nothing below can work without the file; stop here rather than
		// handing a nil *os.File to the extractor.
		t.Fatal(err)
	}
	defer f.Close()

	etor := extractor.ExtractHtmlReader(f)
	xp, err := etor.XPaths("//*[contains(@class, 'c-header__modal__content__login')]")
	if err != nil {
		// xp is not usable after a failed query.
		t.Fatal(err)
	}

	xp.GetAttributes("class") // get the class attribute of every XPath result
	xp.ForEachText(".//dt")   // run XPath .//dt on every result and collect all the text

	// A single result, similar to html.Node. Some of the API was changed in the
	// author's fork of antchfx/htmlquery:
	// https://github.com/antchfx/htmlquery/blob/feature/esonapi/xnode.go
	etor.XPath("{xpath}")
}

eg:

// enumbertag shows a custom function (mth:"r:ExtractNumber") applied to the
// num attribute of every matching div.
type enumbertag struct {
	// index selects one matched value; mindex selects one of the values returned
	// by the custom function (TestExtractNumber expects 20000, i.e. the "20k"
	// in the second div).
	Num  int   `exp:"//div[@num]/@num" mth:"r:ExtractNumber" index:"1" mindex:"1"`
	Nums []int `exp:"//div[@num]/@num" mth:"r:ExtractNumber"` // custom function; the original note says Register("ParseNumber", ParseNumber), see utils.go. NOTE(review): the tag names ExtractNumber, not ParseNumber — confirm which name is registered
}

// TestExtractNumber checks the custom ExtractNumber method on attribute values
// that mix text, thousands separators and "k" suffixes.
func TestExtractNumber(t *testing.T) {
	const page = `<html>
	<head></head>
	<body>
		<div class="red" num="3.3k">
			<a href="https://www.baidu.com"></a>
		</div>
		<div class="blue" num="hello 3,003k 20k">
			<a href="https://www.google.com"></a>
		</div>

		<div class="black" num="sd 456"> 
			<span>
				good你好
			</span>
		</div>
	</body>
</html>`
	etor := ExtractHtmlString(page)

	parsed := &enumbertag{}
	etor.GetObjectByTag(parsed)

	if parsed.Num != 20000 {
		t.Error(parsed.Num)
	}

	// Format once and reuse the string for both the comparison and the report.
	if got := fmt.Sprintf("%#v", parsed.Nums); got != "[]int{3300, 3003000, 456}" {
		t.Error(got)
	}
}

Name		Name	Last commit message	Last commit date
Latest commit History 105 Commits
htmlquery		htmlquery
testfile		testfile
go.mod		go.mod
go.sum		go.sum
hmtl_extractor_test.go		hmtl_extractor_test.go
html_extractor.go		html_extractor.go
json_extractor.go		json_extractor.go
readme.md		readme.md
utils.go		utils.go
utils_test.go		utils_test.go

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

Contain XPath Regexp Tag(XPath), is easy to Extractor the data format

一个非常容易提取html结构数据的包可以利用golang tag 减少代码量. 而且做了非常的适配.

支持自定义函数.

Example

About

Releases

Packages

Languages

474420502/extractor

Folders and files

Latest commit

History

Repository files navigation

Contain XPath Regexp Tag(XPath), is easy to Extractor the data format

一个非常容易提取html结构数据的包 可以利用golang tag 减少代码量. 而且做了非常的适配.

支持自定义函数.

Example

About

Topics

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

一个非常容易提取html结构数据的包可以利用golang tag 减少代码量. 而且做了非常的适配.

Packages