-
Notifications
You must be signed in to change notification settings - Fork 4
/
session.go
150 lines (135 loc) · 4.71 KB
/
session.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
package session
import (
"compress/gzip"
"context"
"fmt"
"io"
"net/http"
"net/http/cookiejar"
"net/url"
"time"
"github.com/DaRealFreak/epub-scraper/pkg/config"
"github.com/DaRealFreak/epub-scraper/pkg/raven"
"github.com/PuerkitoBio/goquery"
log "github.com/sirupsen/logrus"
"golang.org/x/time/rate"
)
// Session is the interface for the implemented HTTP client
type Session interface {
// Get sends a GET request to the passed URI and returns the response or the occurred error
Get(uri string) (response *http.Response, err error)
// Post sends a POST request with the passed form values to the passed URI
// and returns the response or the occurred error
Post(uri string, data url.Values) (response *http.Response, err error)
// GetDocument converts the passed HTTP response to a *goquery.Document
GetDocument(response *http.Response) *goquery.Document
// ApplyRateLimit waits until the rate limit permits the next request
ApplyRateLimit()
}
// session implements the Session interface for HTTP sessions on top of net/http
type session struct {
// Client is the HTTP client used to send the GET and POST requests
Client *http.Client
// RateLimiter throttles the requests, a nil limiter disables the rate limiting (see ApplyRateLimit)
RateLimiter *rate.Limiter
// MaxRetries is the maximum number of attempts made by Get and Post for a single request
MaxRetries int
// ctx is the context passed to the rate limiter while waiting
ctx context.Context
}
// UseWaybackMachineError is a custom error for getting redirected on a URL configured to use the wayback machine
type UseWaybackMachineError struct {
// error is the embedded underlying error, it provides the Error method of this type
error
// Handling is the wayback machine configuration, its Version is used to build the archive URL
Handling *config.WaybackMachine
// URL is the original URL which is requested through the wayback machine instead
URL *url.URL
}
// NewSession creates the HTTP session used for the passed novel configuration.
// The underlying session uses an HTTP client with a cookie jar, a rate limiter permitting
// one request every 1.5 seconds (burst size 1) and up to 5 attempts per request.
// The session is returned wrapped in a WaybackMachineWrapper which also receives the novel configuration.
func NewSession(novelConfig *config.NovelConfig) Session {
	// the error result of cookiejar.New is ignored
	cookies, _ := cookiejar.New(nil)
	limiter := rate.NewLimiter(rate.Every(1500*time.Millisecond), 1)

	wrapper := &WaybackMachineWrapper{
		cfg: novelConfig,
		session: session{
			ctx:         context.Background(),
			MaxRetries:  5,
			RateLimiter: limiter,
			Client:      &http.Client{Jar: cookies},
		},
	}

	return wrapper
}
// Get sends a GET request to the passed URI and retries failed requests up to MaxRetries times.
// An attempt counts as failed if an error occurred or the status code is a client/server error (>= 400).
// Failed attempts are first passed to handleWaybackMachineError, if it handled the request
// its result is returned directly.
// If every attempt failed the response and error of the last attempt are returned,
// the body of that last response is left open for the caller.
// If MaxRetries is less than 1 no request is sent and both results are nil.
func (s *session) Get(uri string) (response *http.Response, err error) {
	for try := 1; try <= s.MaxRetries; try++ {
		s.ApplyRateLimit()
		log.Debugf("opening GET uri \"%s\" (try: %d)", uri, try)

		response, err = s.Client.Get(uri)
		if err == nil && response.StatusCode < 400 {
			// no error occurred and the status code is okay too
			// 4xx & 5xx are client/server error codes, so we check for < 400
			return response, nil
		}

		if waybackResponse, done, waybackErr := s.handleWaybackMachineError(response, err); done {
			return waybackResponse, waybackErr
		}

		if try == s.MaxRetries {
			// the result of the last attempt is handed to the caller,
			// so the body stays open and there is nothing left to wait for
			break
		}

		// the failed response is dropped on the next attempt, close its body to not leak the connection
		if response != nil && response.Body != nil {
			// best effort: the response is discarded anyway, so a close error is of no use here
			_ = response.Body.Close()
		}

		// wait a bit longer after every failed attempt before retrying
		time.Sleep(time.Duration(try+1) * time.Second)
	}

	return response, err
}
// handleWaybackMachineError checks if the failed request indicates that the wayback machine should be used.
// This is the case if a response is available and the error is a *url.Error wrapping a *UseWaybackMachineError.
// The URL is then requested through web.archive.org using the configured version and the request URL
// of the received response is replaced with the original URL to keep host settings.
//
// The returned bool reports whether the request got handled. It is only true if the wayback machine
// request returned a response, in every other case the result is (nil, false, nil).
func (s *session) handleWaybackMachineError(response *http.Response, err error) (*http.Response, bool, error) {
	// only failed requests which still returned a response are candidates
	if response == nil || err == nil {
		return nil, false, nil
	}

	urlErr, isURLErr := err.(*url.Error)
	if !isURLErr {
		return nil, false, nil
	}

	waybackErr, isWaybackErr := urlErr.Err.(*UseWaybackMachineError)
	if !isWaybackErr {
		return nil, false, nil
	}

	archiveURL := fmt.Sprintf("https://web.archive.org/web/%s/%s", waybackErr.Handling.Version, waybackErr.URL.String())

	archived, archiveErr := s.Get(archiveURL)
	if archived == nil {
		// without a response the request is reported as not handled
		return nil, false, nil
	}

	archived.Request.URL = waybackErr.URL

	return archived, true, archiveErr
}
// Post sends the passed values as form data in a POST request to the passed URI
// and retries failed requests up to MaxRetries times.
// An attempt counts as failed if an error occurred or the status code is a client/server error (>= 400).
// If every attempt failed the response and error of the last attempt are returned,
// the body of that last response is left open for the caller.
// If MaxRetries is less than 1 no request is sent and both results are nil.
func (s *session) Post(uri string, data url.Values) (response *http.Response, err error) {
	for try := 1; try <= s.MaxRetries; try++ {
		s.ApplyRateLimit()
		log.Debugf("opening POST uri \"%s\" (try: %d)", uri, try)

		response, err = s.Client.PostForm(uri, data)
		if err == nil && response.StatusCode < 400 {
			// no error occurred and the status code is okay too
			// 4xx & 5xx are client/server error codes, so we check for < 400
			return response, nil
		}

		if try == s.MaxRetries {
			// the result of the last attempt is handed to the caller,
			// so the body stays open and there is nothing left to wait for
			break
		}

		// the failed response is dropped on the next attempt, close its body to not leak the connection
		if response != nil && response.Body != nil {
			// best effort: the response is discarded anyway, so a close error is of no use here
			_ = response.Body.Close()
		}

		// wait a bit longer after every failed attempt before retrying
		time.Sleep(time.Duration(try+1) * time.Second)
	}

	return response, err
}
// GetDocument converts the http response to a *goquery.Document.
// Responses with the Content-Encoding header "gzip" are decompressed before parsing.
// The response body is always closed before returning.
// Errors are passed to raven.CheckError, nil is returned if the gzip stream could not be opened.
func (s *session) GetDocument(response *http.Response) *goquery.Document {
	// closing a gzip reader does not close the underlying reader, so the body is always closed itself
	defer raven.CheckClosure(response.Body)

	var reader io.Reader = response.Body

	if response.Header.Get("Content-Encoding") == "gzip" {
		gzipReader, err := gzip.NewReader(response.Body)
		if err != nil {
			// NOTE(review): whether raven.CheckError returns on an error is not visible from here,
			// in case it does we must not continue with the nil gzip reader
			raven.CheckError(err)
			return nil
		}
		// deferred calls run in reverse order, so the gzip reader is closed before the body
		defer raven.CheckClosure(gzipReader)

		reader = gzipReader
	}

	document, err := goquery.NewDocumentFromReader(reader)
	raven.CheckError(err)

	return document
}
// ApplyRateLimit blocks until the rate limiter permits the next request.
// Sessions without a rate limiter return immediately.
// An error returned while waiting is passed to raven.CheckError.
func (s *session) ApplyRateLimit() {
	limiter := s.RateLimiter
	if limiter == nil {
		// nothing to wait for without a rate limiter
		return
	}

	// wait for the request to stay within the rate limit
	raven.CheckError(limiter.Wait(s.ctx))
}