Skip to content
Permalink
Browse files

Initial commit with periodic scrape, instant scrape, zip serving

  • Loading branch information...
Depado committed Apr 9, 2018
1 parent ad13d50 commit f0a8d774f8e493d87dad63af5af90db2c7aabc62
Showing with 388 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. +21 −0 Makefile
  3. +77 −0 cmd/root.go
  4. +19 −0 cmd/scrape.go
  5. +33 −0 cmd/start.go
  6. +21 −0 cmd/version.go
  7. +13 −0 main.go
  8. +96 −0 models/newspaper.go
  9. +41 −0 router/router.go
  10. +64 −0 router/utils.go
@@ -12,3 +12,6 @@

# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/
out/
conf.yml
launeparser
@@ -0,0 +1,21 @@
# None of these targets produce a file named after themselves, so all of them
# are declared phony. Without this, a file or directory called "docker" would
# make `make docker` a no-op.
.PHONY: all docker clean

# Build with cgo disabled, targeting linux/amd64.
export CGO_ENABLED=0
export GOOS=linux
export GOARCH=amd64

BINARY=myprogram
VERSION=0.1.0
# BUILD is the hash of the currently checked-out commit.
BUILD=$(shell git rev-parse HEAD)
# Inject the version and build hash into the main package's Version and Build
# variables at link time.
LDFLAGS=-ldflags "-X main.Version=$(VERSION) -X main.Build=$(BUILD)"

all:
	go build -o $(BINARY) $(LDFLAGS)

docker:
	docker build -t "your-docker-repo/$(BINARY):$(VERSION)" \
		--build-arg build=$(BUILD) --build-arg version=$(VERSION) \
		-f Dockerfile .

# The leading dash ignores the error when the binary does not exist.
clean:
	-rm $(BINARY)
@@ -0,0 +1,77 @@
package cmd

import (
"github.com/Depado/launeparser/models"
"github.com/onrik/logrus/filename"
"github.com/sirupsen/logrus"

"github.com/spf13/cobra"
"github.com/spf13/viper"
)

var (
	// NP holds the newspapers to scrape. It is filled from the configuration
	// by initialize.
	NP *models.NewsPapers

	// rootCmd is the top-level command every sub-command is attached to.
	rootCmd = &cobra.Command{
		Use:   "launeparser",
		Short: "Launeparser scrapes newspapers",
	}
)

// Execute records the build information, registers the sub-commands on the
// root command and runs it. Any error returned by the root command is logged
// at fatal level.
//
// Mind the parameter order: b is stored in Build and v in Version.
func Execute(b, v string) {
	Build, Version = b, v
	rootCmd.AddCommand(version, startCmd, scrapeCmd)

	err := rootCmd.Execute()
	if err == nil {
		return
	}
	logrus.WithError(err).Fatal()
}

// init registers the configuration hook and the global (persistent) flags,
// then binds those flags to viper so they can be read with viper.Get*.
func init() {
	cobra.OnInitialize(initialize)

	// Global flags
	flags := rootCmd.PersistentFlags()
	flags.String("log.level", "info", "one of debug, info, warn, error or fatal")
	flags.String("log.format", "text", "one of text or json")
	flags.Bool("log.line", false, "enable filename and line in logs")
	flags.String("output", "out", "output directory")

	// Flag binding: a failure here would leave the flags silently unusable
	// through viper, so it is treated as fatal instead of being ignored.
	if err := viper.BindPFlags(flags); err != nil {
		logrus.WithError(err).Fatal("Couldn't bind global flags")
	}
}

// initialize loads the configuration (environment variables and the "conf"
// file looked up in "." and "/config/"), unmarshals it into NP and configures
// logrus (level, formatter and optional filename hook). It is registered with
// cobra.OnInitialize. A missing or unreadable configuration file, or a
// configuration that cannot be unmarshalled, is fatal.
func initialize() {
	// Environment variables
	viper.AutomaticEnv()

	// Configuration file
	viper.SetConfigName("conf")
	viper.AddConfigPath(".")
	viper.AddConfigPath("/config/")
	if err := viper.ReadInConfig(); err != nil {
		// Report the underlying error: the file may exist but be unreadable
		// or malformed, which is not the same as "not found".
		logrus.WithError(err).Fatal("Couldn't read configuration file")
	}

	if err := viper.Unmarshal(&NP); err != nil {
		logrus.WithError(err).Fatal("Couldn't unmarshal newspaper")
	}

	// Log level
	lvl := viper.GetString("log.level")
	if l, err := logrus.ParseLevel(lvl); err != nil {
		logrus.WithField("level", lvl).Warn("Invalid log level, fallback to 'info'")
		// Apply the fallback explicitly so the warning above is always true.
		logrus.SetLevel(logrus.InfoLevel)
	} else {
		logrus.SetLevel(l)
	}

	// Log format: anything other than "json" uses the text formatter.
	switch viper.GetString("log.format") {
	case "json":
		logrus.SetFormatter(&logrus.JSONFormatter{})
	default:
		logrus.SetFormatter(&logrus.TextFormatter{})
	}

	if viper.GetBool("log.line") {
		logrus.AddHook(filename.NewHook())
	}
}
@@ -0,0 +1,19 @@
package cmd

import (
"time"

"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)

// scrapeCmd represents the scrape command: it scrapes every configured
// newspaper once and logs how long the run took.
var scrapeCmd = &cobra.Command{
	Use:   "scrape",
	Short: "Instantly scrape",
	Run: func(_ *cobra.Command, _ []string) {
		start := time.Now()
		NP.Scrape()
		elapsed := time.Since(start)
		logrus.WithField("took", elapsed).Info("Done")
	},
}
@@ -0,0 +1,33 @@
package cmd

import (
"github.com/Depado/launeparser/router"
"github.com/jasonlvhit/gocron"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)

// startCmd represents the start command: it launches the scrape scheduler in
// the background and then runs the HTTP server in the foreground.
var startCmd = &cobra.Command{
	Use:   "start",
	Short: "Start the server and scraping",
	Run: func(_ *cobra.Command, _ []string) {
		go runScheduler()
		router.Run()
	},
}

// runScheduler registers NP.Scrape as a daily job at 11:00 and at 23:00, logs
// the next run reported by gocron and then blocks receiving from the channel
// returned by gocron.Start.
func runScheduler() {
	for _, at := range []string{"11:00", "23:00"} {
		gocron.Every(1).Day().At(at).Do(NP.Scrape)
	}
	_, next := gocron.NextRun()
	logrus.WithField("next", next).Info("Registered tasks")
	<-gocron.Start()
}

// init declares the flags specific to the start command and binds them to
// viper so the server settings can be read with viper.Get*.
func init() {
	flags := startCmd.Flags()
	flags.String("server.host", "127.0.0.1", "host on which the server should listen")
	flags.Int("server.port", 8080, "port on which the server should listen")
	flags.Bool("server.debug", false, "debug mode for the server")

	// A binding failure would leave the server flags silently unusable
	// through viper, so it is treated as fatal instead of being ignored.
	if err := viper.BindPFlags(flags); err != nil {
		logrus.WithError(err).Fatal("Couldn't bind start flags")
	}
}
@@ -0,0 +1,21 @@
package cmd

import (
"fmt"

"github.com/spf13/cobra"
)

// Build is the build identifier shown by the version command. It is set by
// Execute.
var Build string

// Version is the version shown by the version command. It is set by Execute.
var Version string

// version is the command printing the build identifier and the version on
// standard output.
var version = &cobra.Command{
	Use:   "version",
	Short: "Show build and version",
	Run: func(_ *cobra.Command, _ []string) {
		fmt.Printf("Build: %s\nVersion: %s\n", Build, Version)
	},
}
13 main.go
@@ -0,0 +1,13 @@
package main

import "github.com/Depado/launeparser/cmd"

// Version is the program version. It defaults to "unknown" and is meant to be
// overridden at link time (-X main.Version=...).
var Version = "unknown"

// Build is the build identifier. It defaults to "unknown" and is meant to be
// overridden at link time (-X main.Build=...).
var Build = "unknown"

// main hands the build information over to the command layer and runs it.
func main() {
	// cmd.Execute takes the build first and the version second; passing them
	// the other way round makes the version command print them swapped.
	cmd.Execute(Build, Version)
}
@@ -0,0 +1,96 @@
package models

import (
"fmt"
"net/http"
"os"
"path/filepath"
"sync"
"time"

"github.com/jaytaylor/html2text"

"github.com/sirupsen/logrus"
"github.com/spf13/viper"
)

// NewsPaper describes a single newspaper to scrape.
type NewsPaper struct {
	// Name identifies the newspaper; it is also used as the name of its
	// output sub-directory.
	Name string `mapstructure:"name"`
	// URL is the address fetched when scraping.
	URL string `mapstructure:"url"`
	// Output is the directory the dumps are written to. It is computed by
	// NewsPapers.CreateDirectories.
	Output string
}

// NewsPapers wraps the list of newspapers read from the configuration.
type NewsPapers struct {
	// NewsPapers is decoded from the "newspapers" configuration key.
	NewsPapers []*NewsPaper `mapstructure:"newspapers"`
}

// CreateDirectories creates the root output directory (the "output" setting)
// and one sub-directory per newspaper, and stores each sub-directory path in
// the newspaper's Output field. Any failure to create a directory is fatal.
//
// os.MkdirAll is used so that already existing directories are accepted,
// nested output paths work, and a path that exists but is not a directory is
// reported instead of being silently ignored.
func (ns *NewsPapers) CreateDirectories() {
	output := viper.GetString("output")
	if err := os.MkdirAll(output, os.ModePerm); err != nil {
		logrus.WithError(err).Fatal("Could not create output directory")
	}
	for _, n := range ns.NewsPapers {
		n.Output = filepath.Join(output, n.Name)
		if err := os.MkdirAll(n.Output, os.ModePerm); err != nil {
			logrus.WithError(err).WithField("newspaper", n.Name).Fatal("Could not create output directory")
		}
	}
}

// Scrape makes sure the output directories exist, then scrapes every
// newspaper of the list concurrently (one goroutine each) and returns once
// all of them are done.
func (ns *NewsPapers) Scrape() {
	logrus.Info("Started scraping")
	ns.CreateDirectories()

	var pending sync.WaitGroup
	pending.Add(len(ns.NewsPapers))
	for _, paper := range ns.NewsPapers {
		go func(p *NewsPaper) {
			defer pending.Done()
			p.Scrape()
		}(paper)
	}
	pending.Wait()

	logrus.Info("Done scraping")
}

// Scrape fetches the newspaper's URL, converts the HTML body to plain text
// and writes the result to a fresh dump file in the newspaper's output
// directory. Errors are logged, not returned: a failing newspaper must not
// prevent the others from being scraped.
func (n *NewsPaper) Scrape() {
	clog := logrus.WithField("newspaper", n.Name)

	c := &http.Client{
		Timeout: time.Second * 10,
	}
	resp, err := c.Get(n.URL)
	if err != nil {
		clog.WithError(err).Error("Couldn't scrape")
		return
	}
	defer resp.Body.Close()

	// Don't dump error pages as if they were the newspaper's content.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		clog.WithField("status", resp.Status).Error("Couldn't scrape")
		return
	}

	// Convert before creating the file so a failed conversion doesn't leave
	// an empty dump behind.
	out, err := html2text.FromReader(resp.Body)
	if err != nil {
		clog.WithError(err).Error("Couldn't convert page to text")
		return
	}

	fd, err := n.CreateDumpFile()
	if err != nil {
		clog.WithError(err).Error("Couldn't create dump file")
		return
	}
	if _, err = fd.WriteString(out); err != nil {
		clog.WithError(err).Error("Couldn't copy output")
	}
	// Close explicitly (rather than defer) so a close error is reported too.
	if err = fd.Close(); err != nil {
		clog.WithError(err).Error("Couldn't close dump file")
	}
}

// CreateDumpFile creates, inside the newspaper's output directory, a ".txt"
// file named after the current date and time (minute precision) and returns
// it opened for writing.
func (n *NewsPaper) CreateDumpFile() (*os.File, error) {
	stamp := time.Now().Format("2006-01-02_15:04")
	return os.Create(filepath.Join(n.Output, fmt.Sprintf("%s.txt", stamp)))
}
@@ -0,0 +1,41 @@
package router

import (
"fmt"
"net/http"

"github.com/gin-gonic/gin"
"github.com/sirupsen/logrus"
"github.com/spf13/viper"
)

// Run configures the gin router and serves it on the configured
// "server.host" and "server.port". The single route, GET /, answers with an
// "output.zip" attachment whose content is produced by zipto from the output
// directory. Failing to start the server is fatal.
func Run() {
	// Release mode unless debug was explicitly requested
	if debug := viper.GetBool("server.debug"); !debug {
		gin.SetMode(gin.ReleaseMode)
	}

	// Router initialization
	r := gin.Default()

	r.GET("/", func(c *gin.Context) {
		c.Writer.Header().Add("Content-Disposition", `attachment; filename="output.zip"`)
		c.Status(http.StatusOK)
		err := zipto(viper.GetString("output"), c.Writer)
		if err == nil {
			return
		}
		logrus.WithError(err).Error("Something went wrong")
	})

	// Run
	host, port := viper.GetString("server.host"), viper.GetInt("server.port")
	logrus.WithFields(logrus.Fields{
		"host": host,
		"port": port,
	}).Info("Starting server")

	if err := r.Run(fmt.Sprintf("%s:%d", host, port)); err != nil {
		logrus.WithError(err).Fatal("Couldn't start server")
	}
}

0 comments on commit f0a8d77

Please sign in to comment.
You can’t perform that action at this time.