Skip to content

Commit

Permalink
Merge pull request #2 from JJ/master
Browse files Browse the repository at this point in the history
Some changes in docs and file name
  • Loading branch information
raiben committed Jun 9, 2019
2 parents c535916 + ba36e8c commit 053be60
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 10 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tropescraper.egg-info/
__pycache__
scraper_cache/
*~
.cache
10 changes: 10 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
dist: xenial
language: python
python:
- "3.6"
- "3.7"
- "3.6-dev" # 3.6 development branch
- "3.7-dev" # 3.7 development branch
install:
- pip install -e .
script: pytest
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,20 @@
# tropescraper
A tropes scraper

A scraper for the TV Tropes website.


## Install

Install all dependencies with:

pip install -e .

(pip should be installed and available).

## Run

Run it with:

bin/scrape-tvtropes

It will take a good while, since it scrapes ~12k films.
9 changes: 4 additions & 5 deletions bin/scrape-tvtropes
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@ import os
from tropescraper.tvtropes_scraper import TVTropesScraper

if len(sys.argv) != 2:
command = sys.argv[0].split(os.sep)[-1]
print(f'Error: Invalid usage\nPlease execute \'{command} <target_file.json>\'')
sys.exit(1)

file_name = sys.argv[1]
file_name = "tvtropes.json"
else:

file_name = sys.argv[1]

logging.basicConfig(level=logging.INFO)
scraper = TVTropesScraper()
Expand Down
3 changes: 0 additions & 3 deletions tropescraper/cache_information.py

This file was deleted.

16 changes: 16 additions & 0 deletions tropescraper/test_web_page_retriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import unittest
from tropescraper.web_page_retriever import WebPageRetriever

class TestWebPageRetriever(unittest.TestCase):
    """Smoke tests for ``WebPageRetriever`` using a single TV Tropes film page.

    NOTE(review): ``retrieve()`` presumably fetches the live page when nothing
    is cached yet, so these tests likely need network access -- confirm.
    """

    def setUp(self):
        """Create a fresh retriever before every test method."""
        # NOTE(review): the positional arguments look like (wait time, page
        # URL, cache directory) -- confirm against WebPageRetriever.__init__.
        page_url = "https://tvtropes.org/pmwiki/pmwiki.php/Film/FantasticBeastsAndWhereToFindThem"
        self.scraper = WebPageRetriever(0.5, page_url, "/tmp")

    def test_class(self):
        """The object built in ``setUp`` is a ``WebPageRetriever``."""
        self.assertIsInstance(self.scraper, WebPageRetriever, "Correct class")

    def test_retrieve(self):
        """Two consecutive retrievals return the same, non-empty content."""
        first_fetch = self.scraper.retrieve()
        self.assertNotEqual(first_fetch, "", "Retrieves something")

        # The second call must yield exactly what the first one did; the
        # assertion message indicates it is expected to come from the cache.
        second_fetch = self.scraper.retrieve()
        self.assertEqual(first_fetch, second_fetch, "Retrieves from cache")
3 changes: 2 additions & 1 deletion tropescraper/web_page_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import os
from datetime import datetime
from time import sleep, ctime
from collections import namedtuple

import requests

from tropescraper.cache_information import CacheInformation
CacheInformation = namedtuple('CacheInformation', ['size', 'files_count', 'creation_date'])


class WebPageRetriever(object):
Expand Down

0 comments on commit 053be60

Please sign in to comment.