Merged develop on master for v0.5

MickaelWalter · Jun 1, 2020 · 99e7226 · 99e7226
2 parents 0d9998c + 3233660
commit 99e7226
Show file tree

Hide file tree

Showing 11 changed files with 1,919 additions and 210 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 */__pycache__/*
+.venv/*
diff --git a/README.md b/README.md
@@ -11,6 +11,10 @@ information and enumerate every user, post, comment, media and more.
 This makes it possible to get information about sensitive files or pages which
 may not be protected enough from external access.
 
+WPJsonScraper has 2 operation modes: command line arguments and interactive.
+The latter offers a command prompt that allows more complex operations on
+the WP-JSON API.
+
 ## Prerequisites
 
 WPJsonScraper is written in Python and should work with any Python 3
@@ -25,6 +29,12 @@ Just clone the repository with git and run `pip install -r requirements.txt`
 
 ## Usage
 
+### Interactive mode
+
+See [Interactive mode](doc/Interactive.md) for more details.
+
+### Command line arguments mode
+
 The tool needs the definition of a target WordPress installation and a flag
 instructing which action to do.
 
@@ -71,7 +81,7 @@ Example:
 Using the -r option, you can crawl collections of the specified namespace. This
 allows you to get a set of objects from the API and maybe confidential data ;)
 
-## Search feature
+#### Search feature
 
 The WordPress WP-JSON API allows searching in posts, pages, media objects, tags,
 categories, comments and users.

diff --git a/WPJsonScraper.py b/WPJsonScraper.py
@@ -25,6 +25,7 @@
 import argparse
 import requests
 import re
+import os
 
 from lib.console import Console
 from lib.wpapi import WPApi
@@ -33,8 +34,9 @@
                             NSNotFoundException
 from lib.exporter import Exporter
 from lib.requestsession import RequestSession
+from lib.interactive import start_interactive
 
-version = '0.4.1'
+version = '0.5'
 
 def main():
     parser = argparse.ArgumentParser(description=
@@ -113,6 +115,10 @@ def main():
                         dest='comment_export_folder',
                         action='store',
                         help='export comments to a specified destination folder')
+    parser.add_argument('--download-media',
+                        dest='media_folder',
+                        action='store',
+                        help='download media to the designated folder')
     parser.add_argument('-r',
                         '--crawl-ns',
                         dest='crawl_ns',
@@ -151,6 +157,10 @@ def main():
                         dest='nocolor',
                         action='store_true',
                         help='remove color in the output (e.g. to pipe it)')
+    parser.add_argument('--interactive',
+                        dest='interactive',
+                        action='store_true',
+                        help='start an interactive session')
 
 
     args = parser.parse_args()
@@ -205,8 +215,10 @@ def main():
     session = RequestSession(proxy=proxy, cookies=cookies,
       authorization=authorization)
     try:
+        session.get(target)
         Console.log_success("Connection OK")
     except Exception as e:
+        Console.log_error("Failed to connect to the server")
         exit(0)
 
     # Quite an ugly check to launch a search on all parameters edible 
@@ -221,6 +233,10 @@ def main():
         args.tags = True
         args.media = True
 
+    if args.interactive:
+        start_interactive(target, session, version)
+        return
+
     scanner = WPApi(target, session=session, search_terms=args.search)
     if args.info or args.all:
         try:
@@ -238,23 +254,23 @@ def main():
                 Console.log_info("Post list with comments")
             else:
                 Console.log_info("Post list")
-            posts_list = scanner.get_all_posts(args.comments)
+            posts_list = scanner.get_posts(args.comments)
             InfoDisplayer.display_posts(posts_list, scanner.get_orphans_comments())
         except WordPressApiNotV2:
             Console.log_error("The API does not support WP V2")
 
     if args.pages or args.all:
         try:
             Console.log_info("Page list")
-            pages_list = scanner.get_all_pages()
+            pages_list = scanner.get_pages()
             InfoDisplayer.display_pages(pages_list)
         except WordPressApiNotV2:
             Console.log_error("The API does not support WP V2")
 
     if args.users or args.all:
         try:
             Console.log_info("User list")
-            users_list = scanner.get_all_users()
+            users_list = scanner.get_users()
             InfoDisplayer.display_users(users_list)
         except WordPressApiNotV2:
             Console.log_error("The API does not support WP V2")
@@ -272,23 +288,24 @@ def main():
     if args.categories or args.all:
         try:
             Console.log_info("Category list")
-            categories_list = scanner.get_all_categories()
+            categories_list = scanner.get_categories()
             InfoDisplayer.display_categories(categories_list)
         except WordPressApiNotV2:
             Console.log_error("The API does not support WP V2")
 
     if args.tags or args.all:
         try:
             Console.log_info("Tags list")
-            tags_list = scanner.get_all_tags()
+            tags_list = scanner.get_tags()
             InfoDisplayer.display_tags(tags_list)
         except WordPressApiNotV2:
             Console.log_error("The API does not support WP V2")
 
+    media_list = None
     if args.media or args.all:
         try:
             Console.log_info("Media list")
-            media_list = scanner.get_all_media()
+            media_list = scanner.get_media()
             InfoDisplayer.display_media(media_list)
         except WordPressApiNotV2:
             Console.log_error("The API does not support WP V2")
@@ -311,12 +328,12 @@ def main():
 
     if args.post_export_folder is not None:
         try:
-            posts_list = scanner.get_all_posts()
-            tags_list = scanner.get_all_tags()
-            categories_list = scanner.get_all_categories()
-            users_list = scanner.get_all_users()
+            posts_list = scanner.get_posts()
+            tags_list = scanner.get_tags()
+            categories_list = scanner.get_categories()
+            users_list = scanner.get_users()
             print()
-            post_number = Exporter.export_posts(posts_list,
+            post_number = Exporter.export_posts_html(posts_list,
              args.post_export_folder,
              tags_list,
              categories_list,
@@ -329,10 +346,10 @@ def main():
 
     if args.page_export_folder is not None:
         try:
-            pages_list = scanner.get_all_pages()
-            users_list = scanner.get_all_users()
+            pages_list = scanner.get_pages()
+            users_list = scanner.get_users()
             print()
-            page_number = Exporter.export_posts(pages_list,
+            page_number = Exporter.export_posts_html(pages_list,
              args.page_export_folder,
              None,
              None,
@@ -345,7 +362,7 @@ def main():
 
     if args.comment_export_folder is not None:
         try:
-            post_list = scanner.get_all_posts(True)
+            post_list = scanner.get_posts(True)
             orphan_list = scanner.get_orphans_comments()
             print()
             page_number = Exporter.export_comments(post_list, orphan_list, args.comment_export_folder)
@@ -355,6 +372,23 @@ def main():
         except WordPressApiNotV2:
             Console.log_error("The API does not support WP V2")
 
+    if args.media_folder is not None:
+        Console.log_info("Downloading media files")
+        if not os.path.isdir(args.media_folder):
+            Console.log_error("The destination is not a folder or does not exist")
+        else:
+            print("Pulling the media URLs")
+
+            media, _ = scanner.get_media_urls('all', True)
+            if len(media) == 0:
+                Console.log_error("No media found")
+                return
+            print("%d media URLs found" % len(media))
+
+            print("Note: Only files over 10MB are logged here")
+            number_downloaded = Exporter.download_media(media, args.media_folder)
+            Console.log_success('Downloaded %d media to %s' % (number_downloaded, args.media_folder))
+
 
 if __name__ == "__main__":
     main()
diff --git a/doc/Interactive.md b/doc/Interactive.md
@@ -0,0 +1,153 @@
+# Interactive mode
+
+To help with more complex interactions with WP-JSON API, WPJsonScraper implements an interactive mode.
+
+In interactive mode, the same session is used between requests, so all cookies set by the server and other parameters are kept
+from one request to another.
+
+Typing `command -h` or `command --help` will bring a detailed help message for specific commands.
+
+Tab autocompletes the command name, up and down browse the command history.
+
+## Commands
+
+### help
+
+Lists commands and displays a brief help message about specified commands.
+
+Example 1: display the command list
+
+    help
+
+Example 2: display a brief help message about the `show` command.
+
+    help show
+
+### exit
+
+Exits the interactive mode and goes back to the user's shell.
+
+### show
+
+Shows details about global parameters stored in WPJsonScraper memory.
+
+Example: show all parameters
+
+    show all
+
+### set
+
+Sets a specific global parameter. 
+
+Note that for proxies and cookies, the command updates the existing entries.
+Check the resulting parameter with `show` if you don't know what that means.
+
+**Note:** changing the target resets the cache but keeps proxies, cookies and authorization headers. Be aware 
+of data leakage risks. If you need to keep things apart between targets, relaunch WPJsonScraper or make sure 
+all is correctly set up with the `show all` command.
+
+Example 1: change the target
+
+    set target http://example.com
+
+Example 2: add or modify the cookies PHPSESSID and JSESSIONID (because why not?)
+
+    set cookie "PHPSESSID=deadbeef; JSESSIONID=badc0ffee"
+
+### list
+
+Lists specified data from the server.
+
+This command gets data from the server and displays it as a simple list (with no details).
+
+It can also export the full scraped data (with all available details) to a specified JSON or CSV file
+(see the --csv and --json options). If a file extension is not specified, WPJsonScraper will append one.
+The export options will try to join data with other API endpoint data (e.g. users with posts). In CSV files,
+most of the data is removed to ensure human readability. Use this option only to export a list of
+posts.
+
+**Note:** to avoid having too much noise on the target, WPJsonScraper won't fetch automatically any other 
+endpoint to complete the exported data. If you want all information to be gathered, you have to build the 
+cache first by requesting the data beforehand (for example, getting the user list before exporting the posts).
+
+By default, WPJsonScraper caches data to avoid requesting the server too often. To get the latest updates,
+run this command with the --no-cache option.
+
+Use the --limit and --start options to retrieve a subset of all data selected.
+
+In the case of media files, the files themselves **are not downloaded**.
+
+Example 1: get all posts
+
+    list posts
+
+Example 2: get a maximum of 10 pages starting at page 15
+
+    list pages --start 15 --limit 10
+
+Example 3: export all listable content to JSON files (including for example all-data-posts.json)
+
+    list all --json all-data
+
+Example 4: list namespaces
+
+    list namespaces
+
+### fetch
+
+Fetches a specific piece of data from the server given its type and its ID. By default, if the data is cached, 
+the data is returned from the cache. Use the --no-cache argument to force its retrieval from the server.
+
+The data displayed is more complete than the data displayed by the list command. But some metadata is still not 
+displayed. Only the JSON export is a full data dump (with additional mapping when relevant).
+
+**Note:** like in the list function, the data that could complete the displayed information is not automatically 
+fetched. You have to get it into cache first or to fetch it separately based on its ID. Moreover, the data 
+retrieved by ID is not yet pushed into the cache. It may be in a later version.
+
+Example 1 : display the post with the ID 1
+
+    fetch post 1
+
+Example 2 : display the page with the ID 42 and export it in a JSON file, don't use the cache
+
+    fetch page 42 --no-cache
+
+### search
+
+Looks for data based on the specified keywords. This command doesn't use the cache and systematically uses the 
+WordPress API to do searches. One or several object types may be provided to narrow the search scope.
+
+Example 1: look for keyword test in all object types
+
+    search test
+
+Example 2: look for keyword foo in posts and pages
+
+    search --type post --type page foo
+
+Example 3: --limit and --start also work for search results
+
+    search --limit 5 --start 4 bar
+
+### dl
+
+Downloads media based on the provided ID. The ID can be specified as an integer (or list of integers), `all` or
+`cache`. In the first case, only media with the specified IDs will be downloaded. `all` will trigger a fetch from
+the API to list all media, then a download session for each file. `cache` will get media URLs from the cache and
+then download the files.
+
+Note that if all the IDs specified are in the cache, no lookup will be made on the API. If you want to override 
+this behaviour, set the `--no-cache` flag.
+
+Example 1: download the media with the IDs 42 and 63 to the current folder
+
+    dl 42,63 .
+
+Example 2: download all media to user's home folder
+
+    dl all /home/user
+
+Example 3: only media present in the cache (e.g. previously requested with list or fetch) are downloaded
+
+    dl cache .