-
Notifications
You must be signed in to change notification settings - Fork 0
/
kdataset.py
167 lines (147 loc) · 3.88 KB
/
kdataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import re
import hashlib
import base64
import pathlib
import argparse
# Base URL of the Kaggle web site; every API request is built on top of it.
endpoint = "https://www.kaggle.com"
# Path prefix of the API version that is appended to the endpoint.
method = "/api/v1"
# Template of the dataset download route; `download` fills in both slugs
# with str.format before issuing the request.
url_path = "/datasets/download/{owner_slug}/{dataset_slug}"
# HTTP headers sent with the download request. The User-Agent string
# announces the client as a Swagger-Codegen Python client.
header_params = {
"Accept": "file",
"User-Agent": "Swagger-Codegen/1/python"
}
class KaggleDatasetDownloader(object):
    """Download Kaggle datasets over HTTP using API-token credentials.

    One ``requests.Session`` configured with HTTP basic authentication
    (``user``, ``key``) is created per instance and reused by ``download``.
    """

    # Number of bytes fetched and written per iteration while streaming.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self, user, key, verbose=False):
        """Store the credentials and prepare an authenticated session.

        Args:
            user: Kaggle user name.
            key: Kaggle API token belonging to ``user``.
            verbose: When true, progress details are printed to stdout.
        """
        super().__init__()
        self.user = user
        self.key = key
        self.verbose = verbose
        self.session = requests.Session()
        self.session.auth = (self.user, self.key)

    def download(self, url, path=None, filename=None, check_md5=False):
        """Download the dataset referenced by ``url`` to a local file.

        Args:
            url: Kaggle dataset page URL (see ``_get_data_from_url``).
            path: Destination directory; the current directory when falsy.
            filename: Destination file name. When falsy, the name is taken
                from the ``Content-Disposition`` response header, falling
                back to ``"dataset.downloaded"``.
            check_md5: When true, compare the MD5 of the received bytes with
                the ``md5=`` entry of the ``x-goog-hash`` response header.

        Returns:
            ``False`` when the server answers with a non-OK status or the
            MD5 comparison fails; ``True`` otherwise (including the case
            where a check was requested but the server sent no MD5).

        Raises:
            ValueError: If ``url`` is not a recognised Kaggle URL.
            OSError: If the destination file cannot be opened for writing.
        """
        owner_slug, dataset_slug = self._get_data_from_url(url)
        if self.verbose:
            print("URL:", url)
            print("Connecting...")
        # stream=True keeps the body out of memory until it is iterated, so
        # large datasets are written to disk chunk by chunk.
        web = self.session.get(
            endpoint + method + url_path.format(
                owner_slug=owner_slug,
                dataset_slug=dataset_slug),
            headers=header_params,
            stream=True)
        try:
            if self.verbose:
                print(web.status_code, web.reason)
            if not web.ok:
                return False
            return self._save_response(web, path, filename, check_md5)
        finally:
            # A streamed response holds its connection until it is closed.
            web.close()

    def _save_response(self, web, path, filename, check_md5):
        """Write the body of ``web`` to disk and optionally verify its MD5."""
        content_disposition = web.headers.get("Content-Disposition", "")
        goog_hash = web.headers.get("x-goog-hash", "")
        if self.verbose:
            print("Content-Disposition:", content_disposition)
            print("x-goog-hash:", goog_hash)

        if not filename:
            filename = self._filename_from_header(content_disposition)

        expected_md5 = None
        if check_md5:
            found = re.search(r'(?<=(md5\=))([a-zA-Z0-9=\+\/]+)', goog_hash)
            if found:
                # The header carries the digest Base64-encoded.
                encoded = found.group().encode()
                if self.verbose:
                    print(f"md5: {encoded}")
                expected_md5 = base64.b64decode(encoded)
                if self.verbose:
                    print(f"md5: {expected_md5}")

        filepath = pathlib.Path(path or "", filename)
        if self.verbose:
            print(f"downloading to {filepath}...")

        # Hash while writing so the file never has to be read back in full.
        digest = hashlib.md5() if expected_md5 is not None else None
        with open(filepath, "wb") as file:
            for part in web.iter_content(chunk_size=self._CHUNK_SIZE):
                file.write(part)
                if digest is not None:
                    digest.update(part)

        if not check_md5:
            return True
        if expected_md5 is None:
            print("MD5 check has not been done.")
            return True
        matches = digest.digest() == expected_md5
        print("MD5", "is" if matches else "is not", "correct.")
        return matches

    @staticmethod
    def _filename_from_header(content_disposition):
        """Return the file name advertised by a Content-Disposition value.

        Both ``filename=name`` and ``filename="name"`` are understood;
        ``"dataset.downloaded"`` is returned when no name is present.
        """
        found = re.search(r'filename="?([\w\.\-]+)', content_disposition)
        return found.group(1) if found else "dataset.downloaded"

    def _get_data_from_url(self, url):
        """Extract ``(owner_slug, dataset_slug)`` from a Kaggle dataset URL.

        Accepted forms, with or without scheme and ``www.``:
        ``kaggle.com/<owner>/<dataset>`` and
        ``kaggle.com/datasets/<owner>/<dataset>``.

        Raises:
            ValueError: If ``url`` does not match any accepted form.
        """
        searched = re.match(
            r"(?:https?://)?(?:www\.)?kaggle\.com/"
            r"(?:datasets/)?([\w\-]+)/([\w\-]+)",
            url)
        if not searched:
            raise ValueError("URL not valid")
        owner_slug, dataset_slug = searched.groups()
        return owner_slug, dataset_slug
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, run one download and
    # report the outcome through a message and the process exit status.

    # Imported here because only the script entry point needs it; reaching
    # os through `pathlib.os` depends on a pathlib implementation detail.
    import os

    parser = argparse.ArgumentParser(
        description="This simple software lets you download datasets from Kaggle using your credentials (API Token).")
    parser.add_argument(
        "url",
        help="Kaggle URL")
    parser.add_argument(
        "user",
        help="Kaggle user")
    parser.add_argument(
        "key",
        help="Your API Token")
    parser.add_argument(
        "filepath",
        nargs="?",
        default=None,
        help="The filepath")
    # store_true makes these real flags: they never consume the argument
    # that follows them, so they may be placed before the positionals.
    parser.add_argument(
        "--check-md5",
        dest="check_md5",
        action="store_true",
        help="Enable check MD5")
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Enable verbose")
    args = parser.parse_args()

    kdd = KaggleDatasetDownloader(args.user, args.key, args.verbose)
    # Split the optional target into directory and file name; an empty
    # component lets `download` fall back to its own defaults.
    success = kdd.download(
        args.url,
        os.path.dirname(args.filepath) if args.filepath else None,
        os.path.basename(args.filepath) if args.filepath else None,
        args.check_md5)
    if success:
        print("downloading successfully.")
    else:
        print("downloading failed")
        # Signal the failure to the calling shell as well.
        raise SystemExit(1)