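"""Download and cache the example datasets into a local 'datasets/' folder.

By default this script fetches the 20 newsgroups and Titanic datasets. The
larger sentiment140 and covertype datasets are only downloaded when their
names are passed on the command line, e.g.:

    python fetch_data.py sentiment140 covertype
"""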
import numpy as np
import os
try:
    from urllib import urlopen           # Python 2
except ImportError:
    from urllib.request import urlopen   # Python 3
import tarfile
import zipfile
import gzip
from sklearn.datasets import load_files
try:
    from sklearn.externals import joblib  # older scikit-learn releases
except ImportError:
    import joblib  # newer scikit-learn: joblib is a standalone package

TWENTY_URL = ("http://people.csail.mit.edu/jrennie/"
              "20Newsgroups/20news-bydate.tar.gz")
TWENTY_ARCHIVE_NAME = "20news-bydate.tar.gz"
TWENTY_CACHE_NAME = "20news-bydate.pkz"
TWENTY_TRAIN_FOLDER = "20news-bydate-train"
TWENTY_TEST_FOLDER = "20news-bydate-test"

SENTIMENT140_URL = ("http://cs.stanford.edu/people/alecmgo/"
                    "trainingandtestdata.zip")
SENTIMENT140_ARCHIVE_NAME = "trainingandtestdata.zip"

COVERTYPE_URL = ('http://archive.ics.uci.edu/ml/'
                 'machine-learning-databases/covtype/covtype.data.gz')

# Source: https://www.kaggle.com/c/titanic-gettingStarted/data
TITANIC_URL = ("https://dl.dropboxusercontent.com/"
               "u/5743203/data/titanic/titanic_train.csv")


def get_datasets_folder():
    """Return the path of the local 'datasets' folder, creating it if needed.

    If a 'datasets.zip' archive is found next to this script, it is extracted
    instead of creating an empty folder.
    """
    here = os.path.dirname(__file__)
    datasets_folder = os.path.abspath(os.path.join(here, 'datasets'))
    datasets_archive = os.path.abspath(os.path.join(here, 'datasets.zip'))

    if not os.path.exists(datasets_folder):
        if os.path.exists(datasets_archive):
            print("Extracting " + datasets_archive)
            zf = zipfile.ZipFile(datasets_archive)
            # Extract next to this script rather than the current working directory
            zf.extractall(here)
            assert os.path.exists(datasets_folder)
        else:
            print("Creating datasets folder: " + datasets_folder)
            os.makedirs(datasets_folder)
    else:
        print("Using existing dataset folder: " + datasets_folder)
    return datasets_folder


def check_twenty_newsgroups(datasets_folder):
    print("Checking availability of the 20 newsgroups dataset")
    archive_path = os.path.join(datasets_folder, TWENTY_ARCHIVE_NAME)
    train_path = os.path.join(datasets_folder, TWENTY_TRAIN_FOLDER)
    test_path = os.path.join(datasets_folder, TWENTY_TEST_FOLDER)

    if not os.path.exists(archive_path):
        print("Downloading dataset from %s (14 MB)" % TWENTY_URL)
        opener = urlopen(TWENTY_URL)
        open(archive_path, 'wb').write(opener.read())
    else:
        print("Found archive: " + archive_path)

    if not os.path.exists(train_path) or not os.path.exists(test_path):
        print("Decompressing %s" % archive_path)
        tarfile.open(archive_path, "r:gz").extractall(path=datasets_folder)

    print("Checking that the 20 newsgroups files exist...")
    assert os.path.exists(train_path)
    assert os.path.exists(test_path)
    print("=> Success!")


def check_sentiment140(datasets_folder):
    print("Checking availability of the sentiment 140 dataset")
    archive_path = os.path.join(datasets_folder, SENTIMENT140_ARCHIVE_NAME)
    sentiment140_path = os.path.join(datasets_folder, 'sentiment140')
    train_path = os.path.join(sentiment140_path,
                              'training.1600000.processed.noemoticon.csv')
    test_path = os.path.join(sentiment140_path,
                             'testdata.manual.2009.06.14.csv')

    if not os.path.exists(archive_path):
        print("Downloading dataset from %s (77 MB)" % SENTIMENT140_URL)
        opener = urlopen(SENTIMENT140_URL)
        open(archive_path, 'wb').write(opener.read())
    else:
        print("Found archive: " + archive_path)

    if not os.path.exists(sentiment140_path):
        print("Extracting %s to %s" % (archive_path, sentiment140_path))
        zf = zipfile.ZipFile(archive_path)
        zf.extractall(sentiment140_path)

    print("Checking that the sentiment 140 CSV files exist...")
    assert os.path.exists(train_path)
    assert os.path.exists(test_path)
    print("=> Success!")


def check_covertype(datasets_folder):
    print("Checking availability of the covertype dataset")
    archive_path = os.path.join(datasets_folder, 'covtype.data.gz')
    covtype_dir = os.path.join(datasets_folder, "covertype")
    samples_path = os.path.join(covtype_dir, "samples.pkl")
    targets_path = os.path.join(covtype_dir, "targets.pkl")

    if not os.path.exists(covtype_dir):
        os.makedirs(covtype_dir)

    if not os.path.exists(archive_path):
        print("Downloading dataset from %s (10.7 MB)" % COVERTYPE_URL)
        open(archive_path, 'wb').write(urlopen(COVERTYPE_URL).read())
    else:
        print("Found archive: " + archive_path)

    if not os.path.exists(samples_path) or not os.path.exists(targets_path):
        print("Parsing the data and splitting input and labels...")
        f = open(archive_path, 'rb')
        Xy = np.genfromtxt(gzip.GzipFile(fileobj=f), delimiter=',')
        X = Xy[:, :-1]
        y = Xy[:, -1].astype(np.int32)
        joblib.dump(X, samples_path)
        joblib.dump(y, targets_path)
    print("=> Success!")


def check_titanic(datasets_folder):
    print("Checking availability of the titanic dataset")
    csv_filename = os.path.join(datasets_folder, 'titanic_train.csv')

    if not os.path.exists(csv_filename):
        print("Downloading titanic data from %s" % TITANIC_URL)
        open(csv_filename, 'wb').write(urlopen(TITANIC_URL).read())
    print("=> Success!")


if __name__ == "__main__":
    import sys

    datasets_folder = get_datasets_folder()
    check_twenty_newsgroups(datasets_folder)
    check_titanic(datasets_folder)

    if 'sentiment140' in sys.argv:
        check_sentiment140(datasets_folder)

    if 'covertype' in sys.argv:
        check_covertype(datasets_folder)