forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
seq_dataset.pyx
178 lines (150 loc) · 6.95 KB
/
seq_dataset.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# encoding: utf-8
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
#
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#
# Licence: BSD 3 clause
import numpy as np
cimport numpy as np
cimport cython
np.import_array()
cdef class SequentialDataset:
    """Abstract interface for datasets that are read one example at a time.

    Both methods defined here only raise ``NotImplementedError``; concrete
    datasets are expected to override them.
    """

    # NOTE(review): both methods are ``cdef void`` with no ``except`` clause
    # visible here, so the NotImplementedError may be reported by Cython
    # rather than propagated to the caller -- confirm against the .pxd.

    cdef void next(self, DOUBLE **x_data_ptr, INTEGER **x_ind_ptr,
                   int *nnz, DOUBLE *y, DOUBLE *sample_weight):
        """Fetch the next example ``x`` and hand it back via output pointers.

        Parameters
        ----------
        x_data_ptr : np.float64**
            Output; set to point at the double array holding the feature
            values of the next example.

        x_ind_ptr : np.int32**
            Output; set to point at the int32 array holding the feature
            indices of the next example.

        nnz : int*
            Output; receives the number of non-zero values of the next
            example.

        y : np.float64*
            Output; receives the target value of the next example.

        sample_weight : np.float64*
            Output; receives the weight of the next example.
        """
        raise NotImplementedError()

    cdef void shuffle(self, seed):
        """Permute the order in which examples are visited.

        Parameters
        ----------
        seed : object
            Seed for the permutation; unused by this base implementation.
        """
        raise NotImplementedError()
cdef class ArrayDataset(SequentialDataset):
    """Dataset backed by a two-dimensional numpy array.

    The dtype of the numpy array is expected to be ``np.float64``
    and C-style memory layout.
    """

    def __cinit__(self, np.ndarray[DOUBLE, ndim=2, mode='c'] X,
                  np.ndarray[DOUBLE, ndim=1, mode='c'] Y,
                  np.ndarray[DOUBLE, ndim=1, mode='c'] sample_weights):
        """A ``SequentialDataset`` backed by a two-dimensional numpy array.

        Only raw pointers into ``X``, ``Y`` and ``sample_weights`` are
        stored by this constructor; no reference to the arrays themselves
        is kept here, so the caller must keep them alive for as long as
        the dataset is used.

        Parameters
        ----------
        X : ndarray, dtype=np.float64, ndim=2, mode='c'
            The samples; a two-dimensional c-continuous numpy array of
            dtype np.float64.

        Y : ndarray, dtype=np.float64, ndim=1, mode='c'
            The target values; a one-dimensional c-continuous numpy array of
            dtype np.float64.

        sample_weights : ndarray, dtype=np.float64, ndim=1, mode='c'
            The weight of each sample; a one-dimensional c-continuous numpy
            array of dtype np.float64.
        """
        self.n_samples = X.shape[0]
        self.n_features = X.shape[1]

        # Dense rows use every column, so one shared 0..n_features-1 index
        # array serves as the feature indices of every example.
        cdef np.ndarray[INTEGER, ndim=1, mode='c'] feature_indices = np.arange(
            0, self.n_features, dtype=np.int32)
        self.feature_indices = feature_indices
        self.feature_indices_ptr = <INTEGER *> feature_indices.data

        self.current_index = -1
        # Row stride expressed in elements rather than bytes.  Floor
        # division keeps the result integral regardless of the division
        # semantics in effect for ``/``.
        self.stride = X.strides[0] // X.itemsize
        self.X_data_ptr = <DOUBLE *> X.data
        self.Y_data_ptr = <DOUBLE *> Y.data
        self.sample_weight_data = <DOUBLE *> sample_weights.data

        # Use index array for fast shuffling
        cdef np.ndarray[INTEGER, ndim=1, mode='c'] index = np.arange(
            0, self.n_samples, dtype=np.int32)
        self.index = index
        self.index_data_ptr = <INTEGER *> index.data

    cdef void next(self, DOUBLE **x_data_ptr, INTEGER **x_ind_ptr,
                   int *nnz, DOUBLE *y, DOUBLE *sample_weight):
        """Advance to the next example and expose it via the output pointers.

        The position wraps back to the first entry of the index array once
        the last one has been returned.  ``x_data_ptr[0]`` is set to the
        start of the selected row of ``X``, ``x_ind_ptr[0]`` to the shared
        feature-index array and ``nnz[0]`` to ``n_features``; ``y[0]`` and
        ``sample_weight[0]`` receive the target and weight of that row.
        """
        cdef int current_index = self.current_index
        if current_index >= (self.n_samples - 1):
            current_index = -1

        current_index += 1
        # Widen before multiplying: in plain ``int`` arithmetic the product
        # ``sample_idx * stride`` overflows once X holds more than
        # 2**31 - 1 elements, yielding a bogus pointer offset.
        cdef Py_ssize_t sample_idx = self.index_data_ptr[current_index]
        cdef Py_ssize_t offset = sample_idx * self.stride

        y[0] = self.Y_data_ptr[sample_idx]
        x_data_ptr[0] = self.X_data_ptr + offset
        x_ind_ptr[0] = self.feature_indices_ptr
        nnz[0] = self.n_features
        sample_weight[0] = self.sample_weight_data[sample_idx]

        self.current_index = current_index

    cdef void shuffle(self, seed):
        """Shuffle the index array in place using a RandomState built from ``seed``."""
        np.random.RandomState(seed).shuffle(self.index)
cdef class CSRDataset(SequentialDataset):
    """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """

    def __cinit__(self, np.ndarray[DOUBLE, ndim=1, mode='c'] X_data,
                  np.ndarray[INTEGER, ndim=1, mode='c'] X_indptr,
                  np.ndarray[INTEGER, ndim=1, mode='c'] X_indices,
                  np.ndarray[DOUBLE, ndim=1, mode='c'] Y,
                  np.ndarray[DOUBLE, ndim=1, mode='c'] sample_weight):
        """Build the dataset from the three arrays of a CSR matrix.

        For an example ``x`` returned by ``next``, the feature indices are
        ``x_ind_ptr[0:nnz]`` and the matching feature values are
        ``x_data_ptr[0:nnz]``.

        Parameters
        ----------
        X_data : ndarray, dtype=np.float64, ndim=1, mode='c'
            The data array of the CSR matrix; one-dimensional,
            c-continuous, dtype np.float64.

        X_indptr : ndarray, dtype=np.int32, ndim=1, mode='c'
            The index pointer array of the CSR matrix; one-dimensional,
            c-continuous, dtype np.int32.

        X_indices : ndarray, dtype=np.int32, ndim=1, mode='c'
            The column indices array of the CSR matrix; one-dimensional,
            c-continuous, dtype np.int32.

        Y : ndarray, dtype=np.float64, ndim=1, mode='c'
            The target values; one-dimensional, c-continuous,
            dtype np.float64.

        sample_weight : ndarray, dtype=np.float64, ndim=1, mode='c'
            The weight of each sample; one-dimensional, c-continuous,
            dtype np.float64.
        """
        # Identity permutation over the samples; shuffling reorders this
        # array instead of touching the CSR buffers.
        cdef np.ndarray[INTEGER, ndim=1, mode='c'] index

        self.n_samples = Y.shape[0]
        self.current_index = -1

        # Raw pointers into the CSR buffers.
        self.X_data_ptr = <DOUBLE *> X_data.data
        self.X_indptr_ptr = <INTEGER *> X_indptr.data
        self.X_indices_ptr = <INTEGER *> X_indices.data

        # Raw pointers to the targets and per-sample weights.
        self.Y_data_ptr = <DOUBLE *> Y.data
        self.sample_weight_data = <DOUBLE *> sample_weight.data

        index = np.arange(0, self.n_samples, dtype=np.int32)
        self.index = index
        self.index_data_ptr = <INTEGER *> index.data

    cdef void next(self, DOUBLE **x_data_ptr, INTEGER **x_ind_ptr,
                   int *nnz, DOUBLE *y, DOUBLE *sample_weight):
        """Step to the next row and publish it through the output pointers.

        After the last entry of the index array has been served, the
        position starts over at the first entry.
        """
        cdef int position = self.current_index
        cdef int row
        cdef int row_start

        if position < (self.n_samples - 1):
            position += 1
        else:
            position = 0

        row = self.index_data_ptr[position]
        # indptr[row] .. indptr[row + 1] delimits this row's slice of the
        # data / indices arrays.
        row_start = self.X_indptr_ptr[row]

        x_data_ptr[0] = self.X_data_ptr + row_start
        x_ind_ptr[0] = self.X_indices_ptr + row_start
        nnz[0] = self.X_indptr_ptr[row + 1] - row_start
        y[0] = self.Y_data_ptr[row]
        sample_weight[0] = self.sample_weight_data[row]

        self.current_index = position

    cdef void shuffle(self, seed):
        """Reorder the index array in place with a RandomState seeded by ``seed``."""
        rng = np.random.RandomState(seed)
        rng.shuffle(self.index)