/*
* Copyright (c) 2009 Concurrent, Inc.
*
* This work has been released into the public domain
* by the copyright holder. This applies worldwide.
*
* In case this is not legally possible:
* The copyright holder grants any entity the right
* to use this work for any purpose, without any
* conditions, unless such conditions are required by law.
*/
package cascading.hbase;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.HashSet;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.type.CoercibleType;
import cascading.util.Util;
/**
 * The HBaseScheme class is a {@link Scheme} subclass. It is used in conjunction
 * with an {@link HBaseTap} to read data from and write data to an HBase
 * cluster.
 *
 * @see HBaseTap
 */
@SuppressWarnings("serial")
public class HBaseScheme extends HBaseAbstractScheme {
/** Field LOG */
private static final Logger LOG = LoggerFactory
.getLogger(HBaseScheme.class);
/** String familyNames */
private String[] familyNames;
/** Field valueFields */
private Fields[] valueFields;
/** String columns */
private transient String[] columns;
/** Field fields */
private transient byte[][] fields;
private boolean isFullyQualified = false;
/**
* Constructor HBaseScheme creates a new HBaseScheme instance.
*
* @param keyFields
* of type Fields
* @param familyName
* of type String
* @param valueFields
* of type Fields
*/
public HBaseScheme(Fields keyFields, String familyName, Fields valueFields) {
this(keyFields, new String[] { familyName }, Fields.fields(valueFields));
}
/**
* Constructor HBaseScheme creates a new HBaseScheme instance.
*
* @param keyFields
* of type Fields
* @param familyNames
* of type String[]
* @param valueFields
* of type Fields[]
*/
public HBaseScheme(Fields keyFields, String[] familyNames,
Fields[] valueFields) {
this.keyField = keyFields;
// familyNames holds only the family names, not fully qualified column names.
this.familyNames = familyNames;
this.valueFields = valueFields;
setSourceSink(this.keyField, this.valueFields);
validate();
}
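/*
 * Usage sketch (illustrative, not part of the original source; the table,
 * family, and field names below are hypothetical). Assumes the
 * HBaseTap(tableName, scheme) constructor provided by this project:
 *
 *   Fields rowKey = new Fields("rowKey");
 *   HBaseScheme scheme = new HBaseScheme(rowKey,
 *       new String[] { "cf1", "cf2" },
 *       new Fields[] { new Fields("name", "age"), new Fields("score") });
 *   Tap tap = new HBaseTap("users", scheme);
 */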
/**
* Method getFamilyNames returns the set of familyNames of this HBaseScheme
* object.
*
* @return the familyNames (type String[]) of this HBaseScheme object.
*/
public String[] getFamilyNames() {
HashSet<String> familyNameSet = new HashSet<String>();
for (String familyName : familyNames) {
familyNameSet.add(familyName);
}
return familyNameSet.toArray(new String[0]);
}
@Override
public boolean source(FlowProcess<JobConf> flowProcess,
SourceCall<Object[], RecordReader> sourceCall) throws IOException {
Object key = sourceCall.getContext()[0];
Object value = sourceCall.getContext()[1];
boolean hasNext = sourceCall.getInput().next(key, value);
if (!hasNext) {
return false;
}
Tuple result = sourceGetTuple(key);
Result row = (Result) value;
for (int i = 0; i < this.familyNames.length; i++) {
String familyName = this.familyNames[i];
byte[] familyNameBytes = Bytes.toBytes(familyName);
Fields fields = this.valueFields[i];
for (int k = 0; k < fields.size(); k++) {
String fieldName = (String) fields.get(k);
byte[] fieldNameBytes = Bytes.toBytes(fieldName);
byte[] cellValue = row.getValue(familyNameBytes, fieldNameBytes);
// Bytes.toString(null) returns null, so a missing cell yields a null value;
// note this assumes string-encoded cells, so values sunk via the Writable
// path below will not round-trip through this method
Object deserialized = Bytes.toString(cellValue);
Type type = fields.getType(k);
if (type instanceof CoercibleType)
    deserialized = ((CoercibleType<?>) type).canonical(deserialized);
result.add(deserialized);
}
}
sourceCall.getIncomingEntry().setTuple(result);
return true;
}
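/*
 * Worked example of source() with hypothetical data, assuming
 * sourceGetTuple() seeds the tuple with the row key: for key field "rowKey",
 * family "cf1", and value fields {"name", "age"}, a Result for row "r1"
 * holding cf1:name="alice" and cf1:age="30" yields the tuple
 * ("r1", "alice", "30"). When a value field declares a CoercibleType, the
 * string is first passed through canonical(), e.g. "30" becomes the
 * Integer 30 for an int-typed field.
 */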
@SuppressWarnings("unchecked")
@Override
public void sink(FlowProcess<JobConf> flowProcess,
SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
Put put = sinkGetPut(tupleEntry);
for (int i = 0; i < valueFields.length; i++) {
Fields fieldSelector = valueFields[i];
TupleEntry values = tupleEntry.selectEntry(fieldSelector);
DataOutputBuffer dataOutputBuffer = new DataOutputBuffer();
for (int j = 0; j < values.getFields().size(); j++) {
String fieldName = values.getFields().get(j).toString();
Type fieldType = values.getFields().getType(j);
Tuple tuple = values.getTuple();
Object object = tuple.getObject(j);
byte[] objectInBytes;
if (object == null) {
    // store a missing value as an empty cell rather than skipping it
    objectInBytes = HConstants.EMPTY_BYTE_ARRAY;
} else if (object instanceof Writable) {
    // Writables serialize themselves via write(); copy the buffer contents
    Writable writable = (Writable) object;
    dataOutputBuffer.reset();
    writable.write(dataOutputBuffer);
    objectInBytes = new byte[dataOutputBuffer.getLength()];
    System.arraycopy(dataOutputBuffer.getData(), 0,
            objectInBytes, 0, dataOutputBuffer.getLength());
} else if (fieldType instanceof CoercibleType) {
    // coerce declared field types to their String form before encoding
    CoercibleType<?> coercible = (CoercibleType<?>) fieldType;
    objectInBytes = Bytes.toBytes(coercible.coerce(object, String.class).toString());
} else {
    // fall back to the value's toString() representation
    objectInBytes = Bytes.toBytes(object.toString());
}
put.add(Bytes.toBytes(familyNames[i]),
Bytes.toBytes(fieldName), objectInBytes);
}
}
OutputCollector collector = sinkCall.getOutput();
collector.collect(null, put);
}
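/*
 * Worked example of sink() with hypothetical data, assuming sinkGetPut()
 * keys the Put on the key field: for family "cf1" with value fields
 * {"name", "age"}, the outgoing entry ("r1", "alice", 30) becomes a Put
 * keyed on "r1" carrying cf1:name="alice" and cf1:age="30"; the Integer is
 * rendered through its CoercibleType (or toString()) before Bytes.toBytes(),
 * and a null value is written as an empty cell.
 */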
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess,
Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
setSinkInitFields(conf);
}
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess,
Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
// conf.setInputFormatClass(TableInputFormat.class);
String columns = getColumns();
setSourceInitFields(conf, columns);
LOG.debug("sourcing from columns: {}", columns);
}
private String getColumns() {
return Util.join(columns(this.familyNames, this.valueFields), " ");
}
private String[] columns(String[] familyNames, Fields[] fieldsArray) {
    if (columns != null)
        return columns;
    int size = 0;
    for (Fields fields : fieldsArray)
        size += fields.size();
    columns = new String[size];
    // flatten the per-family field names into one array, tracking a running
    // offset; indexing by i + j would collide (and leave gaps) once any
    // family holds more than one field
    int offset = 0;
    for (int i = 0; i < fieldsArray.length; i++) {
        Fields fields = fieldsArray[i];
        for (int j = 0; j < fields.size(); j++) {
            if (isFullyQualified)
                columns[offset++] = hbaseColumn((String) fields.get(j));
            else
                columns[offset++] = hbaseColumn(familyNames[i])
                        + (String) fields.get(j);
        }
    }
    return columns;
}
// builds the byte[][] form of the flattened column names
private byte[][] makeBytes(String[] familyNames, Fields[] fieldsArray) {
String[] columns = columns(familyNames, fieldsArray);
byte[][] bytes = new byte[columns.length][];
for (int i = 0; i < columns.length; i++)
bytes[i] = Bytes.toBytes(columns[i]);
return bytes;
}
private String hbaseColumn(String column) {
    // append the family/qualifier separator if the column name lacks one
    if (column.indexOf(":") < 0)
        return column + ":";
    return column;
}
}