Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DS-2952 SOLR full text indexing multiple bitstreams #1595

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions dspace-api/pom.xml
Expand Up @@ -505,6 +505,11 @@
<artifactId>contiperf</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.rometools</groupId>
<artifactId>rome-modules</artifactId>
Expand Down
11 changes: 6 additions & 5 deletions dspace-api/src/main/java/org/dspace/core/Utils.java
Expand Up @@ -16,16 +16,13 @@
import java.rmi.dgc.VMID;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Random;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Date;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.text.SimpleDateFormat;
import java.text.ParseException;
import com.coverity.security.Escape;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;

/**
Expand Down Expand Up @@ -413,4 +410,8 @@ public static synchronized String formatISO8601Date(Date d)
int rl = result.length();
return result.substring(0, rl-2) + ":" + result.substring(rl-2);
}

/**
 * Null-safe accessor for a collection, intended for use in for-each loops.
 *
 * @param collection the collection to guard; may be {@code null}
 * @param <E> the element type
 * @return {@code collection} itself when it is not {@code null}, otherwise
 *         the shared immutable empty list
 */
public static <E> Collection<E> emptyIfNull(Collection<E> collection) {
    if (collection != null) {
        return collection;
    }
    return Collections.<E>emptyList();
}
}

This file was deleted.

@@ -0,0 +1,212 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.discovery;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.Charsets;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.solr.common.util.ContentStreamBase;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.core.Context;

import javax.annotation.Nullable;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import static org.dspace.core.Utils.emptyIfNull;

/**
* Construct a <code>ContentStream</code> from a <code>File</code>
*/
public class FullTextContentStreams extends ContentStreamBase
{
private static final Logger log = Logger.getLogger(FullTextContentStreams.class);

public static final String FULLTEXT_BUNDLE = "TEXT";

protected final Context context;
protected List<FullTextBitstream> fullTextStreams;
protected BitstreamService bitstreamService;

public FullTextContentStreams(Context context, Item parentItem) throws SQLException {
this.context = context;
init(parentItem);
}

protected void init(Item parentItem) {
fullTextStreams = new LinkedList<>();

if(parentItem != null) {
sourceInfo = parentItem.getHandle();

//extracted full text is always extracted as plain text
contentType = "text/plain";

buildFullTextList(parentItem);
}
}

private void buildFullTextList(Item parentItem) {
// now get full text of any bitstreams in the TEXT bundle
// trundle through the bundles
List<Bundle> myBundles = parentItem.getBundles();

for (Bundle myBundle : emptyIfNull(myBundles)) {
if (StringUtils.equals(FULLTEXT_BUNDLE, myBundle.getName())) {
// a-ha! grab the text out of the bitstreams
List<Bitstream> bitstreams = myBundle.getBitstreams();

for (Bitstream fulltextBitstream : emptyIfNull(bitstreams)) {
fullTextStreams.add(new FullTextBitstream(sourceInfo, fulltextBitstream));

log.debug("Added BitStream: "
+ fulltextBitstream.getStoreNumber() + " "
+ fulltextBitstream.getSequenceID() + " "
+ fulltextBitstream.getName());
}
}
}
}

@Override
public String getName() {
return StringUtils.join(Iterables.transform(fullTextStreams, new Function<FullTextBitstream, String>() {
@Nullable
@Override
public String apply(@Nullable FullTextBitstream input) {
return input == null ? "" : input.getFileName();
}
}), ";");
}

@Override
public Long getSize() {
long result = 0;

if(CollectionUtils.isNotEmpty(fullTextStreams)) {
Iterable<Long> individualSizes = Iterables.transform(fullTextStreams, new Function<FullTextBitstream, Long>() {
@Nullable
@Override
public Long apply(@Nullable FullTextBitstream input) {
return input == null ? 0L : input.getSize();
}
});

for (Long size : individualSizes) {
result += size;
}
}

return result;
}

@Override
public Reader getReader() throws IOException {
return super.getReader();
}

@Override
public InputStream getStream() throws IOException {
try {
return new SequenceInputStream(new FullTextEnumeration(fullTextStreams.iterator()));
} catch (Exception e) {
log.error("Unable to add full text bitstreams to SOLR for item " + sourceInfo + ": " + e.getMessage(), e);
return new ByteArrayInputStream((e.getClass() + ": " + e.getMessage()).getBytes(StandardCharsets.UTF_8));
}
}

public boolean isEmpty() {
return CollectionUtils.isEmpty(fullTextStreams);
}

private BitstreamService getBitstreamService() {
if(bitstreamService == null) {
bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
}
return bitstreamService;
}

private class FullTextBitstream {
private String itemHandle;
private Bitstream bitstream;

public FullTextBitstream(final String parentHandle, final Bitstream file) {
this.itemHandle = parentHandle;
this.bitstream = file;
}

public String getContentType(final Context context) throws SQLException {
BitstreamFormat format = bitstream.getFormat(context);
return format == null ? null : StringUtils.trimToEmpty(format.getMIMEType());
}

public String getFileName() {
return StringUtils.trimToEmpty(bitstream.getName());
}

public long getSize() {
return bitstream.getSize();
}

public InputStream getInputStream() throws SQLException, IOException, AuthorizeException {
return getBitstreamService().retrieve(context, bitstream);
}

public String getItemHandle() {
return itemHandle;
}
}

private class FullTextEnumeration implements Enumeration<InputStream> {

private final Iterator<FullTextBitstream> fulltextIterator;

public FullTextEnumeration(final Iterator<FullTextBitstream> fulltextStreams) {
this.fulltextIterator = fulltextStreams;
}

public boolean hasMoreElements() {
return fulltextIterator.hasNext();
}

public InputStream nextElement() {
InputStream inputStream = null;
FullTextBitstream bitstream = null;

try {
bitstream = fulltextIterator.next();
inputStream = bitstream.getInputStream();
} catch (Exception e) {
log.warn("Unable to add full text bitstream " + (bitstream == null ? "NULL" :
bitstream.getFileName() + " for item " + bitstream.getItemHandle())
+ " to SOLR:" + e.getMessage(), e);

inputStream = new ByteArrayInputStream((e.getClass() + ": " + e.getMessage()).getBytes(StandardCharsets.UTF_8));
}

return inputStream == null ? null : new SequenceInputStream(
new ByteArrayInputStream("\n".getBytes(Charsets.UTF_8)), inputStream);
}
}

}