core/src/main/java/com/digitalpebble/behemoth/DocumentFilter.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.digitalpebble.behemoth;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Filters used by Mappers / Reducers to skip Behemoth documents based on their
 * metadata. Can have either positive or negative filters but not both. The
 * filters values are regular expressions and the document will be kept or
 * skipped if it matches ANY of the filters in OR mode or all the constraints if
 * document.filter.md.mode is set to 'AND'. It can filter based on the document
 * URL and or mime-type using regular expressions.
 **/
public class DocumentFilter {

    private static final Logger LOG = LoggerFactory
            .getLogger(DocumentFilter.class);

    public static final String DocumentFilterParamNamePrefixKeep = "document.filter.md.keep.";
    public static final String DocumentFilterParamNameMode = "document.filter.md.mode";
    public static final String DocumentFilterParamNamePrefixSkip = "document.filter.md.skip.";
    public static final String DocumentFilterParamNameURLFilterKeep = "document.filter.url.keep";
    public static final String DocumentFilterParamNameMimeTypeFilterKeep = "document.filter.mimetype.keep";
    public static final String DocumentFilterParamNameLength = "document.filter.max.content.length";

    private Map<String, String> KVpatterns = new HashMap<String, String>();

    private boolean negativeMode = true;

    private Pattern URLRegex;

    private Pattern MimetypeRegex;

    private int maxContentLength = -1;

    private String medataMode = "AND";

    /**
     * Checks whether any filters have been specified in the configuration
     **/
    public static boolean isRequired(Configuration conf) {
        DocumentFilter filter = DocumentFilter.getFilters(conf);
        if (filter.KVpatterns.size() > 0)
            return true;
        if (filter.URLRegex != null)
            return true;
        if (filter.MimetypeRegex != null)
            return true;
        if (filter.maxContentLength != -1)
            return true;
        return false;
    }

    /** Builds a document filter given a Configuration object **/
    public static DocumentFilter getFilters(Configuration conf) {
        // extracts the patterns
        Map<String, String> PositiveKVpatterns = conf
                .getValByRegex(DocumentFilterParamNamePrefixKeep + ".+");
        Map<String, String> NegativeKVpatterns = conf
                .getValByRegex(DocumentFilterParamNamePrefixSkip + ".+");

        Map<String, String> tmpMap;

        DocumentFilter filter = new DocumentFilter();

        filter.medataMode = conf.get(DocumentFilterParamNameMode, "AND");

        // has to be either prositive or negative but not both
        if (PositiveKVpatterns.size() > 0 && NegativeKVpatterns.size() > 0) {
            throw new RuntimeException(
                    "Can't have positive AND negative document filters - check your configuration");
        } else if (PositiveKVpatterns.size() > 0) {
            filter.negativeMode = false;
            tmpMap = PositiveKVpatterns;
        } else {
            filter.negativeMode = true;
            tmpMap = NegativeKVpatterns;
        }

        // normalise the keys
        Iterator<Entry<String, String>> kviter = tmpMap.entrySet().iterator();
        while (kviter.hasNext()) {
            Entry<String, String> ent = kviter.next();
            String k = ent.getKey();
            String v = ent.getValue();
            k = k.substring(DocumentFilterParamNamePrefixKeep.length());

            StringBuilder message = new StringBuilder();
            if (filter.negativeMode)
                message.append("Negative ");
            else
                message.append("Positive ");
            message.append("filter found : ").append(k).append(" = ").append(v);
            LOG.info(message.toString());

            filter.KVpatterns.put(k, v);
        }

        String URLPatternS = conf.get(DocumentFilterParamNameURLFilterKeep, "");
        if (URLPatternS.length() > 0) {
            try {
                filter.URLRegex = Pattern.compile(URLPatternS);
            } catch (PatternSyntaxException e) {
                filter.URLRegex = null;
                LOG.error("Can't create regular expression for URL from "
                        + URLPatternS);
            }
        }

        String MTPatternS = conf.get(DocumentFilterParamNameMimeTypeFilterKeep,
                "");
        if (MTPatternS.length() > 0) {
            try {
                filter.MimetypeRegex = Pattern.compile(MTPatternS);
            } catch (PatternSyntaxException e) {
                filter.MimetypeRegex = null;
                LOG.error("Can't create regular expression for MimeType from "
                        + MTPatternS);
            }
        }

        filter.maxContentLength = conf
                .getInt(DocumentFilterParamNameLength, -1);

        return filter;
    }

    /** Returns true if the document can be kept, false otherwise **/
    public boolean keep(BehemothDocument input) {
        // filter if null
        if (input == null)
            return false;

        // check length content
        if (input.getContent() != null && maxContentLength != -1) {
            if (input.getContent().length > maxContentLength)
                return false;
        }

        // check on the URL
        if (URLRegex != null) {
            if (input.getUrl() == null)
                return false;
            boolean match = URLRegex.matcher(input.getUrl()).matches();
            if (!match)
                return false;
        }

        // check on the MimeType
        if (MimetypeRegex != null) {
            if (input.getContentType() == null)
                return false;
            boolean match = MimetypeRegex.matcher(input.getContentType())
                    .matches();
            if (!match)
                return false;
        }

        MapWritable metadata = input.getMetadata();
        // no rules at all -> fine!
        if (KVpatterns.size() == 0)
            return true;

        // document MUST have a certain value to be kept
        if (metadata == null || metadata.isEmpty()) {
            if (!negativeMode)
                return false;
            else
                return true;
        }

        boolean hasMatch = false;

        // find common keys between filters and content of doc
        boolean matchesAll = true;
        Iterator<String> kiter = KVpatterns.keySet().iterator();
        while (kiter.hasNext()) {
            String k = kiter.next();
            String regex = KVpatterns.get(k);
            // see if we have a metadata for that key
            Writable value = metadata.get(new Text(k));
            if (value == null) {
                matchesAll = false;
                continue;
            }
            if (value.toString().matches(regex)) {
                hasMatch = true;
            } else
                matchesAll = false;
        }

        boolean successMatching = false;

        if (medataMode.equalsIgnoreCase("AND")) {
            if (matchesAll)
                successMatching = true;
        } else if (hasMatch)
            successMatching = true;

        if (successMatching) {
            return (!negativeMode);
        }

        // no negative rule matching
        if (negativeMode)
            return true;

        // no positive rule matching
        return false;
    }

}