Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

510 lines (429 sloc) 20.487 kb
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Top-level makefile for reranking parser
# Mark Johnson, 24th October 2007
########################################################################
# #
# Summary #
# #
########################################################################
#
# To build just the reranking parser run-time, execute:
#
# make # builds reranking parser programs
#
# To retrain the reranking parser, run the following steps:
#
# make reranker # builds reranking parser and training programs
# make nbesttrain # builds 20 folds of n-best training parses
# make eval-reranker # extracts features, estimates weights, and evaluates
#
# The following high-level goals may also be useful:
#
# make nbesttrain-clean # removes temporary files used in nbesttrain
# make nbest-oracle # oracle evaluation of n-best results
# make features # extracts features from 20-fold parses
# make train-reranker # trains reranker model
# make train-clean # removes all temporary files used in training
#
# I typically run nbesttrain to produce the n-best parses
# To run 2 jobs in parallel (e.g. on a multiprocessor) run, e.g.,
#
# make -j 2 nbesttrain
#
# This really only helps with nbesttrain, since the other time consuming
# step (reranker feature weight estimation) isn't yet parallelized.
# The environment variable GCCFLAGS can be used to specify
# machine-dependent optimization flags, e.g.
#
# setenv GCCFLAGS "-march=pentium4"
#
# or
#
# setenv GCCFLAGS "-march=opteron -m64"
#
# The top-level make goal builds the reranking parser using a pre-trained
# model. To build this parser, just run
#
# make
#
# You may need to tweak the following variables to suit your environment
# GCCFLAGS is not set here, so we use the shell environment
# variable's value. But you can set it here if you want.
# Version 4.1 and later gcc permit -march=native, but older
# versions will need -march=pentium4 or -march=opteron
#
# GCCFLAGS = -march=native -mfpmath=sse -msse2 -mmmx -I <path-to-boost-libraries>
# CFLAGS holds the flags passed to every C and C++ compilation.  The
# contents of GCCFLAGS (see above) are appended last.
#
CFLAGS  = -MMD -O3 -Wall
CFLAGS += -ffast-math -finline-functions -fomit-frame-pointer -fstrict-aliasing
CFLAGS += $(GCCFLAGS)

# For debugging, uncomment the following CFLAGS and LDFLAGS; the plain
# assignment replaces everything accumulated above.
#
# CFLAGS = -g -O -MMD -Wall -ffast-math -fstrict-aliasing $(GCCFLAGS)
# LDFLAGS = -g -Wall
# Building the 20-fold training data with nbesttrain
# --------------------------------------------------
# For training the parser and reranker you will need your own copy of the
# Penn WSJ Treebank.
#
# PENNWSJTREEBANK is the base directory of your copy of the Penn WSJ
# Treebank; the two-digit section directories are looked up beneath it.
#
PENNWSJTREEBANK = /usr/local/data/Penn3/parsed/mrg/wsj/

# NPARSES is the number of alternative parses to consider for each
# sentence (handed to the n-best parser as -N$(NPARSES)).
#
NPARSES = 50

# NFOLDS is the number of folds.  FOLDS lists the two-digit fold numbers
# 00 .. NFOLDS-1 and must be kept in step with NFOLDS by hand.
#
NFOLDS = 20
FOLDS = 00 01 02 03 04 05 06 07 08 09 \
        10 11 12 13 14 15 16 17 18 19

# SECTIONS lists the treebank sections that are n-best parsed with
# Eugene's standard n-best parser, in addition to the folds.
#
SECTIONS = 22 23 24
# NBESTPARSER is the n-best parser executable.  If you change it, please
# change PARSERNICKNAME below as well.
#
NBESTPARSER = first-stage/PARSE/parseIt

# NBESTTRAINER is the program (probably a shell script) that trains the
# n-best parser.  If you change it, please change PARSERNICKNAME below
# as well.
#
NBESTTRAINER = first-stage/TRAIN/trainParser

# PARSERNICKNAME is a nickname for the n-best parser; it becomes part of
# the nbest, features, models and eval directory names.  If you
# experiment with several n-best parsers, give each one its own nickname.
#
PARSERNICKNAME = ec

# TMP is the scratch directory used while the folds are constructed and
# the training parses for the reranker are produced.  It can be deleted
# once nbesttrain has finished.  On an NFS system you may want to point
# it at a local directory.
#
TMP = tmp
# Extracting features from 20-fold n-best parses
# ----------------------------------------------
# VERSION selects the data split used for feature extraction and must be
# either "final" or "nonfinal".  The feature-extraction rules below use:
#
#   nonfinal  train = folds 00-17, dev = folds 18-19,
#             test1 = section 22,  test2 = section 24
#   final     train = all folds,   dev = section 24,
#             test1 = section 22,  test2 = section 23
#
VERSION = nonfinal
# VERSION = final
# FEATUREEXTRACTOR is the program used to extract features from the
# 20-fold n-best parses.  If you change it, please pick a new
# FEATURESNICKNAME below.
#
FEATUREEXTRACTOR = second-stage/programs/features/extract-spfeatures

# FEATUREEXTRACTORFLAGS are the flags given to the feature extractor.
#
FEATUREEXTRACTORFLAGS = -l -c -i -s 5

# FEATURESNICKNAME is an arbitrary string identifying one particular set
# of extracted features; it is part of the features and models directory
# names.  Several feature sets and their models can coexist as long as
# each has a unique FEATURESNICKNAME, so give a newly developed feature
# set a new nickname to keep it from overwriting existing features.
#
FEATURESNICKNAME = sp
# Estimating weights for features
# -------------------------------
# ESTIMATOR is the program that estimates feature weights from the
# feature counts.  According to the author this estimator gives the best
# performance; others (e.g., weighted perceptron) live in the same
# directory.  If you switch estimators, change ESTIMATORNICKNAME too.
#
ESTIMATOR = second-stage/programs/wlle/gavper

# ESTIMATORFLAGS are the flags given to the estimator.
#
ESTIMATORFLAGS = -a -n 10 -d 10 -F 1 -m 0

# ESTIMATORNICKNAME identifies the estimator configuration; it is used
# in the name of the weights file and of the eval directory.
#
ESTIMATORNICKNAME = gavper-aa
########################################################################
#
# You probably shouldn't need to change anything below here.
# TARGETS is the list of targets built when make is called
# without arguments.
#
TARGETS = PARSE reranker-runtime sparseval

# C++ code is compiled with the same flags as C code.  Both variables
# are exported so that the sub-makes started by the rules below see them.
#
CXXFLAGS = $(CFLAGS)
export CFLAGS CXXFLAGS

# Delete a target file whose recipe fails.  Many recipes in this file
# write their target through "> $@"; without this a failed step would
# leave a truncated file that looks up to date on the next run.
#
.DELETE_ON_ERROR:

# top is the first ordinary target in the file and therefore the default
# goal.
#
.PHONY: top
top: $(TARGETS)

# ZCAT decompresses to standard output.  "gunzip -c" is used instead of
# zcat because zcat on OS X behaves differently in some cases.
#
ZCAT = gunzip -c
# PARSE builds the n-best first-stage parser (i.e., Eugene's parser).
# The sub-make is started through $(MAKE) rather than a literal "make"
# so that flags such as -j and -n are passed down to it.
#
.PHONY: PARSE
PARSE:
	$(MAKE) -C first-stage/PARSE parseIt
# TRAIN builds the programs needed to train the first-stage parser.
# (The variable TRAIN defined further down is unrelated to this target;
# make keeps variable and target names apart.)
# The sub-make is started through $(MAKE) rather than a literal "make"
# so that flags such as -j and -n are passed down to it.
#
.PHONY: TRAIN
TRAIN:
	$(MAKE) -C first-stage/TRAIN all
# reranker-runtime builds the run-time components of the reranker:
#   best-parses, which reranks the n-best parses produced by the
#     first-stage parser, and
#   ptb, which converts Penn Treebank trees into the various formats
#     needed by Eugene's parser, the reranker training programs,
#     sparseval, etc.
# The sub-makes are started through $(MAKE) rather than a literal "make"
# so that flags such as -j and -n are passed down to them.
#
.PHONY: reranker-runtime
reranker-runtime:
	$(MAKE) -C second-stage/programs/features best-parses
	$(MAKE) -C second-stage/programs/prepare-data ptb
# reranker builds the training and run-time components of the reranker
# by building top and TRAIN and then running the default goal of the
# second-stage makefile.  According to the author's notes these include:
#   ptb, which converts the Penn Treebank parse trees into the various
#     formats needed by Eugene's parser, the reranker training program,
#     sparseval, etc.,
#   extract-spfeatures, which produces the feature-count files used to
#     train the reranker,
#   cvlm, which estimates the feature weights.
# NOTE(review): ESTIMATOR above points at wlle/gavper, not cvlm; confirm
# which estimators the second-stage makefile actually builds.
# The sub-make is started through $(MAKE) rather than a literal "make"
# so that flags such as -j and -n are passed down to it.
#
.PHONY: reranker
reranker: top TRAIN
	$(MAKE) -C second-stage
# EVALB has been replaced with sparseval (nearly the same features with
# fewer bugs).
#
# sparseval is a phony alias for the evaluator binary, so a stray file
# called "sparseval" in this directory cannot mask it.
#
.PHONY: sparseval
sparseval: SParseval/src/sparseval

# Fetch and unpack the SParseval sources when the SParseval directory
# does not exist yet (the archive presumably unpacks into SParseval/).
#
SParseval:
	wget http://old-site.clsp.jhu.edu/ws2005/groups/eventdetect/files/SParseval.tgz
	tar xvzf SParseval.tgz
	rm SParseval.tgz

# The source directory is an order-only prerequisite (after the "|"):
# it has to exist, but its timestamp must not trigger a rebuild of the
# binary.  Stale object files are removed before the sub-make runs.
#
SParseval/src/sparseval: | SParseval
	rm -f SParseval/src/*.o
	$(MAKE) -C SParseval/src sparseval
# clean removes object files: first the first-stage *.o files directly,
# then whatever the clean goals of the sub-makefiles remove.
# The sub-makes are started through $(MAKE) rather than a literal "make"
# so that flags such as -n are passed down to them.
#
.PHONY: clean
clean:
	rm -f first-stage/PARSE/*.o first-stage/TRAIN/*.o
	$(MAKE) -C first-stage/TRAIN clean
	$(MAKE) -C first-stage/PARSE clean
	$(MAKE) -C second-stage clean
# nbesttrain-clean removes the temporary directory $(TMP) used in
# constructing the 20 folds of n-best training data.  It is declared
# phony so that a file named "nbesttrain-clean" cannot stop it from
# running.
#
.PHONY: nbesttrain-clean
nbesttrain-clean:
	rm -fr $(TMP)
# train-clean gets rid of all data not essential for the reranking
# parser: the nbesttrain temporaries, the results directory, and
# whatever the second-stage train-clean goal removes.
# The sub-make is started through $(MAKE) rather than a literal "make"
# so that flags such as -n are passed down to it.
#
.PHONY: train-clean
train-clean: nbesttrain-clean
	rm -fr results
	$(MAKE) -C second-stage train-clean
# real-clean tries to get rid of all object and binary files to
# produce a version for distribution.  But Eugene writes new programs
# faster than I can make real-clean clean them up!
# The sub-make is started through $(MAKE) rather than a literal "make"
# so that flags such as -n are passed down to it.
#
.PHONY: real-clean
real-clean: clean train-clean
	rm -f first-stage/PARSE/parseIt
	$(MAKE) -C second-stage real-clean
########################################################################
# #
# nbesttrain -- Preparing the N-best training data for the reranker #
# #
########################################################################
# To build the 20-fold n-best data (stored under second-stage/nbest)
# for training the reranker, run
#
# make nbesttrain
#
# or
#
# make -j 2 nbesttrain
#
# on a multiprocessor machine
# TRAIN lists, as shell glob patterns, the trees that are divided into
# NFOLDS folds: sections 02-21 of the Penn WSJ treebank.  (This variable
# is unrelated to the phony target TRAIN.)
#
TRAIN = $(PENNWSJTREEBANK)/0[2-9]/*mrg \
        $(PENNWSJTREEBANK)/1[0-9]/*mrg \
        $(PENNWSJTREEBANK)/2[0-1]/*mrg

# NBESTDIR is the directory that holds the n-best parses for training
# the reranker.
#
NBESTDIR = second-stage/nbest/$(PARSERNICKNAME)$(NPARSES)

# NBESTFILES names every n-best file: one foldNN.gz per entry of FOLDS
# followed by one sectionNN.gz per entry of SECTIONS.
#
NBESTFILES = $(addsuffix .gz,$(addprefix $(NBESTDIR)/fold,$(FOLDS))) \
             $(addsuffix .gz,$(addprefix $(NBESTDIR)/section,$(SECTIONS)))

# nbesttrain produces all n-best files and (re)builds the parser, the
# parser training programs and ptb.
#
.PHONY: nbesttrain
nbesttrain: $(NBESTFILES)
nbesttrain: PARSE TRAIN second-stage/programs/prepare-data/ptb
# Gzip the n-best parser output of one fold into $(NBESTDIR), the
# directory from which the reranker is trained.
#
# NOTE(review): .INTERMEDIATE is documented as taking file names, so the
# "%" entry below may not act as a pattern -- confirm before relying on
# it.  The .PRECIOUS alternative is kept for reference.
#
# .PRECIOUS: $(NBESTDIR)/fold%.gz
.INTERMEDIATE: $(NBESTDIR)/fold%.gz
$(NBESTDIR)/fold%.gz: $(TMP)/fold%/$(NPARSES)best
	mkdir -p $(@D)
	gzip -c $< > $@
# The rules from here to the end of this section train the n-best
# parser and parse with it to produce the folds for training the
# reranker.
#
# n-best parse one fold: the parser is run with the fold's own DATA
# directory (first prerequisite) on the fold's yield file (second
# prerequisite), and its standard output becomes the target.
#
.INTERMEDIATE: $(TMP)/fold%/$(NPARSES)best
$(TMP)/fold%/$(NPARSES)best: $(TMP)/fold%/DATA $(TMP)/fold%/yield $(NBESTPARSER)
	$(NBESTPARSER) -l400 -K -N$(NPARSES) $</ $(word 2,$^) > $@
# Build the per-fold parser DATA directory: create it, seed it with the
# lower-case-named files from first-stage/DATA/EN, then run the trainer
# on the fold's train and dev files (first and second prerequisite).
#
# LC_COLLATE=C is assigned as a separate shell statement on purpose: the
# [a-z] glob is expanded by the shell itself, so the setting has to be
# in effect in the shell before the cp command line is expanded.
#
.INTERMEDIATE: $(TMP)/fold%/DATA
$(TMP)/fold%/DATA: $(TMP)/fold%/train $(TMP)/fold%/dev $(NBESTTRAINER)
	mkdir -p $@
	LC_COLLATE=C; cp first-stage/DATA/EN/[a-z]* $@
	$(NBESTTRAINER) $@ $< $(word 2,$^)
# Training trees for one fold.  ptb is given the number of folds (-n),
# the fold number taken from the pattern stem (-x) and -e; presumably
# -x excludes that fold from $(TRAIN) -- confirm against ptb.
#
.INTERMEDIATE: $(TMP)/fold%/train
$(TMP)/fold%/train: second-stage/programs/prepare-data/ptb
	mkdir -p $(@D)
	$< -n $(NFOLDS) -x $* -e $(TRAIN) > $@
# Dev trees for one fold.  ptb is given the number of folds (-n), the
# fold number taken from the pattern stem (-i) and -e; presumably -i
# selects only that fold of $(TRAIN) -- confirm against ptb.
#
.INTERMEDIATE: $(TMP)/fold%/dev
$(TMP)/fold%/dev: second-stage/programs/prepare-data/ptb
	mkdir -p $(@D)
	$< -n $(NFOLDS) -i $* -e $(TRAIN) > $@
# $(TMP)/fold%/DATA: $(TMP)/%/train $(TMP)/%/dev
# mkdir -p $@
# LC_COLLATE=C; cp first-stage/DATA/EN/[a-z]* $@
# first-stage/TRAIN/trainParser $@ $(@D)/train $(@D)/dev
# Parser input for one fold.  Same fold selection as the dev file
# (-n $(NFOLDS) -i <fold>), but written with ptb's -c output option;
# the result is what the n-best parser reads as its input.
#
.INTERMEDIATE: $(TMP)/fold%/yield
$(TMP)/fold%/yield: second-stage/programs/prepare-data/ptb
	mkdir -p $(@D)
	$< -n $(NFOLDS) -i $* -c $(TRAIN) > $@
# Gzip the n-best parser output of one treebank section into
# $(NBESTDIR).
#
# NOTE(review): .INTERMEDIATE is documented as taking file names, so the
# "%" entry below may not act as a pattern -- confirm before relying on
# it.  The .PRECIOUS alternative is kept for reference.
#
# .PRECIOUS: $(NBESTDIR)/section%.gz
.INTERMEDIATE: $(NBESTDIR)/section%.gz
$(NBESTDIR)/section%.gz: $(TMP)/section%/$(NPARSES)best
	mkdir -p $(@D)
	gzip -c $< > $@
# n-best parse one treebank section.  Unlike the folds, sections are
# parsed with the stock model in first-stage/DATA/EN/; the input is the
# section's yield file (first prerequisite).
#
.INTERMEDIATE: $(TMP)/section%/$(NPARSES)best
$(TMP)/section%/$(NPARSES)best: $(TMP)/section%/yield $(NBESTPARSER)
	$(NBESTPARSER) -l400 -K -N$(NPARSES) first-stage/DATA/EN/ $< > $@
# Parser input for one treebank section: ptb -c is run over the
# wsj*.mrg files of the section whose number is the pattern stem.
#
.INTERMEDIATE: $(TMP)/section%/yield
$(TMP)/section%/yield: second-stage/programs/prepare-data/ptb
	mkdir -p $(@D)
	$< -c $(PENNWSJTREEBANK)/$*/wsj*.mrg > $@
########################################################################
# #
# nbest oracle evaluation #
# #
########################################################################
# nbest-oracle runs oracle-score (first prerequisite) three times: on
# all folds against the gold trees of $(TRAIN), then on section 22 and
# on section 24 against their own gold trees.  Each call receives two
# quoted command lines: one that prints the n-best parses and one that
# runs ptb -g (ptb is the second prerequisite) over the matching
# treebank files.
#
.PHONY: nbest-oracle
nbest-oracle: second-stage/programs/features/oracle-score second-stage/programs/prepare-data/ptb $(NBESTFILES)
	$< "$(ZCAT) $(NBESTDIR)/fold[0-1][0-9].gz" "$(word 2,$^) -g $(TRAIN)"
	$< "$(ZCAT) $(NBESTDIR)/section22.gz" "$(word 2,$^) -g $(PENNWSJTREEBANK)/22/wsj*.mrg"
	$< "$(ZCAT) $(NBESTDIR)/section24.gz" "$(word 2,$^) -g $(PENNWSJTREEBANK)/24/wsj*.mrg"
# nbest-oracle-detailed makes the same three calls as nbest-oracle (all
# folds, section 22, section 24) but with eval-beam/main (first
# prerequisite) instead of oracle-score.  ptb is the second
# prerequisite.
#
.PHONY: nbest-oracle-detailed
nbest-oracle-detailed: second-stage/programs/eval-beam/main second-stage/programs/prepare-data/ptb $(NBESTFILES)
	$< "$(ZCAT) $(NBESTDIR)/fold[0-1][0-9].gz" "$(word 2,$^) -g $(TRAIN)"
	$< "$(ZCAT) $(NBESTDIR)/section22.gz" "$(word 2,$^) -g $(PENNWSJTREEBANK)/22/wsj*.mrg"
	$< "$(ZCAT) $(NBESTDIR)/section24.gz" "$(word 2,$^) -g $(PENNWSJTREEBANK)/24/wsj*.mrg"
########################################################################
# #
# extract-features extracts feature counts for training reranker #
# #
########################################################################
# FEATBASEDIR is the feature-count directory name without the
# $(VERSION) suffix; FEATDIR, with the suffix appended, is where the
# feature counts are saved.
#
FEATBASEDIR = second-stage/features/$(PARSERNICKNAME)$(NPARSES)$(FEATURESNICKNAME)
FEATDIR = $(FEATBASEDIR)$(VERSION)

# MODELBASEDIR is the model directory name without the $(VERSION)
# suffix; MODELDIR, with the suffix appended, is where the features and
# the feature weights are saved.
#
MODELBASEDIR = second-stage/models/$(PARSERNICKNAME)$(NPARSES)$(FEATURESNICKNAME)
MODELDIR = $(MODELBASEDIR)$(VERSION)

# features extracts the feature list and the four feature-count files
# for the data split selected by VERSION.
#
.PHONY: features
features: $(MODELDIR)/features.gz
features: $(FEATDIR)/train.gz $(FEATDIR)/dev.gz $(FEATDIR)/test1.gz $(FEATDIR)/test2.gz
# Feature extraction for reranker training, nonfinal case: train is
# folds 00-17, dev is folds 18-19, test1 is section 22 and test2 is
# section 24.
#
# The extractor is given one argument triple per data set: a command
# that prints the n-best parses, a command that prints the gold trees,
# and the feature-count file to write.  Its standard output is gzipped
# into features.gz.
#
# A single extractor run writes all five files.  Only features.gz
# carries the recipe; the four count files depend on it through an
# empty recipe.  Listing all five as targets of one recipe would define
# five independent rules, and under "make -j" the extractor could then
# run several times at once, all writing the same files.
# To force re-extraction, delete features.gz (deleting only a count file
# does not rerun the extractor).
#
# NOTE(review): the exit status of the pipeline is that of gzip, so a
# failing extractor may go unnoticed.
#
$(MODELBASEDIR)nonfinal/features.gz: second-stage/programs/prepare-data/ptb $(FEATUREEXTRACTOR) $(NBESTFILES)
	mkdir -p $(FEATBASEDIR)nonfinal
	mkdir -p $(MODELBASEDIR)nonfinal
	$(FEATUREEXTRACTOR) $(FEATUREEXTRACTORFLAGS) \
	"$(ZCAT) $(NBESTDIR)/fold0[0-9].gz $(NBESTDIR)/fold1[0-7].gz" \
	"second-stage/programs/prepare-data/ptb -g -n 10 -x 9 $(TRAIN)" \
	$(FEATBASEDIR)nonfinal/train.gz \
	"$(ZCAT) $(NBESTDIR)/fold1[8-9].gz" \
	"second-stage/programs/prepare-data/ptb -g -n 10 -i 9 $(TRAIN)" \
	$(FEATBASEDIR)nonfinal/dev.gz \
	"$(ZCAT) $(NBESTDIR)/section22.gz" \
	"second-stage/programs/prepare-data/ptb -g $(PENNWSJTREEBANK)/22/*mrg" \
	$(FEATBASEDIR)nonfinal/test1.gz \
	"$(ZCAT) $(NBESTDIR)/section24.gz" \
	"second-stage/programs/prepare-data/ptb -g $(PENNWSJTREEBANK)/24/*mrg" \
	$(FEATBASEDIR)nonfinal/test2.gz \
	| gzip > $(MODELBASEDIR)nonfinal/features.gz

# The count files are by-products of the recipe above; the trailing ";"
# is an empty recipe and must stay.
$(FEATBASEDIR)nonfinal/train.gz $(FEATBASEDIR)nonfinal/dev.gz $(FEATBASEDIR)nonfinal/test1.gz $(FEATBASEDIR)nonfinal/test2.gz: $(MODELBASEDIR)nonfinal/features.gz ;
# Feature extraction for reranker training, final case: train is all
# folds (00-19), dev is section 24, test1 is section 22 and test2 is
# section 23.
#
# The extractor is given one argument triple per data set: a command
# that prints the n-best parses, a command that prints the gold trees,
# and the feature-count file to write.  Its standard output is gzipped
# into features.gz.
#
# A single extractor run writes all five files.  Only features.gz
# carries the recipe; the four count files depend on it through an
# empty recipe.  Listing all five as targets of one recipe would define
# five independent rules, and under "make -j" the extractor could then
# run several times at once, all writing the same files.
# To force re-extraction, delete features.gz (deleting only a count file
# does not rerun the extractor).
#
# NOTE(review): the exit status of the pipeline is that of gzip, so a
# failing extractor may go unnoticed.
#
$(MODELBASEDIR)final/features.gz: second-stage/programs/prepare-data/ptb $(FEATUREEXTRACTOR) $(NBESTFILES)
	mkdir -p $(FEATBASEDIR)final
	mkdir -p $(MODELBASEDIR)final
	$(FEATUREEXTRACTOR) $(FEATUREEXTRACTORFLAGS) \
	"$(ZCAT) $(NBESTDIR)/fold*.gz" \
	"second-stage/programs/prepare-data/ptb -g $(TRAIN)" \
	$(FEATBASEDIR)final/train.gz \
	"$(ZCAT) $(NBESTDIR)/section22.gz" \
	"second-stage/programs/prepare-data/ptb -g $(PENNWSJTREEBANK)/22/*mrg" \
	$(FEATBASEDIR)final/test1.gz \
	"$(ZCAT) $(NBESTDIR)/section23.gz" \
	"second-stage/programs/prepare-data/ptb -g $(PENNWSJTREEBANK)/23/*mrg" \
	$(FEATBASEDIR)final/test2.gz \
	"$(ZCAT) $(NBESTDIR)/section24.gz" \
	"second-stage/programs/prepare-data/ptb -g $(PENNWSJTREEBANK)/24/*mrg" \
	$(FEATBASEDIR)final/dev.gz \
	| gzip > $(MODELBASEDIR)final/features.gz

# The count files are by-products of the recipe above; the trailing ";"
# is an empty recipe and must stay.
$(FEATBASEDIR)final/train.gz $(FEATBASEDIR)final/dev.gz $(FEATBASEDIR)final/test1.gz $(FEATBASEDIR)final/test2.gz: $(MODELBASEDIR)final/features.gz ;
########################################################################
# #
# train-reranker estimates the reranker feature weights #
# #
########################################################################
# WEIGHTSFILE is the uncompressed weights file the estimator writes;
# WEIGHTSFILEGZ is its gzipped form, which is what the rest of this
# makefile depends on.
#
WEIGHTSFILE = $(MODELDIR)/$(ESTIMATORNICKNAME)-weights
WEIGHTSFILEGZ = $(addsuffix .gz,$(WEIGHTSFILE))

# train-reranker estimates the feature weights.
#
.PHONY: train-reranker
train-reranker: $(WEIGHTSFILEGZ)
# Estimate the reranker feature weights (i.e., train the reranker).
# This would not be hard to parallelize, but that has not been done yet.
#
# The estimator (first prerequisite) reads the decompressed training
# counts on standard input, is pointed at the dev counts (-e), the
# feature list (-f) and the test1 counts (-x), and writes the weights
# to $(WEIGHTSFILE) (-o).  Any previous compressed weights file is then
# removed and the new weights are gzipped in its place.
#
# $(WEIGHTSFILEGZ): $(ESTIMATOR)
$(WEIGHTSFILEGZ): $(ESTIMATOR) $(MODELDIR)/features.gz $(FEATDIR)/train.gz $(FEATDIR)/dev.gz $(FEATDIR)/test1.gz
	$(ZCAT) $(FEATDIR)/train.gz \
	| $< $(ESTIMATORFLAGS) \
	-e $(FEATDIR)/dev.gz \
	-f $(MODELDIR)/features.gz \
	-o $(WEIGHTSFILE) \
	-x $(FEATDIR)/test1.gz
	rm -f $@
	gzip $(WEIGHTSFILE)
########################################################################
# #
# eval-reranker evaluates the reranker on the two test data sets #
# #
########################################################################
# EVALDIR is where evaluation output is written; its name combines the
# parser, n-best size, feature set, version and estimator nicknames.
#
EVALDIR = second-stage/eval/$(PARSERNICKNAME)$(NPARSES)$(FEATURESNICKNAME)$(VERSION)-$(ESTIMATORNICKNAME)

# eval-reranker produces the weights evaluation.
# $(EVALDIR)/dev-parsediffs.gz is deliberately not a prerequisite at
# present.
#
.PHONY: eval-reranker
eval-reranker: $(EVALDIR)/weights-eval
# Evaluate the estimated weights on dev, test1 and test2, in that
# order.  The decompressed weights file (first prerequisite) is piped
# into eval-weights once per data set; the first run creates the target
# and the other two append to it.  EVALWEIGHTSARGS is not set in this
# file and may be supplied by the user.
#
$(EVALDIR)/weights-eval: $(WEIGHTSFILEGZ) $(MODELDIR)/features.gz $(FEATDIR)/dev.gz $(FEATDIR)/test1.gz $(FEATDIR)/test2.gz second-stage/programs/eval-weights/eval-weights
	mkdir -p $(@D)
	$(ZCAT) $< | second-stage/programs/eval-weights/eval-weights $(EVALWEIGHTSARGS) $(MODELDIR)/features.gz $(FEATDIR)/dev.gz > $@
	$(ZCAT) $< | second-stage/programs/eval-weights/eval-weights $(EVALWEIGHTSARGS) $(MODELDIR)/features.gz $(FEATDIR)/test1.gz >> $@
	$(ZCAT) $< | second-stage/programs/eval-weights/eval-weights $(EVALWEIGHTSARGS) $(MODELDIR)/features.gz $(FEATDIR)/test2.gz >> $@
# Produce a gzipped pretty-printed report: the weights are piped through
# best-indices (with the test1 feature counts), best-parses (with the
# section 24 n-best parses) and pretty-print -d.
#
# The output directory is created first, because this target can be
# requested on its own, before the weights-eval rule has made $(EVALDIR).
#
# NOTE(review): test1.gz is extracted from section 22 in both VERSIONs,
# yet it is paired here with the section 24 n-best parses -- confirm
# which feature file was intended.
#
$(EVALDIR)/dev-parsediffs.gz: $(WEIGHTSFILEGZ) $(FEATDIR)/test1.gz $(NBESTDIR)/section24.gz second-stage/programs/eval-weights/best-indices second-stage/programs/eval-weights/best-parses second-stage/programs/eval-weights/pretty-print
	mkdir -p $(@D)
	$(ZCAT) $(WEIGHTSFILEGZ) \
	| second-stage/programs/eval-weights/best-indices $(FEATDIR)/test1.gz \
	| second-stage/programs/eval-weights/best-parses $(NBESTDIR)/section24.gz \
	| second-stage/programs/eval-weights/pretty-print -d \
	| gzip > $(EVALDIR)/dev-parsediffs.gz
Jump to Line
Something went wrong with that request. Please try again.