diff --git a/AMR_FEATURE/.classpath b/AMR_FEATURE/.classpath
new file mode 100644
index 0000000..e910e9c
--- /dev/null
+++ b/AMR_FEATURE/.classpath
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
+	<classpathentry kind="lib" path="src/json-20170516.jar"/>
+	<!-- the Stanford CoreNLP jars are also required on the build path; this entry is a placeholder -->
+	<classpathentry kind="lib" path="stanford-corenlp.jar"/>
+	<classpathentry kind="output" path="bin"/>
+</classpath>
diff --git a/AMR_FEATURE/.project b/AMR_FEATURE/.project
new file mode 100644
index 0000000..b04548c
--- /dev/null
+++ b/AMR_FEATURE/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>AMR_FEATURE</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/AMR_FEATURE/.settings/org.eclipse.jdt.core.prefs b/AMR_FEATURE/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..3a21537
--- /dev/null
+++ b/AMR_FEATURE/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,11 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.8
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.8
diff --git a/AMR_FEATURE/bin/convertingAMR.class b/AMR_FEATURE/bin/convertingAMR.class
new file mode 100644
index 0000000..0e63a79
Binary files /dev/null and b/AMR_FEATURE/bin/convertingAMR.class differ
diff --git a/AMR_FEATURE/bin/json-20170516.jar b/AMR_FEATURE/bin/json-20170516.jar
new file mode 100644
index 0000000..5033f66
Binary files /dev/null and b/AMR_FEATURE/bin/json-20170516.jar differ
diff --git a/AMR_FEATURE/joints.txt b/AMR_FEATURE/joints.txt
new file mode 100644
index 0000000..78be015
--- /dev/null
+++ b/AMR_FEATURE/joints.txt
@@ -0,0 +1,922 @@
+have to
+at all
+so far
+more than
+less than
+no one
+as well
+at least
+right wing
+left wing
+as long as
+all over
+of course
+kind of
+after all
+by oneself
+by the way
+in fact
+be all
+head up
+come out
+coop up
+seize up
+bust up
+hang out
+limber up
+quieten down
+crack up
+fuck up
+get out
+clear out
+rip up
+rock on
+shout down
+bundle up
+pump up
+smooth out
+set down
+drop off
+think over
+core out
+tidy up
+make off
+fight on
+set out
+think up
+try out
+sign in
+take out
+top off
+nail down
+block up
+cash in
+fork out
+mark down
+rattle off
+bandage up
+sleep over
+patch up
+freeze over
+seal off
+free up
+clown around
+tear down
+dust off
+live up
+cut loose
+louse up
+sit down
+stand by
+take up
+steal away
+lay off
+turn in
+meet up
+check up
+taper off
+dole out
+catch up
+shape up
+tax away
+pass off
+give in
+speak up
+call upon
+stall out
+butt in
+carve out
+step up
+trigger off
+prop up
+scoop up
+summon forth
+boss around
+cool down
+give back
+cut down
+jot down
+doze off
+drum up
+bog down
+throw out
+shy away
+frost over
+rack up
+even out
+light up
+shack up
+bone up
+cut out
+sum up
+shut up
+send out
+pine away
+take over
+gobble up
+shoot back
+lay on
+swear off
+spread out
+pin down
+find out
+drag on
+thaw out
+bump off
+fatten up
+get back
+arm up
+load up
+give vent
+top up
+bounce back
+bad off
+come by
+single out
+call out
+slow down
+ask out
+slice up
+roll up
+divide up
+hold over
+touch off
+pass out
+have mod
+screw up
+iron out
+tell on
+dry out
+zero out
+rev up
+request confirmation
+scrawl out
+tie in
+pass up
+scratch out
+miss out
+root out
+frighten off
+have subevent
+go on
+follow through
+lighten up
+trade off
+carry over
+pay out
+mellow out
+fool around
+get down
+stretch out
+run down
+scrub up
+splash out
+stop by
+touch upon
+dig out
+stick around
+act out
+pass by
+watch out
+share out
+shut out
+get along
+go through
+tease out
+kill off
+slug out
+bottom out
+tie down
+neaten up
+dress down
+turn off
+bandy around
+yammer away
+gulp down
+cut back
+chatter away
+glaze over
+drop by
+slack off
+fess up
+seek out
+creep out
+hold up
+knock up
+shine through
+fence off
+zero in
+flip out
+rein in
+screen out
+cheer up
+saw up
+sign off
+flatten out
+heat up
+add on
+clip off
+doll up
+touch on
+fall off
+suit up
+palm off
+mist over
+flesh out
+burn up
+sweat out
+work up
+brazen out
+peel off
+pay up
+get even
+fill out
+whip up
+shout out
+kick in
+draw up
+thrash out
+head off
+come in
+break up
+speed up
+spout off
+type up
+polish off
+trot out
+puke up
+bank up
+rip off
+dry up
+settle down
+cry out
+go out
+face off
+ride up
+buckle up
+pair up
+come off
+auction off
+roll back
+throw in
+eat up
+suck up
+shut down
+wipe out
+nod off
+choke off
+sleep off
+stand up
+frost up
+join in
+mix up
+crisp up
+knock out
+talk out
+set off
+sit in
+bang on
+flake out
+take off
+queue up
+square off
+make over
+ramp up
+let down
+toss out
+finish up
+blow over
+sound off
+cut up
+rough in
+blot out
+stave off
+stop off
+act up
+scout out
+pay off
+beat out
+copy out
+wolf down
+have manner
+get through
+break off
+drug up
+pump out
+take hold
+polish up
+pucker up
+write off
+shell out
+come over
+color in
+tamp down
+shut off
+have mode
+strike up
+beat up
+sweep up
+come up
+blast off
+lie in
+warm over
+ratchet up
+bump up
+play out
+look out
+tip over
+fudge over
+warm up
+throw away
+crank up
+tip off
+have quant
+go back
+roll out
+trim down
+set up
+rake in
+piss off
+give over
+buoy up
+pen up
+touch up
+parcel out
+boom out
+give off
+jump up
+leave over
+tone down
+dream on
+lock in
+win over
+stop over
+turn over
+play on
+edge out
+get up
+leave off
+finish off
+slim down
+wall off
+puff up
+plug up
+write out
+let out
+stop up
+calm down
+bring about
+phase out
+belly up
+break down
+stick up
+lock up
+pull out
+set upon
+jet off
+pay down
+fart around
+zone out
+bear out
+take away
+bleed off
+write up
+lash out
+lam out
+tie up
+siphon off
+dress up
+stamp out
+black out
+snuff out
+whip out
+go off
+ease up
+tune out
+gun down
+freak out
+chop down
+strip away
+step down
+hit up
+read up
+chew up
+start out
+own up
+close down
+come upon
+cone down
+yield up
+get away
+gear up
+bring on
+figure out
+turn up
+check out
+bead up
+ship out
+crank out
+flush out
+let on
+put on
+usher in
+spin off
+knock off
+skim off
+pass on
+finish out
+instead of
+leave out
+frighten away
+buy up
+knock over
+straighten out
+wear off
+whiz away
+call on
+put out
+totter around
+salt away
+spell out
+creep up
+hold out
+sign up
+branch out
+mark up
+hail down
+pick out
+shoot off
+din out
+beef up
+get off
+break through
+smarten up
+help out
+buy out
+stake out
+take in
+do in
+come to
+sell out
+shore up
+hem in
+hang up
+boil over
+sort out
+wipe up
+curl up
+whack off
+track down
+dig up
+run out
+haul out
+plot out
+loan out
+coil up
+die off
+pipe down
+kick off
+come through
+print out
+pick away
+gloss over
+ring up
+go down
+read off
+pitch in
+choke up
+break in
+crack down
+boot up
+blurt out
+sluice down
+fill up
+spring up
+lock out
+pack up
+look over
+whittle down
+chicken out
+bandy about
+cart off
+plug in
+buy off
+pick on
+crash out
+total up
+pile on
+pan out
+prick up
+dish up
+stash away
+round up
+shoot up
+balance out
+bring along
+quiet down
+cut off
+vamp up
+run off
+pull down
+team up
+hold back
+hammer out
+stack up
+think through
+match up
+rise up
+have concession
+wipe off
+hash out
+come down
+sock away
+jump in
+hang on
+ferret out
+wake up
+brick over
+burst out
+tack down
+spike out
+use up
+carry on
+bottle up
+tighten up
+start up
+carry off
+speak out
+set about
+tag along
+hook up
+oil up
+fend off
+start over
+sit up
+sign on
+take down
+study up
+while away
+fold up
+cheer on
+bust out
+rate entity
+play down
+book up
+bind up
+stay on
+come about
+put up
+dine out
+have frequency
+store up
+give up
+vote down
+bring up
+tape up
+leave behind
+turn on
+save up
+break out
+wash up
+fork over
+hollow out
+freshen up
+screw over
+dash off
+have part
+mess up
+buy into
+burn out
+cave in
+lead up
+clear up
+cry down
+stand out
+turn away
+drown out
+run in
+cover up
+spill over
+die out
+farm out
+hand over
+poke around
+ride out
+come across
+give away
+tack on
+bow out
+squeeze out
+write in
+show up
+come on
+fix up
+sew up
+fort up
+do away
+liven up
+scrunch up
+log on
+ham up
+look down
+firm up
+tally up
+tool up
+weigh in
+flare up
+strike down
+thin out
+blast away
+reel off
+feed up
+camp out
+well off
+crop up
+be like
+open up
+link up
+lick up
+look up
+statistical test
+charge off
+drop out
+keep up
+tick off
+tune in
+write down
+bat in
+stay over
+gas up
+pick up
+cook up
+boil down
+pull through
+call off
+pop off
+hand out
+push up
+fritter away
+trail off
+chop up
+rear end
+fuck around
+rattle on
+tire out
+street address
+keep on
+pack away
+keg stand
+close off
+lose out
+wring out
+make believe
+soak up
+tee off
+shake up
+scent out
+steer clear
+have instrument
+tear up
+feel up
+live down
+bowl over
+step in
+hobnob around
+bow down
+buzz off
+tangle up
+catch on
+price out
+snap up
+live out
+touch base
+be done
+have li
+vomit up
+clean out
+laid back
+buckle down
+slip in
+swear in
+stall off
+shoot down
+be from
+serve up
+join up
+back up
+well up
+pull up
+put down
+wash down
+dish out
+age out
+fight back
+bring down
+run up
+zip up
+switch over
+spend down
+call up
+be polite
+pop up
+fall apart
+net out
+jut out
+wind up
+rent out
+cross out
+rough up
+broke ass
+dredge up
+wait out
+shuffle off
+build up
+box in
+shake off
+cool off
+get on
+hit on
+straighten up
+start off
+belch out
+lie down
+play up
+give out
+haul in
+hard put
+make up
+snap off
+follow suit
+pass away
+smooth over
+hole up
+turn out
+clog up
+sober up
+smash up
+contract out
+go over
+dope up
+bed down
+sit out
+hype up
+drop in
+put off
+ward off
+get together
+turn down
+back off
+swoop up
+out trade
+size up
+pull off
+conjure up
+stock up
+sleep away
+monkey around
+break away
+pile up
+put in
+dream up
+wrap up
+gum up
+bound up
+tuck away
+board up
+have purpose
+stick out
+fall out
+take aback
+chart out
+latch on
+belt out
+wear on
+muck up
+step aside
+lead off
+point out
+line up
+check in
+start in
+bunch up
+watch over
+fill in
+work out
+joke around
+hum along
+lock down
+wear out
+rip out
+bleed out
+come along
+play off
+show off
+have extent
+concrete over
+narrow down
+jack up
+stare down
+pipe up
+loosen up
+wear down
+bear up
+cover over
+have polarity
+mic up
+make do
+close over
+deck out
+blow out
+play to
+hammer away
+ration out
+sell off
+have name
+strike out
+shuttle off
+call in
+shrug off
+chalk up
+perk up
+knock down
+follow up
+pass over
+brush off
+drink up
+fly out
+close in
+grow up
+eat away
+have condition
+snatch away
+pick off
+stress out
+take on
+muddle up
+tuck in
+live on
+skip off
+look forward
+stir up
+bail out
+stand down
+close up
+run over
+throw up
+fuck off
+swallow up
+spill out
+fall back
+fight off
+rig up
+sweat off
+hide out
+divvy up
+flash back
+end up
+make it
+toss in
+round out
+sniff out
+grind up
+chip in
+cough up
+phase in
+let up
+water down
+hold on
+level off
+have value
+fit in
+yammer on
+key in
+hold off
+silt up
+get by
+split up
+make out
+look after
+rubber stamp
+sketch out
+pull over
+spruce up
+glass over
+add up
+mist up
+brush up
+wind down
+clutch on
+knock back
+pare down
+rule out
+fall through
+hack away
+asphalt over
+clean up
+pound out
+die down
+carry out
+fall over
+blow up
+weasel out
+break even
diff --git a/AMR_FEATURE/src/convertingAMR.java b/AMR_FEATURE/src/convertingAMR.java
new file mode 100644
index 0000000..56777f0
--- /dev/null
+++ b/AMR_FEATURE/src/convertingAMR.java
@@ -0,0 +1,493 @@
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.json.JSONArray;
+import org.json.JSONObject;
+
+import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.PropertiesUtils;
+
+public class convertingAMR {
+ private Map<String, Set<String>> map;
+
+ /* Initialize from joints.txt, which lists the phrases to be greedily connected
+ * into single hyphenated tokens (e.g. "make up" -> "make-up"); the list was
+ * extracted from the AMR PropBank frames and the training set.
+ */
+ public convertingAMR(String file) {
+ map = new HashMap<String, Set<String>>();
+ Set<String> tmp;
+
+ try (FileInputStream fis = new FileInputStream(file);
+ BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));) {
+
+ String line;
+ String[] pair;
+ while ((line = br.readLine()) != null) {
+ pair = line.split(" ");
+ String past = "";
+ for (int i = 0; i < pair.length - 1; i++) {
+ past += pair[i] + " ";
+ String key = past.trim().replace(" ", "-");
+ tmp = map.getOrDefault(key, new HashSet<String>());
+ tmp.add(pair[i + 1]);
+ map.put(key, tmp);
+
+ }
+
+ }
+ // System.out.println(map.toString());
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static List<String> folderToFilesPath(String folder, String suffix) {
+ List<String> results = new ArrayList<String>();
+
+ File[] files = new File(folder).listFiles();
+ // listFiles() returns null if this pathname does not denote a directory
+ if (files == null)
+ return results;
+
+ for (File file : files) {
+ if (file.isFile() && file.getName().endsWith(suffix)) {
+ results.add(folder + file.getName());
+ }
+ }
+ return results;
+ }
+
+ public void featureExtract(String file) {
+
+ // build pipeline
+ StanfordCoreNLP pipeline = new StanfordCoreNLP(
+ PropertiesUtils.asProperties(
+ "annotators", "tokenize,ssplit,pos,lemma,ner",
+ "tokenize.options", "splitHyphenated=true",
+ "tokenize.whitespace", "true", // start from an already tokenized file
+ "ssplit.isOneSentence", "true", // ignore multi-sentence constructions
+ "tokenize.language", "en"));
+
+ String[] name = file.split("/");
+
+ String line = "";
+ try (FileInputStream fis = new FileInputStream(file);
+ BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));) {
+
+ System.out.println(name[name.length - 1]);
+ int n = 0;
+ int changed = 0;
+ List<JSONObject> obs = new ArrayList<JSONObject>();
+ line = br.readLine();
+ while (line != null && !line.trim().isEmpty()) {
+ // if (n % 2 == 0)
+ // System.out.println(n+"\n"+line);
+ n++;
+ JSONObject obj = new JSONObject();
+ StringBuilder pre = new StringBuilder();
+ while (!line.startsWith("# ::tok ") && !line.startsWith("# ::snt ")) {
+ pre.append(line + "\n");
+ line = br.readLine();
+ }
+ obj.put("pre", pre.toString());
+
+ // rebuild the sentence without buggy text (dot runs, repeated spaces, quote/bracket spacing)
+ String snt = line.replace("# ::tok ", "").replace("# ::snt ", "");
+ snt = snt.replaceAll("\\.{2,}", "").replaceAll(" ", " ");
+ snt = snt.replace(" ", " ").replace(" ", " ").replace("\n", "");
+ snt = snt.replaceAll("\"", " \" ");
+ snt = snt.replaceAll("\\(", " \\( ");
+ snt = snt.replaceAll("\\)", " \\) ");
+ snt = snt.replaceAll("@-@", "-").replaceAll(" @:@ ", ":");
+ obj.put("snt", snt);
+
+ //initial feature extraction and connecting of phrases
+ HashMap<String, LinkedList<String>> data = extractSentence(obj, pipeline, true);
+ //connect numbers split across multiple CD tokens
+ changed += post_process_number(data);
+ //connect NER spans, mainly due to "-" and "'s" constructions in AMR NER
+ changed += post_process_ner(data);
+ obj.put("ner", data.get("ner"));
+ obj.put("lem", data.get("lem"));
+ obj.put("tok", data.get("tok"));
+ obj.put("pos", data.get("pos"));
+ obs.add(obj);
+
+ if (obs.size() % 500 == 0) {
+ System.out.println(obs.size() + " " + name[name.length - 1]);
+ obj.keys().forEachRemaining(k -> {
+ System.out.println(k + ": " + obj.get(k));
+ });
+ }
+
+ // read the rest of the block, e.g. the AMR graph
+ StringBuilder post = new StringBuilder();
+ line = br.readLine();
+ while (line != null && !line.trim().isEmpty()) {
+ post.append(line + "\n");
+ line = br.readLine();
+ }
+
+ obj.put("post", post.toString());
+
+ while (line != null && line.trim().isEmpty()) {
+ line = br.readLine();
+ }
+
+ }
+ System.out.println("\n" + name[name.length - 1] + " done. Total sentences: " + obs.size() + "\n");
+ System.out.println("\n" + changed + " changed." + "\n");
+ String out = obs.stream().map(obj -> writeObject(obj)).collect(Collectors.joining("\n"));
+ Files.write(Paths.get(file.replaceAll(".txt(_[a-z]*)*", ".txt_pre_processed")), out.getBytes());
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.out.println(file + " null pointer??");
+ System.out.println(line + " null pointer??");
+ e.printStackTrace();
+ }
+ }
+
+ // same as featureExtract, but for files containing sentences only
+ public void featureExtractSentenceOnly(String file) {
+
+ StanfordCoreNLP pipeline = new StanfordCoreNLP(
+ PropertiesUtils.asProperties(
+ "annotators", "tokenize,ssplit,pos,lemma,ner",
+ "tokenize.options", "splitHyphenated=true",
+ "tokenize.whitespace", "true",
+ "ssplit.isOneSentence", "true",
+ "tokenize.language", "en"));
+ String[] name = file.split("/");
+
+ String line = "";
+ try (FileInputStream fis = new FileInputStream(file);
+ BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));) {
+
+ System.out.println(name[name.length - 1]);
+ int n = 0;
+ int changed = 0;
+ List<JSONObject> obs = new ArrayList<JSONObject>();
+ line = br.readLine();
+ while (line != null && !line.trim().isEmpty()) {
+ // if (n % 2 == 0)
+ // System.out.println(n+"\n"+line);
+ n++;
+ JSONObject obj = new JSONObject();
+ StringBuilder pre = new StringBuilder();
+ obj.put("pre", pre.toString());
+
+ // strip the "# ::tok"/"# ::snt" prefix; the cleanup used in featureExtract is disabled below
+ String snt = line.replace("# ::tok ", "").replace("# ::snt ", "");
+ /* if (snt.startsWith("the ones who are suffering are the ordinary people :")) {
+ System.out.println("!!!!\n"+snt+"\n!!!!");
+ }
+ snt = snt.replaceAll("\\.{2,}", "").replaceAll(" ", " ");
+ if (snt.startsWith("the ones who are suffering are the ordinary people :")) {
+ System.out.println("!!!!\n"+snt+"\n!!!!");
+ }
+ snt = snt.replace(" ", " ").replace(" ", " ").replace("\n", "");
+ snt = snt.replaceAll("\"", " \" ");
+ snt = snt.replaceAll("\\(", " \\( ");
+ snt = snt.replaceAll("\\)", " \\) ");
+ snt = snt.replaceAll("@-@", "-").replaceAll(" @:@ ", ":");*/
+ obj.put("snt", snt);
+
+ //feature extraction and connecting of phrases, without changing the tokenization
+ HashMap<String, LinkedList<String>> data = extractSentence(obj, pipeline, false);
+ if (snt.startsWith("the ones who are suffering are the ordinary people :")) {
+ System.out.println("!!!!\n"+snt+"\n!!!!");
+ System.out.println( data.get("tok"));
+ }
+ obj.put("ner", data.get("ner"));
+ obj.put("lem", data.get("lem"));
+ obj.put("tok", data.get("tok"));
+ obj.put("pos", data.get("pos"));
+ obs.add(obj);
+
+ if (obs.size() % 500 == 0) {
+ System.out.println(obs.size() + " " + name[name.length - 1]);
+ obj.keys().forEachRemaining(k -> {
+ System.out.println(k + ": " + obj.get(k));
+ });
+ }
+
+ StringBuilder post = new StringBuilder();
+ obj.put("post", post.toString());
+
+ line = br.readLine();
+ while (line != null && line.trim().isEmpty()) {
+ line = br.readLine();
+ }
+
+ }
+ System.out.println("\n" + name[name.length - 1] + " done. Total sentences: " + obs.size() + "\n");
+ System.out.println("\n" + changed + " changed." + "\n");
+ String out = obs.stream().map(obj -> writeObject(obj)).collect(Collectors.joining("\n"));
+ Files.write(Paths.get(file.replaceAll(".txt(_[a-z]*)*", ".txt_processed")), out.getBytes());
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.out.println(file + " null pointer??");
+ System.out.println(line + " null pointer??");
+ e.printStackTrace();
+ }
+ }
+
+ public String jsonArrayToWhiteSpaceString(JSONArray arr) {
+ StringBuilder out = new StringBuilder();
+ for (Object s : arr) {
+ out.append("\t").append(s.toString());
+ }
+ return out.substring(1);
+ }
+
+ // serialize one sentence back to the text format: "pre" block, tab-separated "# ::tok/lem/pos/ner" lines, then the "post" block
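+ // e.g. produces lines like:
+ // # ::tok	The	boy	wants	to	go
+ // # ::lem	the	boy	want	to	go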
+ public String writeObject(JSONObject obj) {
+ StringBuilder out = new StringBuilder();
+ if (!obj.getString("pre").trim().isEmpty())
+ out.append(obj.getString("pre"));
+ String tok = jsonArrayToWhiteSpaceString(obj.getJSONArray("tok"));
+ out.append("# ::tok\t" + tok + "\n");
+ String lemma = jsonArrayToWhiteSpaceString(obj.getJSONArray("lem"));
+ out.append("# ::lem\t" + lemma + "\n");
+ String pos = jsonArrayToWhiteSpaceString(obj.getJSONArray("pos"));
+ out.append("# ::pos\t" + pos + "\n");
+ String ner = jsonArrayToWhiteSpaceString(obj.getJSONArray("ner"));
+ out.append("# ::ner\t" + ner + "\n");
+
+ assert tok.split(" ").length == lemma.split(" ").length;
+ assert tok.split(" ").length == pos.split(" ").length;
+ assert tok.split(" ").length == ner.split(" ").length;
+
+ if (!obj.getString("post").trim().isEmpty())
+ out.append(obj.getString("post") + "\n");
+
+ return out.toString();
+ }
+
+ public volatile int positive = 0;
+ public volatile int truth = 0;
+ public volatile int truth_positive = 0;
+
+ public HashMap<String, LinkedList<String>> extractSentence(JSONObject obj, StanfordCoreNLP pipeline, boolean retoken) {
+ String text = obj.getString("snt");
+ obj.put("snt", text);
+ // create an empty Annotation just with the given text
+
+ Annotation sent = new Annotation(text);
+ HashMap<String, LinkedList<String>> data = new HashMap<String, LinkedList<String>>();
+ // run all Annotators on this text
+ pipeline.annotate(sent);
+ LinkedList<String> lemma = new LinkedList<String>();
+ LinkedList<String> tok = new LinkedList<String>();
+ LinkedList<String> ner = new LinkedList<String>();
+ LinkedList<String> pos = new LinkedList<String>();
+ String p_l = "";
+ String p_s = "";
+ String p_n = "";
+ String p_p = "";
+ int changed = 0;
+
+ List<CoreMap> sentences = sent.get(SentencesAnnotation.class);
+ Set<String> tmp = new HashSet<String>();
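+ // Greedy phrase joining: when the previous lemma can be extended by the current
+ // one according to the joints map (with a one-token lookahead to handle
+ // three-word phrases), the two are merged into a single hyphenated token
+ // tagged POS "COMP", e.g. "take" + "off" -> "take-off".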
+ for (CoreMap sentence : sentences) {
+
+ for (int i = 0; i < sentence.get(TokensAnnotation.class).size(); i++) {
+ CoreLabel token = sentence.get(TokensAnnotation.class).get(i);
+ if (retoken &&tmp.contains(token.get(LemmaAnnotation.class))
+ && (!map.containsKey(lemma.getLast() + "-" + token.get(LemmaAnnotation.class)) //not x-y-z
+ || (i + 1 < sentence.get(TokensAnnotation.class).size() - 1 &&
+ map.get(lemma.getLast() + "-" + token.get(LemmaAnnotation.class))
+ .contains(sentence.get(TokensAnnotation.class).get(i + 1)
+ .get(LemmaAnnotation.class))
+
+ ))) {
+ p_s = tok.removeLast();
+ p_l = lemma.removeLast();
+ p_p = pos.removeLast();
+ p_n = ner.removeLast();
+ changed = 1;
+ tok.add(p_s + "-" + token.get(TextAnnotation.class));
+ lemma.add(p_l + "-" + token.get(LemmaAnnotation.class).toLowerCase());
+ pos.add("COMP");
+ ner.add("O");
+ } else {
+
+ tok.add(token.get(TextAnnotation.class));
+ lemma.add(token.get(LemmaAnnotation.class).toLowerCase());
+ pos.add(token.get(PartOfSpeechAnnotation.class));
+ if (lemma.get(lemma.size() - 1).contains("www.") || lemma.get(lemma.size() - 1).contains("http"))
+ ner.add("URL");
+ else
+ ner.add(token.get(NamedEntityTagAnnotation.class));
+
+ }
+ tmp = map.getOrDefault(lemma.getLast(), new HashSet<String>());
+ }
+
+ }
+ assert ner.size() == lemma.size() && lemma.size() == tok.size() && tok.size() == pos.size();
+ data.put("lem", lemma);
+ data.put("tok", tok);
+ data.put("pos", pos);
+ data.put("ner", ner);
+ return data;
+
+ }
+
+ private String[] tobehashed = { "hundred", "thousand", "million", "billion", "trillion", "hundreds", "thousands",
+ "millions", "billions", "trillions", "-" };
+ private HashSet<String> num_txts = new HashSet<>(Arrays.asList(tobehashed));
+
+ public boolean number_read(String old, String t) {
+ return num_txts.contains(t) && !old.equals("-") && !t.equals("-");
+ }
+
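+ // Join consecutive CD tokens that spell a single quantity (scale words like
+ // "million", see num_txts) into one comma-joined token, e.g. "2" "million"
+ // -> "2,million", so numbers line up with AMR quantity nodes.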
+ public int post_process_number(HashMap<String, LinkedList<String>> obj) {
+ LinkedList<String> ner_ = obj.get("ner");
+ LinkedList<String> lemma_ = obj.get("lem");
+ LinkedList<String> tok_ = obj.get("tok");
+ LinkedList<String> pos_ = obj.get("pos");
+ String p_l = "";
+ String p_t = "";
+ String p_n = "";
+ String p_p = "";
+ LinkedList<String> lemma = new LinkedList<String>();
+ LinkedList<String> tok = new LinkedList<String>();
+ LinkedList<String> ner = new LinkedList<String>();
+ LinkedList<String> pos = new LinkedList<String>();
+ int out = 0;
+ for (int i = 0; i < lemma_.size(); i++) {
+ if (pos.isEmpty() || !pos_.get(i).equals("CD") || (!pos.isEmpty() && !pos.getLast().equals("CD"))
+ || (!number_read(lemma.getLast(), lemma_.get(i)))) {
+
+ lemma.add(lemma_.get(i));
+ tok.add(tok_.get(i));
+ ner.add(ner_.get(i));
+ pos.add(pos_.get(i));
+ } else {
+ if (lemma.getLast().equals("-")) {
+ System.out.println("!!!" + lemma.getLast() + " " + lemma_.get(i));
+ System.out.println("!!!" + tok_);
+ System.out.println("!!!" + pos_);
+ }
+ out += 1;
+ p_t = tok.removeLast();
+ p_l = lemma.removeLast();
+ p_p = pos.removeLast();
+ p_n = ner.removeLast();
+
+ tok.add(p_t + "," + tok_.get(i));
+ lemma.add(p_l + "," + lemma_.get(i));
+ pos.add("CD");
+ ner.add(p_n);
+ }
+ }
+ obj.put("lem", lemma);
+ obj.put("tok", tok);
+ obj.put("pos", pos);
+ obj.put("ner", ner);
+ return out;
+ }
+
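+ // Merge adjacent tokens of a single NER span that were split on "-" or "'s"
+ // (matching AMR's NER conventions), e.g. "Al" "-" "Jazeera", all tagged
+ // ORGANIZATION, becomes the single token "Al-Jazeera".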
+ public int post_process_ner(HashMap<String, LinkedList<String>> obj) {
+ LinkedList<String> ner_ = obj.get("ner");
+ LinkedList<String> lemma_ = obj.get("lem");
+ LinkedList<String> tok_ = obj.get("tok");
+ LinkedList<String> pos_ = obj.get("pos");
+ String p_l = "";
+ String p_t = "";
+ String p_n = "";
+ String p_p = "";
+ LinkedList<String> lemma = new LinkedList<String>();
+ LinkedList<String> tok = new LinkedList<String>();
+ LinkedList<String> ner = new LinkedList<String>();
+ LinkedList<String> pos = new LinkedList<String>();
+
+ int out = 0;
+ boolean last = false;
+ for (int i = 0; i < lemma_.size(); i++) {
+ if (( !ner_.get(i).equals("O")) && ( lemma_.get(i).equals("'s") ||lemma_.get(i).equals("-")|| last)
+ && !ner.isEmpty() && ner.getLast().equals(ner_.get(i))) {
+
+ p_t = tok.removeLast();
+ p_l = lemma.removeLast();
+ p_p = pos.removeLast();
+ p_n = ner.removeLast();
+ last = lemma_.get(i).equals("-");
+ out += 1;
+ tok.add(p_t + tok_.get(i));
+ lemma.add(p_l + lemma_.get(i));
+ pos.add(p_p);
+ ner.add(p_n);
+ } else {
+ last = false;
+ lemma.add(lemma_.get(i));
+ tok.add(tok_.get(i));
+ ner.add(ner_.get(i));
+ pos.add(pos_.get(i));
+ }
+ }
+ obj.put("lem", lemma);
+ obj.put("tok", tok);
+ obj.put("pos", pos);
+ obj.put("ner", ner);
+ return out;
+ }
+
+ public void featureExtractFolder(String folder, String suffix) {
+ List<String> files = convertingAMR.folderToFilesPath(folder, suffix);
+ files.parallelStream().forEach(file -> featureExtract(file));
+ }
+
+ public void featureExtractFolderSentenceOnly(String folder, String suffix) {
+ List<String> files = convertingAMR.folderToFilesPath(folder, suffix);
+ files.parallelStream().forEach(file -> featureExtractSentenceOnly(file));
+
+ }
+
+ public static void main(String[] args) {
+ String home = System.getProperty("user.home"); // change this accordingly
+
+ convertingAMR converter = new convertingAMR("joints.txt");
+
+ System.out.println("Processing r2");
+ System.out.println("Processing Dev");
+ converter.featureExtractFolder(home + "/Data/amr_annotation_r2/data/alignments/split/dev/", "combined.txt_");
+ System.out.println("Processing Training");
+ converter.featureExtractFolder(home + "/Data/amr_annotation_r2/data/alignments/split/training/", "combined.txt_");
+ System.out.println("Processing Test");
+ converter.featureExtractFolder(home + "/Data/amr_annotation_r2/data/alignments/split/test/", "combined.txt_");
+ }
+
+}
diff --git a/AMR_FEATURE/src/json-20170516.jar b/AMR_FEATURE/src/json-20170516.jar
new file mode 100644
index 0000000..5033f66
Binary files /dev/null and b/AMR_FEATURE/src/json-20170516.jar differ
diff --git a/README.md b/README.md
index 87ef34a..74e1a33 100644
--- a/README.md
+++ b/README.md
@@ -25,8 +25,10 @@ If you use our code, please cite our paper as follows:
##Preprocessing:
Combine all *.txt files into a single one, and use stanford corenlp to extract ner, pos and lemma.
-Processed file saved in the same folder.
+Processed files are saved in the same folder.
`python src/preprocessing.py `
+Alternatively, process data from the [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf) using the Java program in AMR_FEATURE (I used Eclipse to run it).
+
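+For example, a minimal command-line sketch (the CoreNLP jar directory `/path/to/corenlp/` is a placeholder; `main` reads `joints.txt` from the working directory and the aligned AMR splits from `~/Data/amr_annotation_r2/`):
+```
+cd AMR_FEATURE
+javac -cp "src/json-20170516.jar:/path/to/corenlp/*" -d bin src/convertingAMR.java
+java -cp "bin:src/json-20170516.jar:/path/to/corenlp/*" convertingAMR
+```
+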
Build the copying dictionary and recategorization system (can skip as they are in data/).
`python src/rule_system_build.py `
Build data into tensor.
@@ -57,7 +59,7 @@ Keeping the files under data/ folder unchanged, download [model](https://drive.g
Should allow one to run parsing.
##Notes
-This code starts with sentence original AMR files, while the paper version is trained on tokenized version provided by [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf)
+This `python src/preprocessing.py` pipeline starts from the original sentence AMR files, while the paper version is trained on the tokenized version provided by the [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf)
So the results could be slightly different.
## Contact
diff --git a/data/aux_dict b/data/aux_dict
index 618aa65..eb885c3 100644
Binary files a/data/aux_dict and b/data/aux_dict differ
diff --git a/data/category_dict b/data/category_dict
index a4e0192..a20987e 100644
Binary files a/data/category_dict and b/data/category_dict differ
diff --git a/data/lemma_dict b/data/lemma_dict
index c4fbf06..79dad8d 100644
Binary files a/data/lemma_dict and b/data/lemma_dict differ
diff --git a/data/ner_dict b/data/ner_dict
index 3b90e69..c4f2423 100644
Binary files a/data/ner_dict and b/data/ner_dict differ
diff --git a/data/pos_dict b/data/pos_dict
index 11881f5..8332c07 100644
Binary files a/data/pos_dict and b/data/pos_dict differ
diff --git a/data/rel_dict b/data/rel_dict
index 83d01a0..b3f9098 100644
Binary files a/data/rel_dict and b/data/rel_dict differ
diff --git a/data/sensed_dict b/data/sensed_dict
index a53266d..b474c18 100644
Binary files a/data/sensed_dict and b/data/sensed_dict differ
diff --git a/data/word_dict b/data/word_dict
index 27091cc..e41b97e 100644
Binary files a/data/word_dict and b/data/word_dict differ
diff --git a/utility/__pycache__/Naive_Scores.cpython-36.pyc b/utility/__pycache__/Naive_Scores.cpython-36.pyc
index b795110..6ca58fa 100644
Binary files a/utility/__pycache__/Naive_Scores.cpython-36.pyc and b/utility/__pycache__/Naive_Scores.cpython-36.pyc differ
diff --git a/utility/__pycache__/constants.cpython-36.pyc b/utility/__pycache__/constants.cpython-36.pyc
index ce1518a..b781b9e 100644
Binary files a/utility/__pycache__/constants.cpython-36.pyc and b/utility/__pycache__/constants.cpython-36.pyc differ
diff --git a/utility/constants.py b/utility/constants.py
index 69be8c5..9472973 100644
--- a/utility/constants.py
+++ b/utility/constants.py
@@ -3,7 +3,7 @@
# Change the path according to your system
-save_to = 'model/' #the folder amr model will be saved to (model name is parameterized by some hyper parameter)
+save_to = '/disk/scratch/s1544871/model/' #folder the AMR model will be saved to (the model name is parameterized by some hyperparameters)
train_from = 'model/gpus_0valid_best.pt' #default model loading
embed_path = "/disk/scratch/s1544871/glove.840B.300d.txt" #file containing glove embedding
core_nlp_url = 'http://localhost:9000' #local host url of standford corenlp server