diff --git a/AMR_FEATURE/.classpath b/AMR_FEATURE/.classpath new file mode 100644 index 0000000..e910e9c --- /dev/null +++ b/AMR_FEATURE/.classpath @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/AMR_FEATURE/.project b/AMR_FEATURE/.project new file mode 100644 index 0000000..b04548c --- /dev/null +++ b/AMR_FEATURE/.project @@ -0,0 +1,17 @@ + + + AMR_FEATURE + + + + + + org.eclipse.jdt.core.javabuilder + + + + + + org.eclipse.jdt.core.javanature + + diff --git a/AMR_FEATURE/.settings/org.eclipse.jdt.core.prefs b/AMR_FEATURE/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..3a21537 --- /dev/null +++ b/AMR_FEATURE/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,11 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.8 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.source=1.8 diff --git a/AMR_FEATURE/bin/convertingAMR.class b/AMR_FEATURE/bin/convertingAMR.class new file mode 100644 index 0000000..0e63a79 Binary files /dev/null and b/AMR_FEATURE/bin/convertingAMR.class differ diff --git a/AMR_FEATURE/bin/json-20170516.jar b/AMR_FEATURE/bin/json-20170516.jar new file mode 100644 index 0000000..5033f66 Binary files /dev/null and b/AMR_FEATURE/bin/json-20170516.jar differ diff --git a/AMR_FEATURE/joints.txt b/AMR_FEATURE/joints.txt new file mode 100644 index 0000000..78be015 --- /dev/null +++ b/AMR_FEATURE/joints.txt @@ -0,0 +1,922 @@ +have to +at all +so far +more than +less than +no one +as well +at least +right wing +left wing +as long as +all over +of course +kind of 
+after all +by oneself +by the way +in fact +be all +head up +come out +coop up +seize up +bust up +hang out +limber up +quieten down +crack up +fuck up +get out +clear out +rip up +rock on +shout down +bundle up +pump up +smooth out +set down +drop off +think over +core out +tidy up +make off +fight on +set out +think up +try out +sign in +take out +top off +nail down +block up +cash in +fork out +mark down +rattle off +bandage up +sleep over +patch up +freeze over +seal off +free up +clown around +tear down +dust off +live up +cut loose +louse up +sit down +stand by +take up +steal away +lay off +turn in +meet up +check up +taper off +dole out +catch up +shape up +tax away +pass off +give in +speak up +call upon +stall out +butt in +carve out +step up +trigger off +prop up +scoop up +summon forth +boss around +cool down +give back +cut down +jot down +doze off +drum up +bog down +throw out +shy away +frost over +rack up +even out +light up +shack up +bone up +cut out +sum up +shut up +send out +pine away +take over +gobble up +shoot back +lay on +swear off +spread out +pin down +find out +drag on +thaw out +bump off +fatten up +get back +arm up +load up +give vent +top up +bounce back +bad off +come by +single out +call out +slow down +ask out +slice up +roll up +divide up +hold over +touch off +pass out +have mod +screw up +iron out +tell on +dry out +zero out +rev up +request confirmation +scrawl out +tie in +pass up +scratch out +miss out +root out +frighten off +have subevent +go on +follow through +lighten up +trade off +carry over +pay out +mellow out +fool around +get down +stretch out +run down +scrub up +splash out +stop by +touch upon +dig out +stick around +act out +pass by +watch out +share out +shut out +get along +go through +tease out +kill off +slug out +bottom out +tie down +neaten up +dress down +turn off +bandy around +yammer away +gulp down +cut back +chatter away +glaze over +drop by +slack off +fess up +seek out +creep out +hold up +knock up 
+shine through +fence off +zero in +flip out +rein in +screen out +cheer up +saw up +sign off +flatten out +heat up +add on +clip off +doll up +touch on +fall off +suit up +palm off +mist over +flesh out +burn up +sweat out +work up +brazen out +peel off +pay up +get even +fill out +whip up +shout out +kick in +draw up +thrash out +head off +come in +break up +speed up +spout off +type up +polish off +trot out +puke up +bank up +rip off +dry up +settle down +cry out +go out +face off +ride up +buckle up +pair up +come off +auction off +roll back +throw in +eat up +suck up +shut down +wipe out +nod off +choke off +sleep off +stand up +frost up +join in +mix up +crisp up +knock out +talk out +set off +sit in +bang on +flake out +take off +queue up +square off +make over +ramp up +let down +toss out +finish up +blow over +sound off +cut up +rough in +blot out +stave off +stop off +act up +scout out +pay off +beat out +copy out +wolf down +have manner +get through +break off +drug up +pump out +take hold +polish up +pucker up +write off +shell out +come over +color in +tamp down +shut off +have mode +strike up +beat up +sweep up +come up +blast off +lie in +warm over +ratchet up +bump up +play out +look out +tip over +fudge over +warm up +throw away +crank up +tip off +have quant +go back +roll out +trim down +set up +rake in +piss off +give over +buoy up +pen up +touch up +parcel out +boom out +give off +jump up +leave over +tone down +dream on +lock in +win over +stop over +turn over +play on +edge out +get up +leave off +finish off +slim down +wall off +puff up +plug up +write out +let out +stop up +calm down +bring about +phase out +belly up +break down +stick up +lock up +pull out +set upon +jet off +pay down +fart around +zone out +bear out +take away +bleed off +write up +lash out +lam out +tie up +siphon off +dress up +stamp out +black out +snuff out +whip out +go off +ease up +tune out +gun down +freak out +chop down +strip away +step down +hit up +read up 
+chew up +start out +own up +close down +come upon +cone down +yield up +get away +gear up +bring on +figure out +turn up +check out +bead up +ship out +crank out +flush out +let on +put on +usher in +spin off +knock off +skim off +pass on +finish out +instead of +leave out +frighten away +buy up +knock over +straighten out +wear off +whiz away +call on +put out +totter around +salt away +spell out +creep up +hold out +sign up +branch out +mark up +hail down +pick out +shoot off +din out +beef up +get off +break through +smarten up +help out +buy out +stake out +take in +do in +come to +sell out +shore up +hem in +hang up +boil over +sort out +wipe up +curl up +whack off +track down +dig up +run out +haul out +plot out +loan out +coil up +die off +pipe down +kick off +come through +print out +pick away +gloss over +ring up +go down +read off +pitch in +choke up +break in +crack down +boot up +blurt out +sluice down +fill up +spring up +lock out +pack up +look over +whittle down +chicken out +bandy about +cart off +plug in +buy off +pick on +crash out +total up +pile on +pan out +prick up +dish up +stash away +round up +shoot up +balance out +bring along +quiet down +cut off +vamp up +run off +pull down +team up +hold back +hammer out +stack up +think through +match up +rise up +have concession +wipe off +hash out +come down +sock away +jump in +hang on +ferret out +wake up +brick over +burst out +tack down +spike out +use up +carry on +bottle up +tighten up +start up +carry off +speak out +set about +tag along +hook up +oil up +fend off +start over +sit up +sign on +take down +study up +while away +fold up +cheer on +bust out +rate entity +play down +book up +bind up +stay on +come about +put up +dine out +have frequency +store up +give up +vote down +bring up +tape up +leave behind +turn on +save up +break out +wash up +fork over +hollow out +freshen up +screw over +dash off +have part +mess up +buy into +burn out +cave in +lead up +clear up +cry down +stand out 
+turn away +drown out +run in +cover up +spill over +die out +farm out +hand over +poke around +ride out +come across +give away +tack on +bow out +squeeze out +write in +show up +come on +fix up +sew up +fort up +do away +liven up +scrunch up +log on +ham up +look down +firm up +tally up +tool up +weigh in +flare up +strike down +thin out +blast away +reel off +feed up +camp out +well off +crop up +be like +open up +link up +lick up +look up +statistical test +charge off +drop out +keep up +tick off +tune in +write down +bat in +stay over +gas up +pick up +cook up +boil down +pull through +call off +pop off +hand out +push up +fritter away +trail off +chop up +rear end +fuck around +rattle on +tire out +street address +keep on +pack away +keg stand +close off +lose out +wring out +make believe +soak up +tee off +shake up +scent out +steer clear +have instrument +tear up +feel up +live down +bowl over +step in +hobnob around +bow down +buzz off +tangle up +catch on +price out +snap up +live out +touch base +be done +have li +vomit up +clean out +laid back +buckle down +slip in +swear in +stall off +shoot down +be from +serve up +join up +back up +well up +pull up +put down +wash down +dish out +age out +fight back +bring down +run up +zip up +switch over +spend down +call up +be polite +pop up +fall apart +net out +jut out +wind up +rent out +cross out +rough up +broke ass +dredge up +wait out +shuffle off +build up +box in +shake off +cool off +get on +hit on +straighten up +start off +belch out +lie down +play up +give out +haul in +hard put +make up +snap off +follow suit +pass away +smooth over +hole up +turn out +clog up +sober up +smash up +contract out +go over +dope up +bed down +sit out +hype up +drop in +put off +ward off +get together +turn down +back off +swoop up +out trade +size up +pull off +conjure up +stock up +sleep away +monkey around +break away +pile up +put in +dream up +wrap up +gum up +bound up +tuck away +board up +have purpose +stick out 
+fall out +take aback +chart out +latch on +belt out +wear on +muck up +step aside +lead off +point out +line up +check in +start in +bunch up +watch over +fill in +work out +joke around +hum along +lock down +wear out +rip out +bleed out +come along +play off +show off +have extent +concrete over +narrow down +jack up +stare down +pipe up +loosen up +wear down +bear up +cover over +have polarity +mic up +make do +close over +deck out +blow out +play to +hammer away +ration out +sell off +have name +strike out +shuttle off +call in +shrug off +chalk up +perk up +knock down +follow up +pass over +brush off +drink up +fly out +close in +grow up +eat away +have condition +snatch away +pick off +stress out +take on +muddle up +tuck in +live on +skip off +look forward +stir up +bail out +stand down +close up +run over +throw up +fuck off +swallow up +spill out +fall back +fight off +rig up +sweat off +hide out +divvy up +flash back +end up +make it +toss in +round out +sniff out +grind up +chip in +cough up +phase in +let up +water down +hold on +level off +have value +fit in +yammer on +key in +hold off +silt up +get by +split up +make out +look after +rubber stamp +sketch out +pull over +spruce up +glass over +add up +mist up +brush up +wind down +clutch on +knock back +pare down +rule out +fall through +hack away +asphalt over +clean up +pound out +die down +carry out +fall over +blow up +weasel out +break even diff --git a/AMR_FEATURE/src/convertingAMR.java b/AMR_FEATURE/src/convertingAMR.java new file mode 100644 index 0000000..56777f0 --- /dev/null +++ b/AMR_FEATURE/src/convertingAMR.java @@ -0,0 +1,493 @@ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import 
java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.json.JSONArray; +import org.json.JSONObject; + +import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation; +import edu.stanford.nlp.semgraph.SemanticGraphEdge; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.PropertiesUtils; + +public class convertingAMR { + private Map> map; + + /*initialize from joints.txt, which is used for greedily connecting phrases + * e.g. 
make-up + * extracted from AMRPropBank and training set + * */ + public convertingAMR(String file) { + map = new HashMap>(); + Set tmp; + + try (FileInputStream fis = new FileInputStream(file); + BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));) { + + String line; + String[] pair; + while ((line = br.readLine()) != null) { + pair = line.split(" "); + String past = ""; + for (int i = 0; i < pair.length - 1; i++) { + past += pair[i] + " "; + tmp = map.getOrDefault(past.trim(), new HashSet()); + tmp.add(pair[i + 1]); + map.put(past.trim().replace(" ", "-"), tmp); + + } + + } + // System.out.println(map.toString()); + + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static List folderToFilesPath(String folder, String suffix) { + List results = new ArrayList(); + + File[] files = new File(folder).listFiles(); + // If this pathname does not denote a directory, then listFiles() returns null. + + for (File file : files) { + if (file.isFile() && file.getName().endsWith(suffix)) { + results.add(folder + file.getName()); + } + } + return results; + } + + public void featureExtract(String file) { + + // build pipeline + StanfordCoreNLP pipeline = new StanfordCoreNLP( + PropertiesUtils.asProperties("annotators", "tokenize,ssplit,pos,lemma,ner", "tokenize.options", + "splitHyphenated=true", + "tokenize.whitespace", "true",//start with tokenized file + "ssplit.isOneSentence", //ignore multi-sentence construction + "true", "tokenize.language", "en")); + + String[] name = file.split("/"); + + String line = ""; + try (FileInputStream fis = new FileInputStream(file); + BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));) { + + System.out.println(name[name.length - 1]); + int n = 0; + int changed = 0; + List obs = new ArrayList(); + line = br.readLine(); + while (line != null && !line.trim().isEmpty()) { + // if (n % 2 == 0) + // System.out.println(n+"\n"+line); + n++; + JSONObject obj = new JSONObject(); + 
StringBuilder pre = new StringBuilder(); + while (!line.startsWith("# ::tok ") && !line.startsWith("# ::snt ")) { + pre.append(line + "\n"); + line = br.readLine(); + } + obj.put("pre", pre.toString()); + + //build a sentence without buggy texts.... + String snt = line.replace("# ::tok ", "").replace("# ::snt ", ""); + snt = snt.replaceAll("\\.{2,}", "").replaceAll(" ", " "); + snt = snt.replace(" ", " ").replace(" ", " ").replace("\n", ""); + snt = snt.replaceAll("\"", " \" "); + snt = snt.replaceAll("\\(", " \\( "); + snt = snt.replaceAll("\\)", " \\) "); + snt = snt.replaceAll("@-@", "-").replaceAll(" @:@ ", ":"); + obj.put("snt", snt); + + //initial feature extraction and connecting of phrase + HashMap> data = extractSentence(obj, pipeline,true); + //connects number + changed += post_procee_number(data); + //connects ner, mainly due to "-" and "'s" construction in AMR NER + changed += post_procee_ner(data); + obj.put("ner", data.get("ner")); + obj.put("lem", data.get("lem")); + obj.put("tok", data.get("tok")); + obj.put("pos", data.get("pos")); + obs.add(obj); + + if (obs.size() % 500 == 0) { + System.out.println(obs.size() + " " + name[name.length - 1]); + obj.keys().forEachRemaining(k -> { + System.out.println(k + ": " + obj.get(k)); + }); + } + + //read remaining e.g. AMR graph + StringBuilder post = new StringBuilder(); + line = br.readLine(); + while (line != null && !line.trim().isEmpty()) { + post.append(line + "\n"); + line = br.readLine(); + } + + obj.put("post", post.toString()); + + while (line != null && line.trim().isEmpty()) { + line = br.readLine(); + } + + } + System.out.println("\n" + name[name.length - 1] + " done. Total sentences: " + obs.size() + "\n"); + System.out.println("\n" + changed + " changed." 
+ "\n"); + String out = obs.stream().map(obj -> writeObject(obj)).collect(Collectors.joining("\n")); + Files.write(Paths.get(file.replaceAll(".txt(_[a-z]*)*", ".txt_pre_processed")), out.getBytes()); + + } catch (IOException e) { + e.printStackTrace(); + } catch (NullPointerException e) { + System.out.println(file + " null pointer??"); + System.out.println(line + " null pointer??"); + e.printStackTrace(); + } + } + + //same as featureExtract, but have sentence only + public void featureExtractSentenceOnly(String file) { + + StanfordCoreNLP pipeline = new StanfordCoreNLP( + PropertiesUtils.asProperties("annotators", "tokenize,ssplit,pos,lemma,ner", "tokenize.options", + "splitHyphenated=true", "tokenize.whitespace", "true", + "ssplit.isOneSentence", "true", "tokenize.language", "en")); + String[] name = file.split("/"); + + String line = ""; + try (FileInputStream fis = new FileInputStream(file); + BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));) { + + System.out.println(name[name.length - 1]); + int n = 0; + int changed = 0; + List obs = new ArrayList(); + line = br.readLine(); + while (line != null && !line.trim().isEmpty()) { + // if (n % 2 == 0) + // System.out.println(n+"\n"+line); + n++; + JSONObject obj = new JSONObject(); + StringBuilder pre = new StringBuilder(); + obj.put("pre", pre.toString()); + + //build a sentence without buggy texts.... 
+ String snt = line.replace("# ::tok ", "").replace("# ::snt ", ""); + /* if (snt.startsWith("the ones who are suffering are the ordinary people :")) { + System.out.println("!!!!\n"+snt+"\n!!!!"); + } + snt = snt.replaceAll("\\.{2,}", "").replaceAll(" ", " "); + if (snt.startsWith("the ones who are suffering are the ordinary people :")) { + System.out.println("!!!!\n"+snt+"\n!!!!"); + } + snt = snt.replace(" ", " ").replace(" ", " ").replace("\n", ""); + snt = snt.replaceAll("\"", " \" "); + snt = snt.replaceAll("\\(", " \\( "); + snt = snt.replaceAll("\\)", " \\) "); + snt = snt.replaceAll("@-@", "-").replaceAll(" @:@ ", ":");*/ + obj.put("snt", snt); + + //feature extraction and connecting of phrase, no change of tokenization + HashMap> data = extractSentence(obj, pipeline,false); + if (snt.startsWith("the ones who are suffering are the ordinary people :")) { + System.out.println("!!!!\n"+snt+"\n!!!!"); + System.out.println( data.get("tok")); + } + obj.put("ner", data.get("ner")); + obj.put("lem", data.get("lem")); + obj.put("tok", data.get("tok")); + obj.put("pos", data.get("pos")); + obs.add(obj); + + if (obs.size() % 500 == 0) { + System.out.println(obs.size() + " " + name[name.length - 1]); + obj.keys().forEachRemaining(k -> { + System.out.println(k + ": " + obj.get(k)); + }); + } + + StringBuilder post = new StringBuilder(); + obj.put("post", post.toString()); + + line = br.readLine(); + while (line != null && line.trim().isEmpty()) { + line = br.readLine(); + } + + } + System.out.println("\n" + name[name.length - 1] + " done. Total sentences: " + obs.size() + "\n"); + System.out.println("\n" + changed + " changed." 
+ "\n"); + String out = obs.stream().map(obj -> writeObject(obj)).collect(Collectors.joining("\n")); + Files.write(Paths.get(file.replaceAll(".txt(_[a-z]*)*", ".txt_processed")), out.getBytes()); + + } catch (IOException e) { + e.printStackTrace(); + } catch (NullPointerException e) { + System.out.println(file + " null pointer??"); + System.out.println(line + " null pointer??"); + e.printStackTrace(); + } + } + + public String jsonArrayToWhiteSpaceString(JSONArray arr) { + String out = ""; + for (Object s : arr) { + out += "\t" + s.toString(); + } + return out.substring(1); + } + + // output data to txt + public String writeObject(JSONObject obj) { + StringBuilder out = new StringBuilder(); + if (!obj.getString("pre").trim().isEmpty()) + out.append(obj.getString("pre")); + String tok = jsonArrayToWhiteSpaceString(obj.getJSONArray("tok")); + out.append("# ::tok\t" + tok + "\n"); + String lemma = jsonArrayToWhiteSpaceString(obj.getJSONArray("lem")); + out.append("# ::lem\t" + lemma + "\n"); + String pos = jsonArrayToWhiteSpaceString(obj.getJSONArray("pos")); + out.append("# ::pos\t" + pos + "\n"); + String ner = jsonArrayToWhiteSpaceString(obj.getJSONArray("ner")); + out.append("# ::ner\t" + ner + "\n"); + + assert tok.split(" ").length == lemma.split(" ").length; + assert tok.split(" ").length == pos.split(" ").length; + assert tok.split(" ").length == ner.split(" ").length; + + if (!obj.getString("post").trim().isEmpty()) + out.append(obj.getString("post") + "\n"); + + return out.toString(); + } + + public volatile int positive = 0; + public volatile int truth = 0; + public volatile int truth_positive = 0; + + public HashMap> extractSentence(JSONObject obj, StanfordCoreNLP pipeline,boolean retoken) { + String text = obj.getString("snt"); + obj.put("snt", text); + // create an empty Annotation just with the given text + + Annotation sent = new Annotation(text); + HashMap> data = new HashMap>(); + // run all Annotators on this text + pipeline.annotate(sent); + 
LinkedList lemma = new LinkedList(); + LinkedList tok = new LinkedList(); + LinkedList ner = new LinkedList(); + LinkedList pos = new LinkedList(); + String p_l = ""; + String p_s = ""; + String p_n = ""; + String p_p = ""; + int changed = 0; + + List sentences = sent.get(SentencesAnnotation.class); + Set tmp = new HashSet(); + for (CoreMap sentence : sentences) { + + for (int i = 0; i < sentence.get(TokensAnnotation.class).size(); i++) { + CoreLabel token = sentence.get(TokensAnnotation.class).get(i); + if (retoken &&tmp.contains(token.get(LemmaAnnotation.class)) + && (!map.containsKey(lemma.getLast() + "-" + token.get(LemmaAnnotation.class)) //not x-y-z + || (i + 1 < sentence.get(TokensAnnotation.class).size() - 1 && + map.get(lemma.getLast() + "-" + token.get(LemmaAnnotation.class)) + .contains(sentence.get(TokensAnnotation.class).get(i + 1) + .get(LemmaAnnotation.class)) + + ))) { + p_s = tok.removeLast(); + p_l = lemma.removeLast(); + p_p = pos.removeLast(); + p_n = ner.removeLast(); + changed = 1; + tok.add(p_s + "-" + token.get(TextAnnotation.class)); + lemma.add(p_l + "-" + token.get(LemmaAnnotation.class).toLowerCase()); + pos.add("COMP"); + ner.add("O"); + } else { + + tok.add(token.get(TextAnnotation.class)); + lemma.add(token.get(LemmaAnnotation.class).toLowerCase()); + pos.add(token.get(PartOfSpeechAnnotation.class)); + if (lemma.get(lemma.size() - 1).contains("www.") || lemma.get(lemma.size() - 1).contains("http")) + ner.add("URL"); + else + ner.add(token.get(NamedEntityTagAnnotation.class)); + + } + tmp = map.getOrDefault(lemma.getLast(), new HashSet()); + } + + } + assert ner.size() == lemma.size() && lemma.size() == tok.size() && tok.size() == pos.size(); + data.put("lem", lemma); + data.put("tok", tok); + data.put("pos", pos); + data.put("ner", ner); + return data; + + } + + private String[] tobehashed = { "hundred", "thousand", "million", "billion", "trillion", "hundreds", "thousands", + "millions", "billions", "trillions", "-" }; + private 
HashSet num_txts = new HashSet<>(Arrays.asList(tobehashed)); + + public boolean number_read(String old, String t) { + return num_txts.contains(t) && !old.equals("-") && !t.equals("-"); + } + + public int post_procee_number(HashMap> obj) { + LinkedList ner_ = (LinkedList) obj.get("ner"); + LinkedList lemma_ = (LinkedList) obj.get("lem"); + LinkedList tok_ = (LinkedList) obj.get("tok"); + LinkedList pos_ = (LinkedList) obj.get("pos"); + String p_l = ""; + String p_t = ""; + String p_n = ""; + String p_p = ""; + LinkedList lemma = new LinkedList(); + LinkedList tok = new LinkedList(); + LinkedList ner = new LinkedList(); + LinkedList pos = new LinkedList(); + int out = 0; + for (int i = 0; i < lemma_.size(); i++) { + if (pos.isEmpty() || !pos_.get(i).equals("CD") || (!pos.isEmpty() && !pos.getLast().equals("CD")) + || (!number_read(lemma.getLast(), lemma_.get(i)))) { + + lemma.add(lemma_.get(i)); + tok.add(tok_.get(i)); + ner.add(ner_.get(i)); + pos.add(pos_.get(i)); + } else { + if (lemma.getLast().equals("-")) { + System.out.println("!!!" + lemma.getLast() + " " + lemma_.get(i)); + System.out.println("!!!" + tok_); + System.out.println("!!!" 
+ pos_); + } + out += 1; + p_t = tok.removeLast(); + p_l = lemma.removeLast(); + p_p = pos.removeLast(); + p_n = ner.removeLast(); + + tok.add(p_t + "," + tok_.get(i)); + lemma.add(p_l + "," + lemma_.get(i)); + pos.add("CD"); + ner.add(p_n); + } + } + obj.put("lem", lemma); + obj.put("tok", tok); + obj.put("pos", pos); + obj.put("ner", ner); + return out; + } + + public int post_procee_ner(HashMap> obj) { + LinkedList ner_ = obj.get("ner"); + LinkedList lemma_ = obj.get("lem"); + LinkedList tok_ = obj.get("tok"); + LinkedList pos_ = obj.get("pos"); + String p_l = ""; + String p_t = ""; + String p_n = ""; + String p_p = ""; + LinkedList lemma = new LinkedList(); + LinkedList tok = new LinkedList(); + LinkedList ner = new LinkedList(); + LinkedList pos = new LinkedList(); + + Set tmp = new HashSet(); + int out = 0; + boolean last = false; + for (int i = 0; i < lemma_.size(); i++) { + if (( !ner_.get(i).equals("O")) && ( lemma_.get(i).equals("'s") ||lemma_.get(i).equals("-")|| last) + && !ner.isEmpty() && ner.getLast().equals(ner_.get(i))) { + + p_t = tok.removeLast(); + p_l = lemma.removeLast(); + p_p = pos.removeLast(); + p_n = ner.removeLast(); + last = lemma_.get(i).equals("-"); + out += 1; + tok.add(p_t + tok_.get(i)); + lemma.add(p_l + lemma_.get(i)); + pos.add(p_p); + ner.add(p_n); + } else { + last = false; + lemma.add(lemma_.get(i)); + tok.add(tok_.get(i)); + ner.add(ner_.get(i)); + pos.add(pos_.get(i)); + } + } + obj.put("lem", lemma); + obj.put("tok", tok); + obj.put("pos", pos); + obj.put("ner", ner); + return out; + } + + public void featureExtractFolder(String folder, String suffix) { + List files = convertingAMR.folderToFilesPath(folder, suffix); + files.parallelStream().forEach(file -> featureExtract(file)); + } + + public void featureExtractFolderSentenceOnly(String folder, String suffix) { + List files = convertingAMR.folderToFilesPath(folder, suffix); + files.parallelStream().forEach(file -> featureExtractSentenceOnly(file)); + + } + + public static 
void main(String[] args) { + String home = System.getProperty("user.home"); //change this accordingly + + convertingAMR convetor = new convertingAMR("joints.txt"); + + System.out.println("Processing r2"); + System.out.println("Processing Dev"); + convetor.featureExtractFolder(home + "/Data/amr_annotation_r2/data/alignments/split/dev/", "combined.txt_"); + System.out.println("Processing Training"); + convetor.featureExtractFolder(home + "/Data/amr_annotation_r2/data/alignments/split/training/", "combined.txt_"); + System.out.println("Processing Test"); + convetor.featureExtractFolder(home + "/Data/amr_annotation_r2/data/alignments/split/test/", "combined.txt_"); + + + + } + +} diff --git a/AMR_FEATURE/src/json-20170516.jar b/AMR_FEATURE/src/json-20170516.jar new file mode 100644 index 0000000..5033f66 Binary files /dev/null and b/AMR_FEATURE/src/json-20170516.jar differ diff --git a/README.md b/README.md index 87ef34a..74e1a33 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,10 @@ If you use our code, please cite our paper as follows: ##Preprocessing: Combine all *.txt files into a single one, and use stanford corenlp to extract ner, pos and lemma. -Processed file saved in the same folder. +Processed file saved in the same folder. `python src/preprocessing.py ` +or process the output of the [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf) using the Java program in AMR_FEATURE (I ran it from Eclipse) + Build the copying dictionary and recategorization system (can skip as they are in data/). `python src/rule_system_build.py ` Build data into tensor. @@ -57,7 +59,7 @@ Keeping the files under data/ folder unchanged, download [model](https://drive.g Should allow one to run parsing. 
##Notes -This code starts with sentence original AMR files, while the paper version is trained on tokenized version provided by [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf) +The `python src/preprocessing.py` script starts from the original AMR sentence files, while the paper version was trained on the tokenized version provided by the [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf) So the results could be slightly different. ## Contact diff --git a/data/aux_dict b/data/aux_dict index 618aa65..eb885c3 100644 Binary files a/data/aux_dict and b/data/aux_dict differ diff --git a/data/category_dict b/data/category_dict index a4e0192..a20987e 100644 Binary files a/data/category_dict and b/data/category_dict differ diff --git a/data/lemma_dict b/data/lemma_dict index c4fbf06..79dad8d 100644 Binary files a/data/lemma_dict and b/data/lemma_dict differ diff --git a/data/ner_dict b/data/ner_dict index 3b90e69..c4f2423 100644 Binary files a/data/ner_dict and b/data/ner_dict differ diff --git a/data/pos_dict b/data/pos_dict index 11881f5..8332c07 100644 Binary files a/data/pos_dict and b/data/pos_dict differ diff --git a/data/rel_dict b/data/rel_dict index 83d01a0..b3f9098 100644 Binary files a/data/rel_dict and b/data/rel_dict differ diff --git a/data/sensed_dict b/data/sensed_dict index a53266d..b474c18 100644 Binary files a/data/sensed_dict and b/data/sensed_dict differ diff --git a/data/word_dict b/data/word_dict index 27091cc..e41b97e 100644 Binary files a/data/word_dict and b/data/word_dict differ diff --git a/utility/__pycache__/Naive_Scores.cpython-36.pyc b/utility/__pycache__/Naive_Scores.cpython-36.pyc index b795110..6ca58fa 100644 Binary files a/utility/__pycache__/Naive_Scores.cpython-36.pyc and b/utility/__pycache__/Naive_Scores.cpython-36.pyc differ diff --git a/utility/__pycache__/constants.cpython-36.pyc b/utility/__pycache__/constants.cpython-36.pyc index ce1518a..b781b9e 100644 Binary files 
a/utility/__pycache__/constants.cpython-36.pyc and b/utility/__pycache__/constants.cpython-36.pyc differ diff --git a/utility/constants.py b/utility/constants.py index 69be8c5..9472973 100644 --- a/utility/constants.py +++ b/utility/constants.py @@ -3,7 +3,7 @@ # Change the path according to your system -save_to = 'model/' #the folder amr model will be saved to (model name is parameterized by some hyper parameter) +save_to = '/disk/scratch/s1544871/model/' #the folder amr model will be saved to (model name is parameterized by some hyper parameter) train_from = 'model/gpus_0valid_best.pt' #default model loading embed_path = "/disk/scratch/s1544871/glove.840B.300d.txt" #file containing glove embedding core_nlp_url = 'http://localhost:9000' #local host url of standford corenlp server