## Read and parse XML
(find VUAMC.xml in the git folder and download it in one directory with this parser)

In [22]:
# import xml.etree.ElementTree as etree
from lxml import etree, objectify

root = etree.parse('VUAMC.xml')

## Cleanup xml schema/namespaces from tags ##    
for elem in root.getiterator():
    if not hasattr(elem.tag, 'find'): continue  # (1)
    i = elem.tag.find('}')
    if i >= 0:
        elem.tag = elem.tag[i+1:]
objectify.deannotate(root, cleanup_namespaces=True)

## Traverse XML tree and extract sentences containing metaphors

In [23]:
import pandas as pd

def extract_similes(root):
    rows = []
    for sent in root.findall('.//s'): # scan all sentences
        text = ''
        mflag = ''
        mrw = ''
        type_ = ''
        for word in sent.findall('.//w'): # for each word in sentence
            aseg = word.find('.//seg')
            if aseg is not None:
                if not aseg.text or not aseg.text.strip():
                    continue
                ft = aseg.text.strip()#.encode('UTF-8')
                if aseg.get('function') == 'mFlag': # flag for similes
                    mflag += ' ' + ft
                    text += ' ' + ft
                if aseg.get('function') == 'mrw':
                     # and not (not mflag): # start collecting keywords only after mflag
                    mrw += ' ' + ft
                    text += ' ' + ft
                    type_ = aseg.get('type')
            elif not (not word.text):
                text += ' ' + word.text.strip()#.encode('UTF-8')

        text = text.strip()
        mrw = mrw.strip()
        mflag = mflag.strip()
        if not (not mrw) and type_ == 'met': 
            rows.append([mflag, mrw, text])
    df = pd.DataFrame(rows)
    df.columns = ['_', 'mrw', 'sentence']
    return df
   

## This creates an xml file with metaphors and sentences w/met extracted from the corpus
(replace with your code, when you have a classifier)

In [3]:
df = extract_similes(root)
df.to_csv('metaphors.csv')

In [24]:
import pandas as pd

In [35]:
df.tail()

Unnamed: 0,_,mrw,sentence
8102,,got from taking at,We 've got five weeks discouraged from taking ...
8103,,take at,In Belgium many people do take three weeks at ...
8104,,with,You do come back with somebody sitting
8105,,got got on hands,Perhaps when they come back they 've got six m...
8106,,that,Oh well if you 're here that 's all right


In [26]:
df[0:1]

Unnamed: 0,_,mrw,sentence
0,,reveals approach leading to,Latest corporate unbundler reveals laid-back a...


In [27]:
df["sentence"]

0       Latest corporate unbundler reveals laid-back a...
1       IT SEEMS that Roland Franklin the latest unbun...
2       He has not properly investigated the target 's...
3       The 63-year-old head of Pembridge Investments ...
4       If he had taken his own rule seriously he woul...
5       There are other things he has on his own admis...
6       When the bid was launched last week Mr Frankli...
7                     He regards the charges as unfounded
8                                 On property he is blunt
9            I do not regard property profits as earnings
10      We have made a bid of nearly £700m for a compa...
11      If they can prove it is there we might pay for it
12        On the other criticism he is equally dismissive
13      That point about the core business is very unf...
14      We would eventually like to do it along with t...
15      The Franklin philosophy was learnt in the US l...
16      Mr Franklin went there at the end of the 1970s...
17      He fee

In [31]:
# With Natasha's VUAMC corpus
# with open('metaphors.csv', 'r') as text_file:
#     metaphor_corpus = text_file.read()

metaphor_corpus = str(df["sentence"])

# Use default tokenizer to start with
def tokenize_text(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences
    
    return [nltk.word_tokenize(word) for word in raw_sents]

metaphor_sents = tokenize_text(metaphor_corpus)

In [33]:
def freq_normed_unigrams(sents):
    wnl = WordNetLemmatizer() # to get word stems
    
    tagged_POS_sents = [nltk.pos_tag(sent) for sent in sents] # tags sents
    
    normed_tagged_words = [wnl.lemmatize(word[0].lower()) for sent in tagged_POS_sents
                           for word in sent 
                           if word[0].lower() not in nltk.corpus.stopwords.words('english')
                           and word[0] not in punctuation # remove punctuation
                           and not re.search(r'''^[\.,;"'?!():\-_`]+$''', word[0])
                           and word[1].startswith('N')]  # include only nouns

    top_normed_unigrams = [word for (word, count) in nltk.FreqDist(normed_tagged_words).most_common(40)]
    return top_normed_unigrams

def categories_from_hypernyms(sents):
    termlist = freq_normed_unigrams(sents) # get top unigrams
    hypterms = []
    hypterms_dict = defaultdict(list)
    for term in termlist:                  # for each term
        s = wn.synsets(term.lower(), 'n')  # get its nominal synsets
        for syn in s:                      # for each lemma synset
            for hyp in syn.hypernyms():    # It has a list of hypernyms
                hypterms = hypterms + [hyp.name]      # Extract the hypernym name and add to list
                hypterms_dict[hyp.name].append(term)  # Extract examples and add them to dict
    hypfd = nltk.FreqDist(hypterms)             # After going through all the nouns, print out the hypernyms 
    for (name, count) in hypfd.most_common(25):  # that have accumulated the most counts (have seen the most descendents)
        print( name(), '({0})'.format(count))
        print ('\t', ', '.join(set(hypterms_dict[name])))  # show the children found for each hypernym
        print ()
        
categories_from_hypernyms(metaphor_sents)

time_period.n.01 (4)
	 week, day

direction.n.06 (3)
	 rule

group.n.01 (3)
	 people

concept.n.01 (3)
	 property, rule

metallic_element.n.01 (2)
	 u, er

boundary.n.01 (2)
	 border

work_time.n.01 (2)
	 week, day

goal.n.01 (2)
	 target, object

edge.n.06 (2)
	 border

time_unit.n.01 (2)
	 day

family.n.04 (2)
	 people, name

duration.n.01 (1)
	 rule

explosive.n.01 (1)
	 charge

libidinal_energy.n.01 (1)
	 charge

possession.n.02 (1)
	 property

argumentation.n.02 (1)
	 policy

important_person.n.01 (1)
	 name

affair.n.03 (1)
	 party

causal_agent.n.01 (1)
	 somebody

literal_interpretation.n.01 (1)
	 letter

time.n.03 (1)
	 day

administrative_district.n.01 (1)
	 town

sanction.n.04 (1)
	 name

antioxidant.n.01 (1)
	 se

mon-khmer.n.01 (1)
	 mon



In [34]:
# With Metanet
with open('metanet.txt', 'r') as text_file:
     metaphor_corpus = text_file.read()

metanet_corpus = str(df["sentence"])

# Use default tokenizer to start with
def tokenize_text(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences
    
    return [nltk.word_tokenize(word) for word in raw_sents]

metanet_sents = tokenize_text(metanet_corpus)
categories_from_hypernyms(metanet_sents)

time_period.n.01 (4)
	 week, day

direction.n.06 (3)
	 rule

group.n.01 (3)
	 people

concept.n.01 (3)
	 property, rule

metallic_element.n.01 (2)
	 u, er

boundary.n.01 (2)
	 border

work_time.n.01 (2)
	 week, day

goal.n.01 (2)
	 target, object

edge.n.06 (2)
	 border

time_unit.n.01 (2)
	 day

family.n.04 (2)
	 people, name

duration.n.01 (1)
	 rule

explosive.n.01 (1)
	 charge

libidinal_energy.n.01 (1)
	 charge

possession.n.02 (1)
	 property

argumentation.n.02 (1)
	 policy

important_person.n.01 (1)
	 name

affair.n.03 (1)
	 party

causal_agent.n.01 (1)
	 somebody

literal_interpretation.n.01 (1)
	 letter

time.n.03 (1)
	 day

administrative_district.n.01 (1)
	 town

sanction.n.04 (1)
	 name

antioxidant.n.01 (1)
	 se

mon-khmer.n.01 (1)
	 mon



# Looking at https://github.com/ytsvetko/metaphor

In [95]:
##imports
#%pylab inline
import re
import math
import string
import nltk
import pprint
import matplotlib
from nltk import word_tokenize
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
from collections import Counter
from __future__ import division
from nltk.collocations import *
import string, random
from nltk.corpus import brown
from nltk.collocations import *
from string import punctuation
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [41]:
df_anmet = pd.read_csv("an_mets.csv", low_memory=False)

In [42]:
df_anmet.head()

Unnamed: 0,an_metaphors
0,angry welt
1,bald assertion
2,bare outline
3,black humor
4,blind alley


In [66]:
df_anmet = pd.read_csv("an_mets.csv", low_memory=False)
df_anmet['metaphor'] = 1
df_anmet['an_met'] = 1
df_anmet['an_nonmet'] = 0
df_anmet['svo_met'] = 0
df_anmet['svo_nonmet'] = 0
df_anmet.head()

Unnamed: 0,sample,metaphor,an_met,an_nonmet,svo_met,svo_nonmet
0,angry welt,1,1,0,0,0
1,bald assertion,1,1,0,0,0
2,bare outline,1,1,0,0,0
3,black humor,1,1,0,0,0
4,blind alley,1,1,0,0,0


In [67]:
df_annonmet = pd.read_csv("an_nonmets.csv", low_memory=False)
df_annonmet['metaphor'] = 0
df_annonmet['an_met'] = 0
df_annonmet['an_nonmet'] = 1
df_annonmet['svo_met'] = 0
df_annonmet['svo_nonmet'] = 0
df_annonmet.head()

Unnamed: 0,sample,metaphor,an_met,an_nonmet,svo_met,svo_nonmet
0,angry protester,0,0,1,0,0
1,bald eagle,0,0,1,0,0
2,big city,0,0,1,0,0
3,blind man,0,0,1,0,0
4,bloody nose,0,0,1,0,0


In [68]:
df_svomet = pd.read_csv("svo_mets.csv", low_memory=False)
df_svomet['metaphor'] = 1
df_svomet['an_met'] = 0
df_svomet['an_nonmet'] = 0
df_svomet['svo_met'] = 1
df_svomet['svo_nonmet'] = 0
df_svomet.head()

Unnamed: 0,sample,metaphor,an_met,an_nonmet,svo_met,svo_nonmet
0,conversation turn subject,1,0,0,1,0
1,resumption bring relief,1,0,0,1,0
2,economy move direction,1,0,0,1,0
3,service meet expectation,1,0,0,1,0
4,material live dream,1,0,0,1,0


In [69]:
df_svononmet = pd.read_csv("svo_nonmets.csv", low_memory=False)
df_svononmet['metaphor'] = 0
df_svononmet['an_met'] = 0
df_svononmet['an_nonmet'] = 0
df_svononmet['svo_met'] = 0
df_svononmet['svo_nonmet'] = 1
df_svononmet.head()

Unnamed: 0,sample,metaphor,an_met,an_nonmet,svo_met,svo_nonmet
0,car break *none,0,0,0,0,1
1,crowd scream *none,0,0,0,0,1
2,*person turn *none,0,0,0,0,1
3,foot slip *none,0,0,0,0,1
4,man shake head,0,0,0,0,1


In [76]:
frames = [df_anmet, df_annonmet, df_svomet, df_svononmet]
df_combo = pd.concat(frames)

In [84]:
random_index = np.random.permutation(df_combo.index)
random_index[:10]
df_combo.ix[random_index, ['sample', 'metaphor', 'an_met', 'an_nonmet', 'svo_met', 'svo_nonmet']]
df_shuffled = df_combo.ix[random_index, ['sample', 'metaphor', 'an_met', 'an_nonmet', 'svo_met', 'svo_nonmet']]
df_shuffled.reset_index(drop=True, inplace=True)
df_shuffled[500:510]

Unnamed: 0,sample,metaphor,an_met,an_nonmet,svo_met,svo_nonmet
500,mad sprint,1,1,0,0,0
501,hollow cylinder,0,0,1,0,0
502,bolt refuse *none,1,0,0,1,0
503,*person stare stone,0,0,0,0,1
504,deep understanding,1,1,0,0,0
505,deep sea,0,0,1,0,0
506,insurance cover care,1,0,0,1,0
507,statue stand *none,0,0,0,0,1
508,sunny disposition,1,1,0,0,0
509,steep hill,0,0,1,0,0


In [85]:
rows, columns = df_shuffled.shape
print("Rows:", rows)
print("Columns:", columns)
#train_size = round(rows*.6)
train_size = round(rows*.9)
#dev_size   = round(rows*.2)
dev_size   = round(rows*.1)
df_train = df_shuffled.loc[:train_size]
df_train.shape
df_dev = df_shuffled.loc[train_size:dev_size+train_size].reset_index(drop=True)
df_dev.shape
df_test = df_shuffled.loc[dev_size+train_size:].reset_index(drop=True)
df_test.shape

Rows: 1644
Columns: 6


(0, 6)

In [86]:
vec = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', analyzer=u'word', min_df=5)
df_train = df_train.fillna("")
df_dev = df_dev.fillna("")
df_test = df_test.fillna("")

In [88]:
arr_train_feature_sparse = vec.fit_transform(df_train['sample'])
arr_train_feature_sparse
arr_train_feature = arr_train_feature_sparse.toarray()
feature_labels = vec.get_feature_names()

In [89]:
arr_dev_feature_sparse = vec.transform(df_dev["sample"])
arr_dev_feature = arr_dev_feature_sparse.toarray()

In [96]:
logreg = LogisticRegression()
logreg_model = logreg.fit(arr_train_feature, df_train['metaphor']) #defining features (from reviews) and passing in Category label
logreg_predictions = logreg_model.predict(arr_dev_feature)
accuracy_score(df_dev['metaphor'], logreg_predictions)

0.60365853658536583