In [129]:
%matplotlib inline
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', 100)

In [14]:
import xml.etree.ElementTree as ET
# Parse the training corpus XML into an ElementTree
# (the path is relative to the notebook's working directory).
train_data = ET.parse('line-train.xml')

In [None]:
import string
# Display the punctuation characters; this is the set that `remove_punct`
# (defined in a later cell) filters out of each text.
string.punctuation

In [15]:
# Parse the test corpus XML into an ElementTree
# (the path is relative to the notebook's working directory).
test =  ET.parse('line-test.xml')

In [16]:
def process_children(element):
    """Concatenate the ``text`` of every direct child of ``element``.

    Each child's text is prefixed with one space, so a non-empty result
    always starts with ' '. Children whose ``text`` is ``None`` are skipped.
    Only the direct children's leading ``text`` is used: ``element.text``,
    child tails and nested descendants are not included.

    Args:
        element: an ``xml.etree.ElementTree.Element``.

    Returns:
        str: the joined text, or '' when no child carries any text.
    """
    # Iterate the element itself: Element.getchildren() was deprecated and then
    # removed in Python 3.9, while iteration yields the same direct children.
    # The explicit None check replaces the former try/except TypeError.
    return ''.join(' ' + child.text for child in element if child.text is not None)

In [17]:
from copy import copy
def get_context_list(xmlobject):
    """Build one record per <context> element found in ``xmlobject``.

    The tree is walked in document order. Every <answer> element stores its
    ``senseid`` attribute and its serialized XML in a pending record; the
    next <context> element adds its text (via ``process_children``), emits
    the record and starts a fresh one.

    Args:
        xmlobject: object exposing ``iter()`` over XML elements
            (e.g. the result of ``ET.parse``).

    Returns:
        list[dict]: records with a 'context' key, plus 'senseid' and
        'element' when an <answer> preceded that context.
    """
    records = []
    pending = {}
    for node in xmlobject.iter():
        tag = node.tag
        if tag == 'answer':
            pending['senseid'] = node.attrib.get('senseid')
            pending['element'] = ET.tostring(node).decode("utf-8").strip('\n')
        elif tag == 'context':
            pending['context'] = process_children(node)
            records.append(copy(pending))
            pending = {}
    return records

In [18]:
# Flatten the training tree into a list of per-context record dicts.
data = get_context_list(train_data)

In [19]:
# Tabulate the training records; the column order is fixed explicitly.
df_train = pd.DataFrame(data, columns=["element", 'senseid', 'context'])

In [20]:
def get_context_list_test(xmlobject):
    """Collect the id and context text of every <instance> element.

    Args:
        xmlobject: object exposing ``iter()`` over XML elements
            (e.g. the result of ``ET.parse``).

    Returns:
        list[dict]: one ``{'id': ..., 'context': ...}`` dict per <instance>,
        in document order. 'id' is ``None`` when the attribute is absent.

    Raises:
        IndexError: if an <instance> element has no child elements.
    """
    data = []
    for element in xmlobject.iter('instance'):
        # A fresh dict per instance; no shared dict that needs copying.
        data.append({
            'id': element.attrib.get('id'),
            # The first child of <instance> is used as its context container.
            # Index the element directly: Element.getchildren() was removed
            # in Python 3.9.
            'context': process_children(element[0]),
        })
    return data

In [21]:
# Flatten the test tree into per-instance records.
# Note: this rebinds `data`, replacing the training records held under the same name.
data = get_context_list_test(test)

In [22]:
# Tabulate the test records (instance id and context text).
df_test = pd.DataFrame(data, columns=['id', 'context'])

In [23]:
# Tokenise every test context with NLTK's word tokenizer.
df_test['tokens'] = df_test['context'].apply(nltk.word_tokenize)

In [27]:
# English stop-word list taken from the NLTK stopwords corpus.
stopwords = nltk.corpus.stopwords.words('english')


def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = [char for char in text if char not in string.punctuation]
    return ''.join(kept_chars)


def remove_stopwords(token_list):
    """Return the tokens of ``token_list`` that are not in ``stopwords``."""
    return [word for word in token_list if word not in stopwords]

In [158]:
from sklearn.feature_extraction.text import CountVectorizer

In [160]:
# Bag-of-words vectorizer with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [161]:
# Learn the vocabulary and build the document-term matrix from the context texts.
# NOTE(review): `df` is not defined in any visible cell; it presumably lives
# only in leftover kernel state. Confirm which frame is intended (e.g. the
# training frame) so the notebook survives Restart & Run All.
X = vectorizer.fit_transform(df['context'])

In [163]:
# Vocabulary size, i.e. the number of feature columns. It is read from the
# fitted `vocabulary_` mapping (term -> column index) because
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# whereas `vocabulary_` exists on old and new releases alike.
print(len(vectorizer.vocabulary_))

3303


In [167]:
# Dimensions of the document-term matrix returned by fit_transform.
X.shape

(374, 3303)

In [169]:
# Dimensions of the source frame, for comparison with the matrix above.
# NOTE(review): `df` is not defined in any visible cell; confirm its origin.
df.shape

(374, 4)

In [102]:
import sys
import nltk
import string
import pandas as pd
import xml.etree.ElementTree as ET
from copy import copy
from collections import defaultdict

def process_children(element):
    """Concatenate the ``text`` of every direct child of ``element``.

    Each child's text is prefixed with one space, so a non-empty result
    always starts with ' '. Children whose ``text`` is ``None`` are skipped.
    Only the direct children's leading ``text`` is used: ``element.text``,
    child tails and nested descendants are not included.

    Args:
        element: an ``xml.etree.ElementTree.Element``.

    Returns:
        str: the joined text, or '' when no child carries any text.
    """
    # Iterate the element itself: Element.getchildren() was deprecated and then
    # removed in Python 3.9, while iteration yields the same direct children.
    # The explicit None check replaces the former try/except TypeError.
    return ''.join(' ' + child.text for child in element if child.text is not None)

def get_context_list_train(xmlobject):
    """Turn the training XML into a list of per-context records.

    Elements are visited in document order. An <answer> element contributes
    its ``senseid`` attribute and its serialized XML to the record being
    built; a <context> element contributes its text (via
    ``process_children``), closes the record and opens a new empty one.

    Args:
        xmlobject: object exposing ``iter()`` over XML elements
            (e.g. the result of ``ET.parse``).

    Returns:
        list[dict]: records with a 'context' key, plus 'senseid' and
        'element' when an <answer> preceded that context.
    """
    rows = []
    current = {}
    for node in xmlobject.iter():
        if node.tag == 'answer':
            current['senseid'] = node.attrib.get('senseid')
            serialized = ET.tostring(node).decode("utf-8")
            current['element'] = serialized.strip('\n')
        elif node.tag == 'context':
            current['context'] = process_children(node)
            rows.append(copy(current))
            current = {}
    return rows

def get_dataframe_train_data(train_file):
    """Load the training XML file into a DataFrame.

    Args:
        train_file: path (or file object) accepted by ``ET.parse``.

    Returns:
        pandas.DataFrame with columns 'element', 'senseid' and 'context',
        one row per record produced by ``get_context_list_train``.
    """
    tree = ET.parse(train_file)
    records = get_context_list_train(tree)
    frame = pd.DataFrame(records, columns=["element", 'senseid', 'context'])
    return frame

def get_context_list_test(xmlobject):
    """Collect the id and context text of every <instance> element.

    Args:
        xmlobject: object exposing ``iter()`` over XML elements
            (e.g. the result of ``ET.parse``).

    Returns:
        list[dict]: one ``{'id': ..., 'context': ...}`` dict per <instance>,
        in document order. 'id' is ``None`` when the attribute is absent.

    Raises:
        IndexError: if an <instance> element has no child elements.
    """
    data = []
    for element in xmlobject.iter('instance'):
        # A fresh dict per instance; no shared dict that needs copying.
        data.append({
            'id': element.attrib.get('id'),
            # The first child of <instance> is used as its context container.
            # Index the element directly: Element.getchildren() was removed
            # in Python 3.9.
            'context': process_children(element[0]),
        })
    return data

def get_dataframe_test_data(test_file):
    """Load the test XML file into a DataFrame.

    Args:
        test_file: path (or file object) accepted by ``ET.parse``.

    Returns:
        pandas.DataFrame with columns 'id' and 'context', one row per
        record produced by ``get_context_list_test``.
    """
    tree = ET.parse(test_file)
    records = get_context_list_test(tree)
    frame = pd.DataFrame(records, columns=['id', 'context'])
    return frame

def clean_data(df, column_name):
    """Add a 'word_tokens' column of lower-cased, stop-word-free tokens.

    For every text in ``df[column_name]`` the characters listed in
    ``string.punctuation`` are deleted, the result is lower-cased and split
    on whitespace, and tokens found in NLTK's English stop-word list are
    dropped. Punctuation (including apostrophes) is removed before the
    stop-word comparison, so tokens are matched without it.

    Args:
        df: DataFrame to extend; it is modified in place (the 'word_tokens'
            column is created or overwritten).
        column_name: name of the column holding the raw text strings.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every token.
    stop_set = set(nltk.corpus.stopwords.words('english'))
    # Translation table that deletes every punctuation character in one pass.
    strip_punct = str.maketrans('', '', string.punctuation)

    def tokenize(text):
        words = text.translate(strip_punct).lower().split()
        return [word for word in words if word not in stop_set]

    df['word_tokens'] = df[column_name].apply(tokenize)
    return df

def _count_word_tags(df, senseid):
    """Tally every entry of the 'pos_tag' lists in rows with the given senseid."""
    counts = defaultdict(lambda: 0)
    for pos_set in df.loc[df['senseid'] == senseid, 'pos_tag']:
        for word_tag in pos_set:
            counts[word_tag] += 1
    return counts


def get_tagged_features_freq(df, column_name):
    """Count tagged tokens separately for the 'phone' and 'product' senses.

    Args:
        df: DataFrame with a 'senseid' column and a column of token lists.
            It is modified in place: a 'pos_tag' column holding the
            ``nltk.pos_tag`` result for each row is created or overwritten.
        column_name: name of the column of token lists to tag.

    Returns:
        tuple: ``(phone_counts, product_counts)``, two defaultdicts (default
        0) mapping each tagged token to the number of times it occurs in
        rows of that sense.
    """
    # Tag once: the column was previously tagged twice with identical
    # results, doubling the most expensive step of this function.
    df['pos_tag'] = df[column_name].apply(nltk.pos_tag)
    all_feature_phone = _count_word_tags(df, 'phone')
    all_feature_product = _count_word_tags(df, 'product')
    return all_feature_phone, all_feature_product


In [118]:
# Input corpora (paths are relative to the notebook's working directory).
train_file = 'line-train.xml'
test_file = 'line-test.xml'
# Load both splits into DataFrames.
train_df = get_dataframe_train_data(train_file)
test_df = get_dataframe_test_data(test_file)
# NOTE(review): 'tokens' is not read by any later visible cell; clean_data
# works from 'context' and the tagger from 'word_tokens'.
train_df['tokens'] = train_df['context'].apply(nltk.word_tokenize)
# Adds the cleaned 'word_tokens' column (clean_data mutates and returns the frame).
train_df = clean_data(train_df, 'context')
# Per-sense frequency tables of tagged tokens; also adds 'pos_tag' to train_df.
freq_features_phone, freq_features_product = get_tagged_features_freq(train_df, 'word_tokens')
# Class sizes. Note that product_freq counts every row whose senseid is not
# 'phone', rather than rows equal to 'product'.
# NOTE(review): neither count is used in the visible cells; confirm whether
# they were meant to act as class priors.
phone_freq = train_df[train_df['senseid'] == 'phone'].shape[0]
product_freq = train_df[train_df['senseid'] != 'phone'].shape[0]

In [119]:
# Apply the same cleaning to the test split, then POS-tag its cleaned tokens
# so they can be scored against the training frequency tables.
test_df = clean_data(test_df, 'context')
test_df['pos_tag'] = test_df['word_tokens'].apply(nltk.pos_tag)

In [126]:
def score(pos_set, feature_set):
    """Sum the stored counts of every key in ``pos_set``.

    Args:
        pos_set: iterable of hashable keys (in this notebook, the tagged
            tokens of one instance).
        feature_set: mapping from key to count.

    Returns:
        int: the total of ``feature_set`` counts over ``pos_set``; keys that
        are missing from ``feature_set`` contribute 0, and an empty
        ``pos_set`` scores 0.
    """
    # Use .get() rather than indexing: indexing a defaultdict silently inserts
    # every unseen key (growing the frequency table while scoring), and
    # indexing a plain dict raises KeyError.
    return sum(feature_set.get(word_tag, 0) for word_tag in pos_set)

def sense_predict(df):
    """Pick the sense with the higher score for a single row.

    Args:
        df: row-like object exposing ``phone_score`` and ``product_score``
            attributes (for example a row passed by ``DataFrame.apply`` with
            ``axis=1``).

    Returns:
        str: 'product' when ``product_score`` is strictly greater than
        ``phone_score``; otherwise 'phone' (ties resolve to 'phone').
    """
    # The comparisons used to be inverted (a lower phone score produced
    # 'phone'), which contradicted the np.where-based labelling used to build
    # the 'sense' column in this notebook. The higher score now wins.
    if df.product_score > df.phone_score:
        return 'product'
    return 'phone'

In [121]:
# Score every test instance against both sense-specific frequency tables.
# The second assignment used to write 'phone_score' again, so the
# 'product_score' column that the labelling step reads was never created.
test_df['phone_score'] = test_df['pos_tag'].apply(score, args=(freq_features_phone,))
test_df['product_score'] = test_df['pos_tag'].apply(score, args=(freq_features_product,))

In [130]:
# Label each instance with the higher-scoring sense; 'product' needs a
# strictly greater score, so ties fall back to 'phone'.
test_df['sense'] = np.where(
    test_df['product_score'] > test_df['phone_score'], 'product', 'phone'
)

In [135]:
# Emit one <answer> line per test instance, in row order.
for row in test_df[['id', 'sense']].itertuples(index=False):
    print(f'<answer instance="{row.id}" senseid="{row.sense}"/>')

<answer instance="line-n.w8_059:8174:" senseid="phone"/>
<answer instance="line-n.w7_098:12684:" senseid="phone"/>
<answer instance="line-n.w8_106:13309:" senseid="phone"/>
<answer instance="line-n.w9_40:10187:" senseid="phone"/>
<answer instance="line-n.w9_16:217:" senseid="phone"/>
<answer instance="line-n.w8_119:16927:" senseid="product"/>
<answer instance="line-n.w8_008:13756:" senseid="phone"/>
<answer instance="line-n.w8_041:15186:" senseid="phone"/>
<answer instance="line-n.art7} aphb 05601797:" senseid="phone"/>
<answer instance="line-n.w8_119:2964:" senseid="product"/>
<answer instance="line-n.w7_040:13652:" senseid="phone"/>
<answer instance="line-n.w7_122:2194:" senseid="phone"/>
<answer instance="line-n.art7} aphb 45903907:" senseid="product"/>
<answer instance="line-n.art7} aphb 43602625:" senseid="phone"/>
<answer instance="line-n.w8_034:3995:" senseid="product"/>
<answer instance="line-n.w8_139:696:" senseid="product"/>
<answer instance="line-n.art7} aphb 20801955:" sens

In [94]:
from collections import defaultdict
# Frequency of each tagged token, tallied separately per sense.
all_feature_phone = defaultdict(lambda : 0)
all_feature_product = defaultdict(lambda : 0)
for sense_label, tag_counts in (('phone', all_feature_phone),
                                ('product', all_feature_product)):
    sense_rows = train_df.loc[train_df['senseid'] == sense_label, 'pos_tag']
    for tagged_tokens in sense_rows:
        for word_tag in tagged_tokens:
            tag_counts[word_tag] += 1

defaultdict(<function __main__.<lambda>()>,
            {('new', 'JJ'): 37,
             ('york', 'NN'): 10,
             ('plan', 'NN'): 3,
             ('froze', 'VBP'): 1,
             ('basic', 'JJ'): 1,
             ('rates', 'NNS'): 3,
             ('offered', 'VBN'): 1,
             ('protection', 'NN'): 1,
             ('nynex', 'JJ'): 1,
             ('economic', 'JJ'): 3,
             ('downturn', 'NN'): 1,
             ('sharply', 'RB'): 1,
             ('cut', 'JJ'): 1,
             ('demand', 'NN'): 1,
             ('didnt', 'NN'): 4,
             ('offer', 'VBP'): 3,
             ('flexible', 'JJ'): 1,
             ('pricing', 'NN'): 1,
             ('contrast', 'NN'): 2,
             ('california', 'NN'): 2,
             ('economy', 'NN'): 4,
             ('booming', 'VBG'): 2,
             ('45', 'CD'): 1,
             ('access', 'NN'): 20,
             ('far', 'RB'): 3,
             ('technique', 'NN'): 1,
             ('working', 'VBG'): 2,
             ('since', 'IN'