# Download the tagged corpus

In [1]:
# Create data/ first so that the final `mv` cannot fail on a fresh checkout.
! mkdir -p data && wget https://raw.githubusercontent.com/apertium/apertium-eng/master/texts/eng.tagged && mv eng.tagged data/eng.tagged

--2019-04-04 23:11:30--  https://raw.githubusercontent.com/apertium/apertium-eng/master/texts/eng.tagged
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.240.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.240.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 645001 (630K) [text/plain]
Saving to: ‘eng.tagged’


2019-04-04 23:11:34 (199 KB/s) - ‘eng.tagged’ saved [645001/645001]



# Load the data

In [2]:
# Read the tagged corpus, one lexical unit per line. The encoding is given
# explicitly so that reading does not depend on the platform's default locale
# (assumes the corpus is UTF-8, as Apertium text resources normally are).
with open('data/eng.tagged', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [3]:
lines[0:20]

['^Politics/politics<n><sg>$\n',
 '^in/in<pr>$\n',
 '^Afghanistan/Afghanistan<np><top><sg>$\n',
 '^has/have<vbhaver><pres><p3><sg>$\n',
 '^historically/historically<adv>$\n',
 '^consisted/consist<vblex><pp>$\n',
 '^of/of<pr>$\n',
 '^power/power<n><sg>$\n',
 '^struggles/struggle<n><pl>$\n',
 '^,/,<cm>$\n',
 '^bloody/*bloody$\n',
 '^coups/coup<n><pl>$\n',
 '^and/and<cnjcoo>$\n',
 '^unstable/unstable<adj>$\n',
 '^transfers/transfer<n><pl>$\n',
 '^of/of<pr>$\n',
 '^power/power<n><sg>$\n',
 '^./.<sent>$\n',
 '^With/with<pr>$\n',
 '^the/the<det><def><sp>$\n']

In [4]:
# Rebuild the first sentence from the surface forms: the text between the
# leading '^' and the first '/' of each of the first 18 lines.
' '.join(line[1:line.find('/')] for line in lines[:18])

'Politics in Afghanistan has historically consisted of power struggles , bloody coups and unstable transfers of power .'

# Build the corpus

In [5]:
import pandas as pd

def split_X_y(file_lines):
    '''Parse Apertium-tagged lines into a token/target DataFrame.

    Each line is expected to hold one lexical unit such as
    ``^With/with<pr>$``: a leading ``^``, the surface form, a ``/``, the
    analysis, and a closing ``$``.

    Parameters
    ----------
    file_lines : iterable of str
        Raw lines of the tagged corpus (trailing newlines are allowed).

    Returns
    -------
    pandas.DataFrame
        One row per parsed line with the columns ``token`` (surface form)
        and ``target`` (analysis). When a line carries several analyses
        separated by ``/``, the first one is used.

    Notes
    -----
    Blank lines and lines without a ``token/analysis`` pair are skipped.
    '''
    tokens = []
    targets = []
    for raw_line in file_lines:
        unit = raw_line.strip()
        # Remove the lexical-unit delimiters before splitting, so that the
        # closing '$' is never mistaken for part of an analysis.
        if unit.startswith('^'):
            unit = unit[1:]
        if unit.endswith('$'):
            unit = unit[:-1]

        token, separator, analyses = unit.partition('/')
        if not token or not separator:
            # Blank or malformed line: there is nothing to learn from it.
            continue

        tokens.append(token)
        targets.append(analyses.split('/')[0])

    return pd.DataFrame({
        'token': tokens,
        'target': targets
    })

In [6]:
corpus_df = split_X_y(lines)

In [7]:
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29650 entries, 0 to 29649
Data columns (total 2 columns):
token     29650 non-null object
target    29650 non-null object
dtypes: object(2)
memory usage: 463.4+ KB


In [8]:
corpus_df.isnull().sum()

token     0
target    0
dtype: int64

# [TODO] Remove the stopwords from the corpus

# Split the tokens into train/test sets

In [9]:
from sklearn.model_selection import train_test_split
# corpus_df holds one row per token, so this is a token-level split
# (10% of the rows go to the test set) with a fixed seed for reproducibility.
train_df, test_df = train_test_split(corpus_df, test_size=0.1, random_state=42)

In [10]:
train_df.head()

Unnamed: 0,token,target
11909,millionaires,millionaire<n><pl>
1827,rate,rate<n><sg>
10299,interrupter,*interrupter
640,.,.<sent>
15046,and,and<cnjcoo>


In [11]:
test_df.head()

Unnamed: 0,token,target
25628,of,of<pr>
7249,frustrated,frustrate<vblex><pp>
9043,following,follow<vblex><ger>
8164,",",",<cm>"
26082,otherwise,otherwise<adv>


# Build the classification model

In [12]:
import numpy as np

# Count how often each analysis occurs in the training set and turn the
# relative frequency into a weight: the negative natural log of the
# frequency, so frequent analyses get small values.
unigrams_counts = (
    train_df['target']
    .value_counts()
    .rename_axis('analysis')
    .reset_index(name='count')
)
unigrams_counts['log_prob'] = -np.log(unigrams_counts['count'] / len(train_df))
unigrams_counts.head()

Unnamed: 0,analysis,count,log_prob
0,the<det><def><sp>,1862,2.66245
1,",<cm>",1475,2.895444
2,.<sent>,1031,3.253572
3,of<pr>,930,3.356672
4,and<cnjcoo>,704,3.635079


In [13]:
# Lookup table: analysis string -> its negative-log-probability weight.
prob_dict = unigrams_counts.set_index('analysis')['log_prob'].to_dict()

# Run Apertium's analyzer

In [14]:
import settings
import subprocess
def get_apertium_analyses(token):
    '''Return the analyses that Apertium's analyzer produces for ``token``.

    Runs the equivalent of
    ``echo token | apertium-destxt | lt-proc <analyzer>`` as two child
    processes wired together from Python. No shell is involved, so a token
    containing quotes, ``;``, ``|``, ``$`` or other shell metacharacters is
    analysed literally instead of being interpreted as shell syntax.

    Parameters
    ----------
    token : str
        Surface form to analyse.

    Returns
    -------
    list of str
        The ``/``-separated analyses of the first lexical unit in lt-proc's
        output, e.g. ``['store<n><pl>', 'store<vblex><pres><p3><sg>']``.
    '''
    import os

    # The shell used to expand '~' and environment variables in the configured
    # path; keep doing so now that the command no longer goes through a shell.
    analyzer_path = os.path.expanduser(
        os.path.expandvars(settings.APERTIUM_ANALYZER_BIN_LOC))

    # Stage 1: deformat the token (stands in for `echo token | apertium-destxt`).
    deformatted = subprocess.run(
        ['apertium-destxt'],
        input='{}\n'.format(token).encode(),
        stdout=subprocess.PIPE).stdout

    # Stage 2: morphological analysis of the deformatted text.
    analyses = subprocess.run(
        ['lt-proc', analyzer_path],
        input=deformatted,
        stdout=subprocess.PIPE).stdout.decode()

    # Keep what lies between the first '/' and the first '$', i.e. the
    # analyses of the first lexical unit.
    return analyses[analyses.find('/') + 1: analyses.find('$')].split('/')

In [15]:
get_apertium_analyses('stores')

['store<n><pl>', 'store<vblex><pres><p3><sg>']

# Rank the analyses

In [16]:
def rank_analyses(token):
    '''Return ``(weight, analysis)`` pairs for ``token``, most probable first.

    The weight of each analysis is looked up in ``prob_dict``; an analysis
    that is not in the table gets ``inf`` and therefore sorts last. Pairs are
    ordered by ascending weight, with ties broken by the analysis string.
    '''
    scored = []
    for candidate in get_apertium_analyses(token):
        scored.append((prob_dict.get(candidate, np.inf), candidate))

    return sorted(scored)

In [17]:
# Smoke test on one randomly sampled test token; the result changes between
# runs. Analyses missing from prob_dict are reported with weight inf.
rank_analyses(test_df.sample().loc[:, 'token'].values[0])

[(inf, 'zinc<n><sg>')]

# Generate the one symbol tokens

In [18]:
import re
def get_symbols_from_target(target):
    '''Return the ``<...>`` tags found in ``target`` as one tab-separated string.

    The result is an empty string when ``target`` contains no tag.
    '''
    tags = [match.group(0) for match in re.finditer(r'<.+?>', target)]
    return '\t'.join(tags)

In [19]:
# Collect every distinct tag seen in the training analyses, in order of first
# appearance; analyses without tags contribute empty strings, which are dropped.
one_symbol_tokens = list(dict.fromkeys(
    symbol
    for symbol in '\t'.join(
        unigrams_counts['analysis'].apply(get_symbols_from_target)
    ).split('\t')
    if symbol
))
one_symbol_tokens[:5]

['<det>', '<def>', '<sp>', '<cm>', '<sent>']

In [20]:
# Write the tags one per line, without a trailing newline.
with open('fst_input_data/one_sym', 'w') as f:
    print(*one_symbol_tokens, sep='\n', end='', file=f)

# Generate the string pairs

In [21]:
def tuple_to_stringpair(row, weights=None):
    '''Format one corpus row as a weighted string pair, ``input:output<TAB>weight``.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'token'`` and ``'target'`` (for example a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    weights : mapping, optional
        Maps an analysis to its weight. Defaults to the notebook-level
        ``prob_dict``.

    Returns
    -------
    str
        ``token:target<TAB>weight``. Any literal ``:`` inside the token or
        the target is escaped as ``\\:`` so that it cannot be confused with
        the pair separator.
    '''
    if weights is None:
        weights = prob_dict

    # '\\:' is an explicit backslash followed by a colon; the previous '\:'
    # spelling relied on an invalid escape sequence.
    token = row['token'].replace(':', '\\:')
    target = row['target'].replace(':', '\\:')
    return '{}:{}\t{}'.format(token, target, weights.get(row['target']))

In [22]:
# One weighted string pair per distinct (token, target) row of the training set.
string_pairs = (
    train_df
    .drop_duplicates()
    .apply(tuple_to_stringpair, axis=1)
)

In [23]:
# Write the string pairs one per line, without a trailing newline.
with open('fst_input_data/str_pairs', 'w') as f:
    print(*string_pairs, sep='\n', end='', file=f)

# Generate a new hfst weighted transducer

In [24]:
# Compile the weighted string pairs into an HFST transducer.
# The fixed command is passed as an argument list, so no shell is needed, and
# check=True makes a failing build raise CalledProcessError instead of letting
# later cells continue with a missing or stale model.
subprocess.run(
        ['hfst-strings2fst',
         '-i', 'fst_input_data/str_pairs',
         '-o', 'bin/hfst_model',
         '-j',
         '-m', 'fst_input_data/one_sym'],
        stdout=subprocess.PIPE,
        check=True)

CompletedProcess(args=['hfst-strings2fst -i fst_input_data/str_pairs -o bin/hfst_model -j -m fst_input_data/one_sym'], returncode=0, stdout=b'')

# Convert hfst transducer to att format

In [25]:
# Dump the HFST transducer to AT&T text format.
# The fixed command is passed as an argument list, so no shell is needed, and
# check=True makes a failing conversion raise CalledProcessError instead of
# going unnoticed.
subprocess.run(
        ['hfst-fst2txt',
         '-i', 'bin/hfst_model',
         '-o', 'fst_input_data/letters.att'],
        stdout=subprocess.PIPE,
        check=True)


CompletedProcess(args=['hfst-fst2txt -i bin/hfst_model -o fst_input_data/letters.att'], returncode=0, stdout=b'')

# Generate a new weighted transducer from att format

In [26]:
# Compile the AT&T file into an lttoolbox binary ('lr' direction).
# The fixed command is passed as an argument list, so no shell is needed, and
# check=True makes a failing compilation raise CalledProcessError instead of
# going unnoticed.
subprocess.run(
        ['lt-comp', 'lr', 'fst_input_data/letters.att', 'bin/apert_model'],
        stdout=subprocess.PIPE,
        check=True)


CompletedProcess(args=['lt-comp lr fst_input_data/letters.att bin/apert_model'], returncode=0, stdout=b'main@standard 34369 34368\nfinal@inconditional 29 28\n')

# Generate analysis from new weighted transducer
Note: You will need to build the master branch of lttoolbox so that the analysis weights are computed correctly.

In [27]:
import settings
import subprocess
def get_apertium_analyses(token,
                          lt_proc_bin='../lttoolbox/lttoolbox/lt-proc',
                          model_path='bin/apert_model'):
    '''Return the weighted analyses of ``token`` from the new transducer.

    Runs the equivalent of
    ``echo token | apertium-destxt | <lt_proc_bin> <model_path> -W`` as two
    child processes wired together from Python. No shell is involved, so a
    token containing quotes, ``;``, ``|``, ``$`` or other shell
    metacharacters is analysed literally instead of being interpreted as
    shell syntax.

    Once this cell has run, this definition replaces the unweighted
    ``get_apertium_analyses`` defined earlier in the notebook.

    Parameters
    ----------
    token : str
        Surface form to analyse.
    lt_proc_bin : str, optional
        Path to the ``lt-proc`` executable to run.
    model_path : str, optional
        Path to the compiled transducer given to ``lt-proc``.

    Returns
    -------
    list of str
        The ``/``-separated analyses of the first lexical unit in lt-proc's
        output, each carrying a weight tag, e.g.
        ``'do<vblex><pres><W:9.498710>'``.
    '''
    # Stage 1: deformat the token (stands in for `echo token | apertium-destxt`).
    deformatted = subprocess.run(
        ['apertium-destxt'],
        input='{}\n'.format(token).encode(),
        stdout=subprocess.PIPE).stdout

    # Stage 2: weighted morphological analysis of the deformatted text.
    analyses = subprocess.run(
        [lt_proc_bin, model_path, '-W'],
        input=deformatted,
        stdout=subprocess.PIPE).stdout.decode()

    # Keep what lies between the first '/' and the first '$', i.e. the
    # analyses of the first lexical unit.
    return analyses[analyses.find('/') + 1: analyses.find('$')].split('/')

In [28]:
get_apertium_analyses('do')

['do<vblex><pres><W:9.498710>',
 'do<vbdo><pres><W:9.498710>',
 'do<vblex><inf><W:10.191857>']