# Download the tagged corpus

In [1]:
! wget https://raw.githubusercontent.com/apertium/apertium-eng/master/texts/eng.tagged && mv eng.tagged data/eng.tagged

--2019-03-21 19:07:18--  https://raw.githubusercontent.com/apertium/apertium-eng/master/texts/eng.tagged
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.240.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.240.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 645001 (630K) [text/plain]
Saving to: ‘eng.tagged’


2019-03-21 19:07:33 (95.9 KB/s) - ‘eng.tagged’ saved [645001/645001]



# Load the data

In [2]:
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default locale encoding.
with open('data/eng.tagged', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [3]:
# Peek at the first 20 raw lines of the tagged corpus.
lines[:20]

['^Politics/politics<n><sg>$\n',
 '^in/in<pr>$\n',
 '^Afghanistan/Afghanistan<np><top><sg>$\n',
 '^has/have<vbhaver><pres><p3><sg>$\n',
 '^historically/historically<adv>$\n',
 '^consisted/consist<vblex><pp>$\n',
 '^of/of<pr>$\n',
 '^power/power<n><sg>$\n',
 '^struggles/struggle<n><pl>$\n',
 '^,/,<cm>$\n',
 '^bloody/*bloody$\n',
 '^coups/coup<n><pl>$\n',
 '^and/and<cnjcoo>$\n',
 '^unstable/unstable<adj>$\n',
 '^transfers/transfer<n><pl>$\n',
 '^of/of<pr>$\n',
 '^power/power<n><sg>$\n',
 '^./.<sent>$\n',
 '^With/with<pr>$\n',
 '^the/the<det><def><sp>$\n']

In [4]:
# Rebuild the first sentence: for each of the first 18 lines keep the text
# between the leading character and the first '/'.
' '.join(raw[1:raw.find('/')] for raw in lines[:18])

'Politics in Afghanistan has historically consisted of power struggles , bloody coups and unstable transfers of power .'

# Build the corpus

In [5]:
import pandas as pd

def split_X_y(file_lines):
    '''Parse Apertium-tagged lines into a token/target DataFrame.

    Each line is expected to look like ``^With/with<pr>$``: the surface
    form, a ``/`` separator and the analysis, wrapped in ``^`` ... ``$``.

    Args:
        file_lines: iterable of raw lines (trailing newlines are allowed).

    Returns:
        pandas.DataFrame with the columns ``token`` (surface form) and
        ``target`` (the first analysis, without the closing ``$``).
        Blank lines, lines with an empty surface form and lines that have
        no ``/`` separator are skipped.
    '''
    tokens = []
    targets = []
    for line in file_lines:
        unit = line.strip()
        # Remove the delimiters from the whole unit, and only when they are
        # actually present, so that neither a missing '^' nor a line with
        # several analyses costs a real character.
        if unit.startswith('^'):
            unit = unit[1:]
        if unit.endswith('$'):
            unit = unit[:-1]

        parts = unit.split('/')
        # Need a non-empty surface form and at least one analysis.
        if len(parts) < 2 or not parts[0]:
            continue

        tokens.append(parts[0])
        targets.append(parts[1])

    return pd.DataFrame({
        'token': tokens,
        'target': targets
    })

In [6]:
# Parse the raw corpus lines into a DataFrame with 'token' and 'target' columns.
corpus_df = split_X_y(lines)

# [TODO] Remove the stopwords from the corpus

# Split the tokens into train/test sets

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows of corpus_df as the test set; the fixed random_state
# keeps the split the same across runs.
train_df, test_df = train_test_split(corpus_df, test_size=0.1, random_state=42)

In [8]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,token,target
11909,millionaires,millionaire<n><pl>
1827,rate,rate<n><sg>
10299,interrupter,*interrupter
640,.,.<sent>
15046,and,and<cnjcoo>


In [9]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,token,target
25628,of,of<pr>
7249,frustrated,frustrate<vblex><pp>
9043,following,follow<vblex><ger>
8164,",",",<cm>"
26082,otherwise,otherwise<adv>


# Build the classification model

In [10]:
import numpy as np

# Count how often each analysis occurs in the training split.
unigrams_counts = (
    train_df['target']
    .value_counts()
    .rename_axis('analysis')
    .reset_index(name='count')
)
# Despite its name, 'log_prob' holds the *negative* log of the relative
# frequency, so a smaller value means a more frequent analysis.
n_train_rows = len(train_df)
unigrams_counts['log_prob'] = -np.log(unigrams_counts['count'] / n_train_rows)
unigrams_counts.head()

Unnamed: 0,analysis,count,log_prob
0,the<det><def><sp>,1862,2.66245
1,",<cm>",1475,2.895444
2,.<sent>,1031,3.253572
3,of<pr>,930,3.356672
4,and<cnjcoo>,704,3.635079


In [11]:
# Lookup table: analysis string -> its 'log_prob' value from unigrams_counts.
prob_dict = unigrams_counts.set_index('analysis')['log_prob'].to_dict()

# Run Apertium's analyzer

In [12]:
import settings
import subprocess
def get_apertium_analyses(token):
    '''Return the analyses Apertium's analyzer produces for `token`.

    The token is fed through ``apertium-destxt`` and then through
    ``lt-proc`` with the analyzer binary named by
    ``settings.APERTIUM_ANALYZER_BIN_LOC``.

    Both programs are started from argument lists, without a shell, so
    characters such as quotes, ``;``, ``*`` or ``$`` in `token` are passed
    through literally instead of being interpreted.

    Args:
        token: surface form to analyse.

    Returns:
        list of str: the ``/``-separated pieces found between the first
        ``/`` and the first ``$`` of the analyzer output, e.g.
        ``['store<n><pl>', 'store<vblex><pres><p3><sg>']`` for ``'stores'``.

    Raises:
        FileNotFoundError: if ``apertium-destxt`` or ``lt-proc`` cannot be
            found on PATH.
    '''
    # `echo` used to append a newline to the token; keep feeding the same bytes.
    deformatted = subprocess.run(
        ['apertium-destxt'],
        input='{}\n'.format(token).encode(),
        stdout=subprocess.PIPE).stdout

    # NOTE(review): assumes APERTIUM_ANALYZER_BIN_LOC is a single plain path
    # that needs no shell expansion (e.g. no '~') - confirm against settings.
    analyses = subprocess.run(
        ['lt-proc', str(settings.APERTIUM_ANALYZER_BIN_LOC)],
        input=deformatted,
        stdout=subprocess.PIPE).stdout.decode()

    return analyses[analyses.find('/') + 1: analyses.find('$')].split('/')

In [13]:
# Example: list the analyses returned for the surface form 'stores'.
get_apertium_analyses('stores')

['store<n><pl>', 'store<vblex><pres><p3><sg>']

# Rank the analyses

In [14]:
def rank_analyses(token):
    '''Return the analyses of `token` as (score, analysis) pairs, best first.

    Each analysis from `get_apertium_analyses` is scored by looking it up in
    `prob_dict`; an analysis that is not in the dictionary gets `np.inf`.
    The pairs are returned in ascending order (ties are ordered by the
    analysis string), so the first tag is the most probable one.
    '''
    scored = []
    for candidate in get_apertium_analyses(token):
        scored.append((prob_dict.get(candidate, np.inf), candidate))

    return sorted(scored)

In [15]:
# Spot-check the ranking on one randomly drawn test token; the draw is
# unseeded, so the token (and the output) changes from run to run.
sampled_token = test_df['token'].sample().iloc[0]
rank_analyses(sampled_token)

[(8.245946739712732, 'second<adj><ord>'),
 (10.191856888768045, 'second<n><sg>'),
 (inf, 'second<adv>'),
 (inf, 'second<vblex><imp>'),
 (inf, 'second<vblex><inf>'),
 (inf, 'second<vblex><pres>')]