In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from string import ascii_lowercase

In [2]:
df = pd.read_csv('./data/derived/words_of_interest.csv')
df.head()

Unnamed: 0,word,first,last,len,prefix,cls,suffix
0,abalienate,a,e,10,l,ie,n
1,abalienated,a,d,11,l,ie,n
2,abalienating,a,g,12,l,ie,n
3,abalienation,a,n,12,l,ie,n
4,abbacies,a,s,8,c,ie,s


In [3]:
x_features = []
letters = list(ascii_lowercase)
letters.append('')
for feat in ['first', 'last', 'prefix', 'suffix']:
    for l in letters:
        oh_name = '_'.join([feat, l])
        x_features.append(oh_name)
        df[oh_name] = (df[feat] == l).astype(np.int)

x_features.append('len')
#df['cls'] = (df.cls == 'ei').astype(np.int)
df.head()
    

Unnamed: 0,word,first,last,len,prefix,cls,suffix,first_a,first_b,first_c,...,suffix_r,suffix_s,suffix_t,suffix_u,suffix_v,suffix_w,suffix_x,suffix_y,suffix_z,suffix_
0,abalienate,a,e,10,l,ie,n,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,abalienated,a,d,11,l,ie,n,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,abalienating,a,g,12,l,ie,n,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,abalienation,a,n,12,l,ie,n,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,abbacies,a,s,8,c,ie,s,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
X = df[x_features].values
Y = (df.cls == 'ei').astype(np.int)
clf = DecisionTreeClassifier(max_depth=3)
clf = clf.fit(X, Y)

In [5]:
export_graphviz(clf, max_depth=4, feature_names=x_features, out_file='./data/derived/dtree.dot')

dot -Tpng ./data/derived/dtree.dot -o ./data/derived/dtree.png

In [6]:
plot_tree(clf)

[Text(248.0, 323.4, 'X[99] <= 0.5\ngini = 0.37\nsamples = 17740\nvalue = [13397, 4343]'),
 Text(124.0, 230.99999999999997, 'X[98] <= 0.5\ngini = 0.439\nsamples = 11445\nvalue = [7720, 3725]'),
 Text(62.0, 138.6, 'X[87] <= 0.5\ngini = 0.479\nsamples = 8862\nvalue = [5332, 3530]'),
 Text(31.0, 46.19999999999999, 'gini = 0.467\nsamples = 8332\nvalue = [5237, 3095]'),
 Text(93.0, 46.19999999999999, 'gini = 0.294\nsamples = 530\nvalue = [95, 435]'),
 Text(186.0, 138.6, 'X[44] <= 0.5\ngini = 0.14\nsamples = 2583\nvalue = [2388, 195]'),
 Text(155.0, 46.19999999999999, 'gini = 0.306\nsamples = 913\nvalue = [741, 172]'),
 Text(217.0, 46.19999999999999, 'gini = 0.027\nsamples = 1670\nvalue = [1647, 23]'),
 Text(372.0, 230.99999999999997, 'X[39] <= 0.5\ngini = 0.177\nsamples = 6295\nvalue = [5677, 618]'),
 Text(310.0, 138.6, 'X[29] <= 0.5\ngini = 0.149\nsamples = 6174\nvalue = [5672, 502]'),
 Text(279.0, 46.19999999999999, 'gini = 0.133\nsamples = 6106\nvalue = [5670, 436]'),
 Text(341.0, 46.1999

In [7]:
df[df.suffix_s == 0].groupby('cls').count()

Unnamed: 0_level_0,word,first,last,len,prefix,suffix,first_a,first_b,first_c,first_d,...,suffix_r,suffix_s,suffix_t,suffix_u,suffix_v,suffix_w,suffix_x,suffix_y,suffix_z,suffix_
cls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ei,3725,3725,3725,3725,3628,3597,3725,3725,3725,3725,...,3725,3725,3725,3725,3725,3725,3725,3725,3725,3725
ie,7720,7720,7720,7720,7692,6655,7720,7720,7720,7720,...,7720,7720,7720,7720,7720,7720,7720,7720,7720,7720


# Evaluate possible strategies

In [8]:
def always_ie(row):
    return 'ie'

def i_before_e_except_after_c(row):
    return 'ei' if row.prefix == 'c' else 'ie'

def i_before_e_except_after_w(row):
    return 'ei' if row.prefix == 'w' else 'ie'

def i_before_e_except_at_beginning(row):
    return 'ei' if row.prefix == '' else 'ie'

def i_before_e_except_after_e(row):
    return 'ei' if row.prefix == 'e' else 'ie'

def dtree(row):
    return 'ei' if clf.predict([row[x_features]])[0] else 'ie'

df['always_ie'] = df.apply(always_ie, axis=1)
df['ib4exc'] = df.apply(i_before_e_except_after_c, axis=1)
df['ib4exw'] = df.apply(i_before_e_except_after_w, axis=1)
df['ib4ex'] = df.apply(i_before_e_except_at_beginning, axis=1)
df['ib4exe'] = df.apply(i_before_e_except_after_e, axis=1)
df['dtree'] = df.apply(dtree, axis=1)


rules = ['always_ie', 'ib4exc', 'ib4exw', 'ib4ex', 'ib4exe', 'dtree']
row_cnt = float(df.shape[0])
results = []
for r in rules:
    n_correct = df[df.cls == df[r]].shape[0]
    results.append((r, n_correct, n_correct/row_cnt))

results_df = pd.DataFrame(results, columns=['rule', 'n_correct', 'accuracy'])
results_df.sort_values('accuracy', ascending=False)

Unnamed: 0,rule,n_correct,accuracy
5,dtree,13912,0.784216
4,ib4exe,13505,0.761274
2,ib4exw,13496,0.760767
0,always_ie,13397,0.755186
3,ib4ex,13397,0.755186
1,ib4exc,12899,0.727114


# Conclusion
 - 'ie always' is more reliable than i before e, except after c
 - w & e may be better prefixes to replace the traditional rule

# Future Analysis
 - Analyizing a lemmatized set of words would reduce the impact of word suffixes (ing, ist, ism) and duplicate root words
 - An interesting (and more relevant to our purpose) dataset would be commonly misspelled words