In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

In [None]:
%matplotlib notebook
pd.options.display.max_rows = 100
pd.options.display.min_rows = 50

In [None]:
# Load the dictionary as a single-column frame named 'words'.
# keep_default_na=False stops pandas from parsing legitimate words such as
# "null", "nan" or "NA" as missing values (they would otherwise be dropped
# by the later dropna()).
# NOTE(review): header=0 discards the first line of the file as a header row;
# confirm my.dict really starts with a header rather than a word.
words_import = pd.read_csv(
    'my.dict',
    header=0,
    names=['words'],
    keep_default_na=False,
)

In [None]:
# Clean the import: drop missing rows, then replace every entry ending in 's
# by two entries -- one with the trailing 's removed and one with only the
# apostrophe removed (e.g. "cat's" -> "cat" and "cats").
words_import = words_import.dropna()

is_possessive = words_import['words'].str[-2:] == "'s"
s_end = words_import[is_possessive]

s_end_normal = s_end['words'].str.replace("'s", "").to_frame(name='words')
s_end_plural = s_end['words'].str.replace("'", "").to_frame(name='words')

# Keep the non-possessive entries and append both derived forms.
words_import = pd.concat(
    [words_import[~is_possessive], s_end_normal, s_end_plural]
).reset_index(drop=True)

In [None]:
# Build the per-word feature table.
#   words_raw    - the word with apostrophes removed (also the de-duplication key)
#   words_length - number of characters in words_raw
#   first_char / last_char - outer letters of words_raw
#   middle_chars - the inner letters of words_raw, sorted, so two words with
#                  the same outer letters and the same multiset of inner
#                  letters share one signature
# Only words longer than 3 characters are kept.
# Written as a single assign-chain so that no column is ever set on a
# filtered slice (which raised SettingWithCopyWarning), and so the cell can
# be re-run safely: it always starts again from words_import.
words_df = (
    words_import
    .assign(words_raw=lambda df: df['words'].str.replace("'", ""))
    .drop_duplicates(subset=['words_raw'])
    .assign(words_length=lambda df: df['words_raw'].str.len())
    .loc[lambda df: df['words_length'] > 3]
    .assign(
        first_char=lambda df: df['words_raw'].str[0],
        last_char=lambda df: df['words_raw'].str[-1],
        middle_chars=lambda df: df['words_raw'].apply(
            lambda word: ''.join(sorted(word[1:-1]))
        ),
    )
)

In [None]:
# A word's signature is (first letter, last letter, sorted middle letters).
signature_cols = ['first_char', 'last_char', 'middle_chars']

# How many words share each signature.
signature_counts = (
    words_df
    .groupby(by=signature_cols, as_index=False)['words']
    .count()
    .rename(columns={'words': 'count'})
)

# Keep signatures shared by more than one word, number each such group, and
# pull the member words (with all their columns) back in from words_df.
matches = signature_counts[signature_counts['count'] > 1].reset_index(drop=True)
matches['match_index'] = matches.index
matches = matches.merge(words_df, how='left', on=signature_cols)

# Constant column of ones; its per-group running sum later numbers the words
# within each match group.
matches['single'] = 1

In [None]:
# Position of each word inside its match group (1, 2, 3, ...): the running
# sum of the constant 'single' column within each group.
position_in_group = matches.groupby('match_index')['single'].cumsum()

# Pivot to one row per match group, with the group's words spread across
# columns word_1, word_2, ...
matches_flat = (
    matches
    .set_index([matches['match_index'], position_in_group])[['words']]
    .copy()
    .unstack()
)
matches_flat.columns = ['word_' + str(position) for _, position in matches_flat.columns]

# Groups with fewer words than the widest group get empty strings.
matches_flat = matches_flat.fillna('')

# Re-attach the group size and word length, then discard the join key.
count_only = matches[['match_index', 'count', 'words_length']].drop_duplicates()
matches_flat = (
    matches_flat
    .merge(count_only, how='left', left_index=True, right_on='match_index')
    .drop(['match_index'], axis=1)
    .reset_index(drop=True)
)

In [None]:
# Parameters for the summary table shown below.
first_letter = ''          # only show groups whose word_1 starts with this letter; '' shows every group
top_x = 50                 # number of rows to display
sort_on = 'words_length'   # column to sort by, largest first

# word_1 is lower-cased before the comparison, so lower-case the requested
# letter as well; otherwise an upper-case letter silently matches nothing.
first_letter_key = first_letter.lower()

if first_letter_key == '':
    summary = matches_flat.copy()
else:
    summary = matches_flat[matches_flat['word_1'].str.lower().str[0] == first_letter_key]
summary.sort_values(sort_on, ascending=False).head(top_x)

In [None]:
# For each word length: number of words that belong to a match group divided
# by the number of words of that length overall. Lengths that have no
# matched words come out as NaN in the division and are dropped.
matched_per_length = matches.groupby('words_length')['words'].count()
total_per_length = words_df.groupby('words_length')['words'].count()
match_rate = (matched_per_length / total_per_length).dropna()

In [None]:
# Two-panel figure sharing the x axis (word length):
#   left  - normalised histograms of word length for all words and for the
#           words that belong to a match group
#   right - rate of matched words per word length, as a percentage
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=False, figsize=(10, 5))

length_bins = np.arange(0, 25)

# Plain matplotlib histograms are used instead of sns.distplot, which is
# deprecated in recent seaborn releases; density=True gives the same
# normalised histogram that norm_hist=True requested.
ax1.hist(words_df['words_length'], bins=length_bins, density=True, alpha=0.4, label='all')
ax1.hist(matches['words_length'], bins=length_bins, density=True, alpha=0.4, label='matched')
ax1.set_title("Distribution of Words by Length")
ax1.set_ylabel('Proportion of Words', color='gray')
ax1.set_xlabel('Length of Word', color='gray')
# The series labels were set but never displayed; without a legend the two
# overlaid histograms cannot be told apart.
ax1.legend()

ax2.bar(list(match_rate.index), list(match_rate), width=1, color='purple', alpha=0.5)
ax2.set_title("Rate of Matches by Word Length")
ax2.set_xlabel('Length of Word', color='gray')
# Put the percentage axis on the right so it does not crowd the left panel.
ax2.yaxis.tick_right()
ax2.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0, xmax=1))
ax2.yaxis.set_label_position('right')
ax2.set_ylabel('Rate of Words with a match', color='gray')

fig.suptitle('5 letter words are the most likely to be typoglycemic...', fontweight='bold', y=0.95)
fig.tight_layout()
# Leave headroom above the panels for the suptitle.
fig.subplots_adjust(top=0.8)