In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import classification_report

import pickle

In [2]:
# Base sheet: the labelled surnames (columns used later: 'surname_plain', 'irish_flag').
# Note: the encoding is not removing spaces, apostrophes, or accents on characters.
# They were removed manually in the sheet for now. #TechDebt
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vTqd_a5YgMTVUGfc6F9dx62fvq2ptPvyq6cGpfAtPMYWTDn1qiVg2_ma79xS2NyQ-CHkOXy_MzCd03I/pub?gid=712009217&single=true&output=csv')

# Sheets holding surnames from other countries; every row from these gets flag 0.
country_urls = [
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vTqd_a5YgMTVUGfc6F9dx62fvq2ptPvyq6cGpfAtPMYWTDn1qiVg2_ma79xS2NyQ-CHkOXy_MzCd03I/pub?gid=1621398561&single=true&output=csv',
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vTqd_a5YgMTVUGfc6F9dx62fvq2ptPvyq6cGpfAtPMYWTDn1qiVg2_ma79xS2NyQ-CHkOXy_MzCd03I/pub?gid=1688323099&single=true&output=csv',
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vTqd_a5YgMTVUGfc6F9dx62fvq2ptPvyq6cGpfAtPMYWTDn1qiVg2_ma79xS2NyQ-CHkOXy_MzCd03I/pub?gid=311250649&single=true&output=csv',
]

big_surname_list = []

# Only two columns are read from each sheet: index 1 (the surname) and index 5
# (how many times that surname should appear). Each surname is repeated that
# many times so that more frequent names carry more weight.
for url in country_urls:
    country_df = pd.read_csv(url, usecols=[1, 5])
    country_surnames = country_df.values.tolist()

    for surname, count in country_surnames:
        big_surname_list.extend([surname] * count)

# One 0 flag per added surname.
surname_flags = [0] * len(big_surname_list)

# Assemble the extra rows and append them to the master dataframe.
new_surnames = pd.DataFrame({'surname_plain': big_surname_list, 'irish_flag': surname_flags})
df = pd.concat([df, new_surnames])

In [5]:
# Hold out 15% of the rows for testing, stratified on the label so both splits
# keep the same class balance.
# NOTE(review): rows are split at random, and the country lists are expanded by
# count when loaded, so a surname occurring on several rows can land in both the
# train and the test split -- the test score may be optimistic. Confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df['surname_plain'],
    df['irish_flag'],
    test_size=0.15,
    random_state=123,
    stratify=df['irish_flag'],
)

# Character n-gram counts feeding a Complement Naive Bayes classifier.
# (Steps for RandomForestClassifier and KNeighborsClassifier, with a grid over
# knn n_neighbors 1-5 and uniform/distance weights, were left disabled.)
pipe = Pipeline([
    ('vec', CountVectorizer()),
    ('clf', ComplementNB()),
])
param_grid = [{
    'vec__ngram_range': [(3, 3), (4, 4), (5, 5)],
    'vec__analyzer': ['char', 'char_wb'],
    'clf__alpha': [0.01, 0.1, 0.5, 1, 10],
}]

# F-beta with beta=2 weights recall more heavily than precision.
f2_scorer = make_scorer(fbeta_score, beta=2)

gs = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=1, scoring=f2_scorer)
gs.fit(X_train, y_train)

final_model_params = gs.best_params_
print(f"Best model parameters: {gs.best_params_}", end="\n\n")
# best_score_ is the mean cross-validated score of the best candidate over the
# 3 folds, not a score measured on the full training set.
print(f"Best training F2 score: {gs.best_score_}", end="\n\n")
# Per-candidate results are available in gs.cv_results_['mean_test_score'] and
# gs.cv_results_['std_test_score'].

# One row per test surname: text, true label, class probabilities, predicted
# label, class log probabilities (columns are left unnamed: 0..4).
test_pred = gs.predict(X_test)
df_pred = pd.DataFrame(zip(
    X_test,
    y_test,
    gs.predict_proba(X_test),
    test_pred,
    gs.predict_log_proba(X_test),
))

print(f"F2 score on test data: {gs.score(X_test, y_test)}", end="\n\n")
print("Classification Report")
print(classification_report(y_test, test_pred))



Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best model parameters: {'clf__alpha': 0.1, 'vec__analyzer': 'char_wb', 'vec__ngram_range': (5, 5)}

Best training F2 score: 0.8693676972849392

F2 score on test data: 0.8968347010550997

Classification Report
              precision    recall  f1-score   support

           0       0.97      0.93      0.95      1536
           1       0.80      0.92      0.86       497

    accuracy                           0.93      2033
   macro avg       0.89      0.93      0.90      2033
weighted avg       0.93      0.93      0.93      2033



In [6]:
# Final model configuration: word-boundary character 4-grams counted by
# CountVectorizer, classified with ComplementNB(alpha=1).
# NOTE(review): these values are hard-coded and differ from the best parameters
# in the grid-search output saved in this notebook (alpha=0.1, ngram_range=(5, 5)).
# The n-gram lookups further down use 4-character keys, so confirm the choice is
# intentional before changing it.
final_pipe = Pipeline(steps=[
    ('vec', CountVectorizer(analyzer='char_wb', ngram_range=(4, 4))),
    ('clf', ComplementNB(alpha=1)),
])

In [7]:
# Fit the chosen pipeline on every row of df (no hold-out); this fitted object is
# what the later cells inspect and pickle.
final_model = final_pipe.fit(df['surname_plain'], df['irish_flag'])

In [8]:
# Log prior of class index 1 from the fitted classifier. This is a natural-log
# value, not a probability (apply exp() to get the probability).
# Class index 1 is presumably irish_flag == 1, i.e. the Irish class -- confirm
# against final_model['clf'].classes_.
final_model['clf'].class_log_prior_[1]

-1.4083782189053284

In [9]:
# Row 0 of feature_log_prob_, negated. ComplementNB documents feature_log_prob_ as
# weights computed from each class's complement, so with two classes the negated
# class-0 row is read here as the feature log probabilities of class 1 (the Irish
# class, assuming irish_flag == 1 maps to class index 1 -- TODO confirm via classes_).
final_model['clf'].feature_log_prob_[0]*-1

array([ -9.02232262, -10.12093491, -10.12093491, ..., -10.12093491,
       -10.12093491, -10.12093491])

In [10]:
def get_salient_words(nb_clf, vect, class_ind):
    """Pair every vocabulary n-gram with its negated log weight, highest first.

    Parameters
    ----------
    nb_clf : fitted Naive Bayes classifier exposing ``feature_log_prob_``
        Intended for ComplementNB; see Notes for why the values are negated.
    vect : fitted CountVectorizer
        Supplies the vocabulary, in the same column order as
        ``nb_clf.feature_log_prob_``.
    class_ind : int
        Row of ``nb_clf.feature_log_prob_`` to read.

    Returns
    -------
    list of (str, float)
        ``(ngram, -feature_log_prob_[class_ind][i])`` tuples sorted by the
        negated value in descending order. Ties keep vocabulary order.

    Notes
    -----
    Row ``class_ind`` is multiplied by -1 before sorting. For a two-class
    ComplementNB the feature log probabilities for Class 1 are obtained by
    taking the feature log probabilities stored for Class 0 and multiplying them
    by -1, so pass ``class_ind=0`` to get the Class 1 values. For other Naive
    Bayes classifiers (e.g. MultinomialNB) the negation is not meaningful.
    """
    # CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
    # removed in 1.2. Prefer its replacement, and fall back so that older
    # installations keep working.
    if hasattr(vect, "get_feature_names_out"):
        # The replacement returns an ndarray; keep plain ``str`` items so the
        # result has the same element types as before.
        words = [str(word) for word in vect.get_feature_names_out()]
    else:
        words = vect.get_feature_names()

    zipped = list(zip(words, nb_clf.feature_log_prob_[class_ind]*-1))
    sorted_zip = sorted(zipped, key=lambda t: t[1], reverse=True)

    return sorted_zip

# class_ind=0: get_salient_words negates row 0, which (per the note in its
# docstring) gives the values for Class 1, sorted highest first.
pos_all = get_salient_words(final_model['clf'], final_model['vec'], 0)

In [11]:
# Look up the value stored for the n-gram 'kwu ' and print it; print 'nope' when
# that n-gram is not in the list. Only the "not found" case is handled: any other
# problem (e.g. pos_all not being defined yet) now surfaces as an error instead
# of being silently reported as 'nope'.
kwu_log_prob = next((log_prob for ngram, log_prob in pos_all if ngram == 'kwu '), None)
if kwu_log_prob is None:
    print('nope')
else:
    print(kwu_log_prob)
# Murphy score is -39.48

-10.120934911305456


In [12]:
# Save the sorted (ngram, value) list so it can be reloaded without refitting.
# NOTE(review): open() does not create "saved-objects/"; the directory must
# already exist relative to the notebook's working directory.
with open("saved-objects/irish_log_probs.p", "wb") as p:
    pickle.dump(pos_all, p)

In [33]:
# print(neg_salient_top_20)  # leftover: neg_salient_top_20 is not defined in the cells shown here
# Tabulate the sorted (ngram, value) pairs; row order follows pos_all (highest value first).
df_features = pd.DataFrame(pos_all, columns=['ngram','log_prob'])

In [34]:
# Count how many n-grams share each distinct log_prob value.
df_features['log_prob'].value_counts()
# NOTE(review): the tallies below do not match the output saved under this cell
# (e.g. 4939 n-grams on the most common value) -- presumably from an earlier run.
# 910 log_prob values that aren't the last category
# 3697 values that are the last category
# 4607 total

-10.120854    4939
-9.427707      789
-9.022242      242
-8.734560      186
-8.511417      157
              ... 
-5.946467        1
-6.009981        1
-6.208831        1
-6.249653        1
-6.131870        1
Name: log_prob, Length: 68, dtype: int64

In [35]:
def char_wb_ngrams(name, n=4):
    """Split ``name`` into the word-boundary character n-grams used as lookup keys.

    Each whitespace-separated word is lower-cased, padded with one space on
    either side, and cut into overlapping windows of ``n`` characters, e.g.
    ``'Murphy'`` -> ``[' mur', 'murp', 'urph', 'rphy', 'phy ']``. A padded word
    shorter than ``n`` is returned whole as a single key.

    These are the same keys that used to be typed out by hand in this cell for
    the ``analyzer='char_wb'``, ``ngram_range=(4, 4)`` vectorizer. No accent or
    punctuation stripping is applied.

    Parameters
    ----------
    name : str
        Surname to split.
    n : int, default 4
        Window length; keep it equal to the vectorizer's n-gram size.

    Returns
    -------
    list of str
        The n-grams in left-to-right order.
    """
    ngrams = []
    for word in name.lower().split():
        padded = f" {word} "
        last_start = max(len(padded) - n, 0)
        ngrams.extend(padded[start:start + n] for start in range(last_start + 1))
    return ngrams


# Surnames whose n-gram values are printed below. Bailey, Burkson, Ravichandran,
# Platzfelder and Timothy were inspected the same way before; add them to this
# list to print their n-grams again.
names_to_inspect = ['Murphy', 'Sims']

for name in names_to_inspect:
    print(name)
    for ngram in char_wb_ngrams(name):
        # Prints [[ngram, log_prob]] for a known n-gram, or an empty array when
        # the n-gram is not in df_features.
        print(df_features[df_features['ngram'] == ngram].values)

# Notes recorded from earlier runs (the figures may not match the current model):
#   Murphy        prob = 0.000000000000005 (that's 14 zeros) = 5.0e-15
#                 Murphy index ~6.58
#   Bailey        prob = 0.000000000000000014 (that's 16 zeros) = 1.4e-17
#                 Bailey 7.762;       Murphy index = 6.58/7.762 = .8477
#   Burkson       Burkson 8.18;       Murphy index = 6.58/8.18 = .8044
#   Sims          Sims 8.68;          Murphy index = 6.58/8.68 = .7581
#   Ravichandran  Ravichandran 8.38;  Murphy index = 6.58/8.38 = .7852
#   Platzfelder   (no figure recorded)
#   Timothy       Timothy 8.33;       Murphy index 6.58/8.33 = .7899
#
# Murphy index could be Avg log probability of a name divided by the avg log probability of the name Murphy
# NOTE(review): the figures above divide Murphy's value by the name's value, i.e.
# the inverse of the sentence above -- settle on one definition.


Murphy
[[' mur' -5.751406598605862]]
[['murp' -6.788649940897679]]
[['urph' -6.788649940897679]]
[['rphy' -6.788649940897679]]
[['phy ' -6.6551185482731565]]
Sims
[[' sim' -10.120854451072884]]
[['sims' -10.120854451072884]]
[['ims ' -10.120854451072884]]


In [176]:
# Show the df_features row for the 4-gram ' mur' (with char_wb, the leading space
# marks the start of a word).
df_features[df_features['ngram']==' mur']

Unnamed: 0,ngram,log_prob
0,mur,-6.485653


In [38]:
# Positional slice of the sorted table: rows 925 through 949.
df_features[925:950]

Unnamed: 0,ngram,log_prob
925,larn,-8.329095
926,las,-8.329095
927,ledd,-8.329095
928,ling,-8.329095
929,lloc,-8.329095
930,low,-8.329095
931,lyhu,-8.329095
932,ment,-8.329095
933,mick,-8.329095
934,morg,-8.329095


In [186]:
# Serialize the fitted pipeline (vectorizer + classifier) into the working directory.
# NOTE(review): presumably this file is what the planned app will load; unpickling
# needs a compatible scikit-learn version, and pickles should only be loaded from
# trusted sources.
with open("pickled_CompNB_model.p", "wb") as p:
    pickle.dump(final_model, p)

In [16]:
# Spot check: class log probabilities for a single surname. One column per class,
# in the order of final_model.classes_ (presumably [not Irish, Irish] -- confirm).
final_model.predict_log_proba(['Knickerbocker'])

array([[-0.00583429, -5.14691913]])

In [187]:
# Spot check: class probabilities for a single surname. One column per class, in
# the order of final_model.classes_ (presumably [not Irish, Irish] -- confirm).
final_model.predict_proba(['Sims'])

array([[0.60195858, 0.39804142]])

Okay, so now I have a model. Now I need to put it in a Streamlit app that can receive input from the user and return a response of some sort.