In [None]:
# pip install chardet
# !pip install pandas

In [1]:
import pandas as pd

import re
import sys

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

import pickle
# from ngram_config import ngram_creator

import chardet

In [2]:
# Load the labelled names data set; its columns are shown by df.head() below.
df = pd.read_csv('irish_names_split.csv') # previously tried: encoding='ascii', errors='replace'
# Note: choosing an encoding does not remove spaces, apostrophes, or accents
# from the names, so those characters were removed by hand for now. #TechDebt
# NOTE(review): read_csv has no `errors=` keyword; the decoding-error option is
# `encoding_errors=` (pandas >= 1.3) - confirm before re-enabling the arguments above.

In [3]:
# Drop rows without a usable surname: every later cell trains on or inspects
# `surname_plain`. Rebinding the result (instead of inplace=True) avoids
# mutating the loaded frame in place, which pandas discourages and which
# cannot be chained.
df = df.dropna(axis=0, subset=['surname_plain'])

In [4]:
# Sanity-check the load: show the first rows and the available columns.
df.head()

Unnamed: 0,full_name,given_name,surname,full_name_plain,given_name_plain,surname_plain,irish_flag
0,Maria Bailey,Maria,Bailey,MariaBailey,Maria,Bailey,1
1,Pat Breen,Pat,Breen,PatBreen,Pat,Breen,1
2,Colm Brophy,Colm,Brophy,ColmBrophy,Colm,Brophy,1
3,Richard Bruton,Richard,Bruton,RichardBruton,Richard,Bruton,1
4,Peter Burke,Peter,Burke,PeterBurke,Peter,Burke,1


In [6]:
# Sample surnames kept around for quick manual checks:
# document = ['Murphy', 'MacGuiness', 'Bailey', 'Sims', 'Knickerbocker']

# Hold out 5% of the surnames for testing; stratifying on the label keeps the
# Irish / non-Irish ratio the same in both splits.
surnames = df['surname_plain']
irish_labels = df['irish_flag']
X_train, X_test, y_train, y_test = train_test_split(
    surnames,
    irish_labels,
    test_size=0.05,
    random_state=42,
    stratify=irish_labels,
)

# Character n-gram counts feeding a Complement Naive Bayes classifier.
# Alternative final steps that were considered:
#   ('rf', RandomForestClassifier()), ('knn', KNeighborsClassifier())
pipe = Pipeline([
    ('vec', CountVectorizer()),
    ('clf', ComplementNB()),
])

# Search space: n-gram width, analyzer flavour and NB smoothing strength.
# KNN options, should that step ever be enabled:
#   'knn__n_neighbors': [1, 2, 3, 4, 5], 'knn__weights': ['uniform', 'distance']
param_grid = [{
    'vec__ngram_range': [(3, 3), (4, 4), (5, 5)],
    'vec__analyzer': ['char', 'char_wb'],
    'clf__alpha': [0.01, 0.1, 0.5, 1, 10],
}]

# F-beta with beta=2 favours recall over precision.
f2_scorer = make_scorer(fbeta_score, beta=2)

gs = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=3,
    verbose=1,
    scoring=f2_scorer,
)
gs.fit(X_train, y_train)

# Keep the winning hyperparameters for building the final model later on.
final_model_params = gs.best_params_
print(f"Best model parameters: {gs.best_params_}")
print()
# best_score_ is the mean cross-validated F2 of the winning candidate.
print(f"Best training F2 score: {gs.best_score_}")
print()
# print(gs.cv_results_['mean_test_score'],'\n',gs.cv_results_['std_test_score'])

# One row per held-out surname: name, true label, class probabilities,
# predicted label and class log-probabilities (columns are left unnamed).
df_pred = pd.DataFrame(
    zip(
        X_test,
        y_test,
        gs.predict_proba(X_test),
        gs.predict(X_test),
        gs.predict_log_proba(X_test),
    )
)

print(f"F2 score on test data: {gs.score(X_test, y_test)}")
print()
print("Classification Report")
print(classification_report(y_test, gs.predict(X_test)))



Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best model parameters: {'clf__alpha': 1, 'vec__analyzer': 'char_wb', 'vec__ngram_range': (4, 4)}

Best training F2 score: 0.4575445881293246

F2 score on test data: 0.5737704918032787

Classification Report
              precision    recall  f1-score   support

           0       0.93      0.84      0.88        62
           1       0.41      0.64      0.50        11

    accuracy                           0.81        73
   macro avg       0.67      0.74      0.69        73
weighted avg       0.85      0.81      0.82        73



In [98]:
# Build the final pipeline from the grid-search result instead of hand-copied
# values, so it cannot drift from `final_model_params` (= gs.best_params_)
# when the data or the search grid changes.
# When this was written the search had selected:
#   vec__analyzer='char_wb', vec__ngram_range=(4, 4), clf__alpha=1
final_pipe = Pipeline([
    ('vec', CountVectorizer()),
    ('clf', ComplementNB()),
])
# The keys of best_params_ use the same "<step>__<param>" names set_params expects.
final_pipe.set_params(**final_model_params)

In [99]:
# Refit the chosen pipeline on ALL rows of df, i.e. including the 5% that were
# held out as X_test above, so the test-set metrics printed earlier do not
# describe this exact model.
# scikit-learn's fit() returns the estimator itself, so `final_model` and
# `final_pipe` refer to the same fitted pipeline.
final_model = final_pipe.fit(df['surname_plain'], df['irish_flag'])

In [148]:
# Show the sparse n-gram count rows for the first two surnames.
# Use transform(), not fit_transform(): the vectorizer inside `final_model` is
# already fitted, and refitting it here would replace the vocabulary that the
# classifier's feature columns are aligned to.
print(final_model['vec'].transform(df['surname_plain'])[0:2])

  (0, 34)	1
  (0, 1073)	1
  (0, 745)	1
  (0, 2303)	1
  (0, 2631)	1
  (1, 67)	1
  (1, 1168)	1
  (1, 3665)	1
  (1, 1599)	1


In [168]:
# Prior of an Irish name (index 1, assuming the classes are ordered [0, 1]).
# The value is a natural-log prior, not a probability: exp(-1.9056) is about 0.149.
final_model['clf'].class_log_prior_[1]

-1.9056091949823255

In [110]:
# Feature log probabilities for the Irish-name class (class 1).
# ComplementNB (with the default norm=False) stores, for each class, the
# NEGATED log probability of a feature under the complement of that class.
# Negating row 0 ("not class 0", i.e. class 1 in this two-class problem)
# therefore gives ordinary log probabilities for the Irish class.
final_model['clf'].feature_log_prob_[0]*-1

array([-8.68287711, -8.68287711, -8.68287711, ..., -8.68287711,
       -8.68287711, -8.68287711])

In [113]:
def get_salient_words(nb_clf, vect, class_ind):
    """Return the vocabulary ranked by negated ``feature_log_prob_`` for one row.

    Parameters
    ----------
    nb_clf : fitted Naive Bayes classifier exposing ``feature_log_prob_``
        Intended for ComplementNB: it stores the negated log probability of
        each feature under the *complement* of a class, so negating row
        ``class_ind`` gives log probabilities for "every class except
        ``class_ind``". For MultinomialNB / BernoulliNB the stored values are
        already plain log probabilities and this negation is not appropriate.
    vect : fitted CountVectorizer (or compatible vectorizer)
        Supplies the feature names, in the same column order as
        ``feature_log_prob_``.
    class_ind : int
        Row of ``feature_log_prob_`` to negate. In a two-class ComplementNB
        model pass 0 to obtain the scores for class 1.

    Returns
    -------
    list
        ``(word, score)`` tuples sorted by score in descending order; ties
        keep their vocabulary order.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on older versions.
    if hasattr(vect, 'get_feature_names_out'):
        raw_names = vect.get_feature_names_out()
    else:
        raw_names = vect.get_feature_names()

    # get_feature_names_out() yields numpy strings; convert to plain str so the
    # result looks the same regardless of the scikit-learn version.
    words = [str(name) for name in raw_names]

    scores = nb_clf.feature_log_prob_[class_ind] * -1
    zipped = list(zip(words, scores))
    sorted_zip = sorted(zipped, key=lambda t: t[1], reverse=True)

    return sorted_zip

# class_ind=0 is deliberate: the function negates ComplementNB's row 0, which
# yields the feature log probabilities for class 1 (Irish names), so `pos_all`
# ranks n-grams from most to least Irish-looking.
pos_all = get_salient_words(final_model['clf'], final_model['vec'], 0)

In [139]:
# pos_all[0:25]

# Murphy score is -39.48

[(' mur', -6.48565252972095),
 ('han ', -6.603435565377333),
 ('murp', -6.603435565377333),
 ('phy ', -6.603435565377333),
 ('rphy', -6.603435565377333),
 ('urph', -6.603435565377333),
 ('lly ', -6.736966958001855),
 ('rke ', -6.736966958001855),
 ('van ', -6.736966958001855),
 ('conn', -6.8911176378291135),
 ('ell ', -6.8911176378291135),
 ('ivan', -6.8911176378291135),
 ('ley ', -6.8911176378291135),
 ('liva', -6.8911176378291135),
 ('ulli', -6.8911176378291135),
 (' bro', -7.073439194623068),
 (' mcg', -7.073439194623068),
 (' obr', -7.073439194623068),
 (' oca', -7.073439194623068),
 (' odo', -7.073439194623068),
 (' osu', -7.073439194623068),
 ('alla', -7.073439194623068),
 ('aly ', -7.073439194623068),
 ('arre', -7.073439194623068),
 ('gan ', -7.073439194623068)]

In [149]:
# print(neg_salient_top_20)
df_features = pd.DataFrame(pos_all, columns=['ngram','log_prob'])

In [157]:
# Count how many n-grams share each distinct log probability.
df_features['log_prob'].value_counts()
# 3697 n-grams fall in the last (lowest) category, -8.68
#   (presumably the smoothed floor for n-grams never seen in an Irish surname - confirm)
# 910 n-grams have a log_prob that isn't the last category
# 4607 n-grams in total

-8.682877    3697
-7.989730     693
-7.584265     122
-7.296583      54
-7.073439      26
-6.891118       6
-6.603436       5
-6.736967       3
-6.485653       1
Name: log_prob, dtype: int64

In [166]:
def char_wb_ngrams(name, n=4):
    """Split a name into the character n-grams the 'char_wb' analyzer produces.

    Each whitespace-separated word is lower-cased and padded with one space on
    either side, then cut into overlapping windows of length ``n``. A padded
    word no longer than ``n`` is emitted once, whole.
    """
    grams = []
    for word in name.lower().split():
        padded = f" {word} "
        if len(padded) <= n:
            grams.append(padded)
        else:
            grams.extend(padded[i:i + n] for i in range(len(padded) - n + 1))
    return grams


def print_ngram_scores(name, features, n=4):
    """Print ``name``, then the ``features`` row(s) matching each of its n-grams.

    ``features`` needs an 'ngram' column. An n-gram that is missing from it
    prints as ``[]``.
    """
    print(name)
    for gram in char_wb_ngrams(name, n):
        print(features[features['ngram'] == gram].values)


# Names currently printed. Others inspected earlier the same way:
# 'Bailey', 'Burkson', 'Sims', 'Platzfelder', 'Timothy'
for name_to_show in ['Murphy', 'Ravichandran']:
    print_ngram_scores(name_to_show, df_features)

# Findings so far (a name's figure is the average magnitude of its n-gram log
# probabilities; the index divides Murphy's figure by the name's figure):
#   Murphy        ~6.58   prob = 0.000000000000005 (that's 14 zeros) = 5.0e-15
#   Bailey        7.762   prob = 0.000000000000000014 (that's 16 zeros) = 1.4e-17
#                         Murphy index = 6.58/7.762 = .8477
#   Burkson       8.18    Murphy index = 6.58/8.18 = .8044
#   Sims          8.68    Murphy index = 6.58/8.68 = .7581
#   Ravichandran  8.38    Murphy index = 6.58/8.38 = .7852
#   Timothy       8.33    Murphy index = 6.58/8.33 = .7899
#   Platzfelder   (no figure recorded yet)

# Murphy index could be Avg log probability of a name divided by the avg log probability of the name Murphy


Murphy
[[' mur' -6.48565252972095]]
[['murp' -6.603435565377333]]
[['urph' -6.603435565377333]]
[['rphy' -6.603435565377333]]
[['phy ' -6.603435565377333]]
Ravichandran
[]
[]
[]
[]
[['icha' -8.682877107057168]]
[['chan' -8.682877107057168]]
[['hand' -8.682877107057168]]
[['andr' -7.9897299264972235]]
[['ndra' -8.682877107057168]]
[]
[['ran ' -7.584264818389059]]


In [84]:
# Look up the score of the single n-gram ' mur' (the start of "Murphy").
# NOTE(review): the saved output for this cell shows a positive log_prob at
# index 4606, which appears to predate the negated, descending ranking that
# get_salient_words now produces - re-run to refresh.
df_features[df_features['ngram']==' mur']

Unnamed: 0,ngram,log_prob
4606,mur,6.485653


In [85]:
# Inspect a window of rows from the ranking (positional row slice).
# NOTE(review): 4525 and 4550 are hand-picked offsets tied to one particular
# ordering of df_features - confirm they still point at the rows of interest.
df_features[4525:4550]

Unnamed: 0,ngram,log_prob
4525,augh,7.296583
4526,barr,7.296583
4527,brie,7.296583
4528,call,7.296583
4529,cgra,7.296583
4530,coll,7.296583
4531,daly,7.296583
4532,donn,7.296583
4533,dono,7.296583
4534,ealy,7.296583


In [None]:
# Persist the fitted pipeline (vectorizer + classifier) for use outside this
# notebook. Pickles can execute code when loaded, so only load this file from
# a trusted location.
with open("pickled_model.p", mode="wb") as model_file:
    pickle.dump(final_model, model_file)

In [16]:
# Smoke test on a surname outside the usual Irish patterns.
# The result has one row per input and one column per class, in class order
# (presumably [class 0, class 1] here - confirm via final_model.classes_).
# Values are natural logs, so a class-1 entry of about -5.15 means roughly 0.6%.
final_model.predict_log_proba(['Knickerbocker'])

array([[-0.00583429, -5.14691913]])

In [None]:
# Leftover interactive help lookup (IPython `?` syntax) used while developing;
# it produces no result the analysis depends on and can be deleted before sharing.
?CountVectorizer

Okay, so now I have a model. Now I need to put it in a Streamlit app that can receive input from the user and return a response of some sort.