In [59]:
import pandas as pd

import re
import sys

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from numpy import sqrt
from numpy import argmax
from sklearn.metrics import accuracy_score
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

import pickle
# from ngram_config import ngram_creator

In [2]:
df = pd.read_csv('irish_names.csv')

In [3]:
model = make_pipeline(CountVectorizer(),MultinomialNB())

In [None]:
for n in 3-5:
    get ngrams of length n
    stratify the data
        train the model
        

In [75]:
def evaluate_model(y_test, predicted_labels, predicted_probs, fold_no):
    """
    Docstring here
    
    Input: 
    
    """

    # I'm using the F2 score because I want to maximize recall (minimize false negatives)
    # I say that because I want to capture as many Irish names as possible,
    # even if that means saying that others' names are more Irish than they really are.

    # f1 = f1_score(y_test, predicted_labels)
    f2 = fbeta_score(y_test, predicted_labels, average='binary', beta=2)

    cmatrix = confusion_matrix(y_test, predicted_labels)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_labels).ravel()

    FNR = fn/(fn+tp)

    pred_prob_results = pd.DataFrame(predicted_probs, columns=['irish_prob', 'not_irish_prob'])
    # pred_prob_results['test_index'] = test_index

    fold_summary = {'fold_no':fold_no
                    # ,'f1_score':f1
                    ,'f2_score':f2
                    ,'confusion_matrix':{'tn':tn, 'fp':fp, 'fn':fn, 'tp':tp}
                    ,'predicted_probabilities':pred_prob_results
                    }

    return fold_summary

In [76]:
def train_model(train, test, fold_no, X=['ngrams'], y=['irish_flag'], n=n):

    """
    Docstring here
    """

    fold_summaries = {}

    X_train = train['ngrams'] #.values
    y_train = train['irish_flag']

    X_test = test['ngrams'].values
    y_test = test['irish_flag']

    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    predicted_probs = model.predict_proba(X_test)  # Return predicted probabilities for X_test data

    y_score = model.predict_proba(X_test)[:,1]  # What does this do?

    # Evaluate Models, Output Model Summary
    fold_summary = evaluate_model(y_test, predicted_labels, predicted_probs, fold_no=fold_no)

    # Aggregate Model Summaries
    fold_summaries[f'ngrams_{n}'] = fold_summary

    return fold_summaries

In [80]:
def summarize_model(df, target, model_type):
    """
    This function trains a model on each of the stratified folds and returns
    a summary of the results.

    Input:
        df: (dataframe) Data to be used. Should have a column of ngrams and a target column
        target: (series) Target column of input df

    """

    skf = StratifiedKFold(n_splits=5)
    
    model_summary_dict = {}
    fold_no = 1

    for train_index, test_index in skf.split(df, target):
        train = df.loc[train_index,:]
        test = df.loc[test_index,:]
        model_summary = train_model(train, test, X=['ngrams'], y=['irish_flag'], n=n, fold_no=fold_no)
        model_summary_dict[f"fold_{fold_no}"] = model_summary
        fold_no += 1
    
    return model_summary_dict

In [90]:
def ngram_creator(term_list, n=3):
    """
    Takes two arguments: a list of words to break into ngrams and the length of the ngram

    Inputs:
        term_list: [list] terms to break into intraword ngrams (example: df['names'])
        n: [int] desired length of ngrams
    
    Outputs:
        ngrams of individual words: e.g., bigrams for 'dog' = ['do','og']
        gram_string_list: list where each element is the set of n-grams for each original record. Use in CountVectorizer
    """

    gram_string_list = []
    gram_length = n

    for i in term_list:
        i = str(i)
        i = re.sub('[^A-Za-z0-9]+', '', i) # Remove any punctuation, spaces, and special characters
        i = i.lower()
        i = "^"+i+"$"  # Add initial and terminal clusters
        word_grams = []
        gram_string = ""
        for j in range(gram_length,100):
            gram = i[j-gram_length:j]
            if len(gram) == gram_length:  # only keep ngrams of the correct length
                word_grams.append(gram)
                gram_string = gram_string + gram + " "
        gram_string = gram_string[:-1]  # Cut that last space off the end there
        gram_string_list.append(gram_string)  # Append the ngrams for the current name as a space-separated string
    
    return gram_string_list

In [91]:
n_length = [3,4,5]

all_model_results = {}

for n in n_length:
    ngrams = ngram_creator(df['name'], n)
    df_grams = pd.DataFrame(zip(ngrams,df['irish_flag']), columns=['ngrams','irish_flag'])

    # print(ngrams)
    # print(df_grams['ngrams'])
    # break

    target = df_grams.loc[:,'irish_flag']

    model_results = summarize_model(df_grams, target, model_type=n)

    all_model_results[f'{n}_grams'] = model_results

{'ngrams_3': {'fold_no': 1, 'f2_score': 0.32019704433497537, 'confusion_matrix': {'tn': 235, 'fp': 14, 'fn': 31, 'tp': 13}, 'predicted_probabilities':      irish_prob  not_irish_prob
0      0.998145        0.001855
1      0.674257        0.325743
2      0.013198        0.986802
3      0.991864        0.008136
4      0.990488        0.009512
..          ...             ...
288    0.994184        0.005816
289    0.999301        0.000699
290    0.994734        0.005266
291    0.999014        0.000986
292    0.999971        0.000029

[293 rows x 2 columns]}}
{'ngrams_3': {'fold_no': 2, 'f2_score': 0.24154589371980673, 'confusion_matrix': {'tn': 227, 'fp': 21, 'fn': 34, 'tp': 10}, 'predicted_probabilities':      irish_prob  not_irish_prob
0      0.997052    2.948185e-03
1      0.206652    7.933485e-01
2      0.201359    7.986407e-01
3      0.999995    4.929512e-06
4      0.993146    6.853827e-03
..          ...             ...
287    0.004269    9.957306e-01
288    0.984464    1.553611e-02
