In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the ORACC catalogue export; the "_ - index" column becomes the row index.
df = pd.read_csv(
    "../ORACC-catalogues-030524.csv",
    index_col="_ - index",
    encoding="utf-8",
)
# Last expression of the cell: display (rows, columns) as a quick sanity check.
df.shape

# Functions

In [None]:
def vectorize(corpus, tfidf, analyzer, ngram_range, max_df, min_df, max_features, file_keys):
    """Fit a bag-of-words vectorizer on ``corpus`` and return the document-term matrix.

    Parameters
    ----------
    corpus : iterable of str
        The documents to vectorize.
    tfidf : bool
        If truthy a ``TfidfVectorizer`` is used, otherwise a ``CountVectorizer``.
    analyzer, ngram_range, max_df, min_df, max_features
        Forwarded unchanged to the vectorizer constructor.
    file_keys : index-like
        Row labels for the returned DataFrame (one per document).

    Returns
    -------
    tuple
        ``(counts, counts_df)``: the dense array produced by
        ``fit_transform(corpus).toarray()`` and a DataFrame holding the same
        values, indexed by ``file_keys`` with one column per vocabulary term.
    """
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer

    options = dict(
        input="content",
        lowercase=True,
        analyzer=analyzer,
        ngram_range=ngram_range,
        max_df=max_df,
        min_df=min_df,
        max_features=max_features,
    )
    # token_pattern only applies to the word analyzer. Passing it with any other
    # analyzer has no effect on the features (recent scikit-learn versions emit
    # a warning about the unused parameter), so it is only set when relevant.
    if analyzer == "word":
        options["token_pattern"] = r"(?u)\b\w+\b"

    vectorizer = vectorizer_cls(**options)

    # NOTE: .toarray() materialises a dense matrix; callers rely on that shape.
    counts = vectorizer.fit_transform(corpus).toarray()

    # vocabulary_ maps term -> column index; ordering the terms by that index
    # yields the column labels in the same order as the matrix columns.
    vocab = vectorizer.vocabulary_
    column_names = sorted(vocab, key=vocab.get)

    counts_df = pd.DataFrame(counts, index=file_keys, columns=column_names)

    return (counts, counts_df)


def prepare_data(df, category, word_level, tfidf=False, max_df=.85, min_df=20, max_features=None):
    """Build the feature matrix and encoded labels for one (category, text column) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``category`` and ``word_level`` columns.
    category : str
        Name of the label column; its values are integer-encoded with ``LabelEncoder``.
    word_level : str
        Name of the text column. ``"unseg_uni"`` is vectorized with character
        1-3-grams; every other column with word unigrams.
    tfidf, max_df, min_df, max_features
        Forwarded to ``vectorize``.

    Returns
    -------
    tuple
        ``(X, y, classes)``: the feature DataFrame from ``vectorize``, the
        encoded label Series, and the encoder's ``classes_`` array.
    """
    # Work on a private two-column copy so the caller's frame is never modified.
    subset = df[[category, word_level]].copy()

    encoder = LabelEncoder()
    subset["y"] = encoder.fit_transform(subset[category])

    # Strip the "UNK", "X" and "-" markers from every text before vectorizing.
    cleaned = [
        raw.replace("UNK", "").replace("X", "").replace("-", "")
        for raw in subset[word_level].tolist()
    ]

    if word_level == "unseg_uni":
        analyzer, ngrams = "char", (1, 3)
    else:
        analyzer, ngrams = "word", (1, 1)

    _, features = vectorize(
        cleaned, tfidf, analyzer, ngrams, max_df, min_df, max_features, subset.index
    )

    return features, subset["y"], encoder.classes_

def train_multinomialNB(X, y):
    """Cross-validate a Multinomial Naive Bayes classifier and average the fold scores.

    Uses 10-fold ``StratifiedKFold`` (shuffled, ``random_state=42``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Encoded class labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(precision, recall, f1, weighted_f1)``. The first three are arrays
        with one entry per label in ``np.unique(y)`` (sorted order), averaged
        over the folds; ``weighted_f1`` is the mean support-weighted F1.
    """
    # Score every fold against the full label set. Without an explicit `labels`
    # argument a fold whose test split and predictions both lack a rare class
    # returns a shorter array, so the per-fold arrays could no longer be
    # stacked or lined up with the class names. Classes absent from a fold
    # are scored 0 for that fold via zero_division=0.
    labels = np.unique(y)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    precision_scores = []
    recall_scores = []
    f1_scores = []
    weighted_f1_scores = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh estimator per fold makes the independence of folds explicit.
        clf = MultinomialNB()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, labels=labels, average=None, zero_division=0
        )
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

        # zero_division=0 matches the per-class call above and avoids
        # undefined-metric warnings; the resulting value is unchanged.
        weighted_f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        weighted_f1_scores.append(weighted_f1)

    precision_scores = np.array(precision_scores).mean(axis=0)
    recall_scores = np.array(recall_scores).mean(axis=0)
    f1_scores = np.array(f1_scores).mean(axis=0)
    weighted_f1_scores = np.array(weighted_f1_scores).mean()

    return precision_scores, recall_scores, f1_scores, weighted_f1_scores

def save_results_table(results, classes, category, word_level, save_name):
    """Write the per-class scores of one experiment to a CSV report.

    Parameters
    ----------
    results : sequence
        ``(precision, recall, f1, weighted_f1)``: three per-class arrays and
        one scalar, as returned by ``train_multinomialNB``.
    classes : sequence
        Class names, one per entry of the per-class arrays; used as row labels.
    category, word_level, save_name : str
        Combined into the output filename
        ``../reports/MultinomialNB/{save_name}_{category}_{word_level}.csv``.

    The CSV has one row per class and the columns ``Precision``, ``Recall``,
    ``F1`` and ``Weighted F1``. The weighted F1 is a single number, so it is
    stored in the first row and the remaining rows of that column are NaN.
    """
    import os  # local import: only needed to make sure the report folder exists

    precision, recall, f1, weighted_f1 = results[:4]

    # Pad the scalar weighted F1 with NaN so it has the same length as the
    # per-class arrays and can share a table with them.
    nan_array = np.full(len(precision) - 1, np.nan)
    weighted_f1_row = np.append(np.array(weighted_f1), nan_array)

    table = pd.DataFrame(
        [precision, recall, f1, weighted_f1_row],
        index=["Precision", "Recall", "F1", "Weighted F1"],
        columns=classes,
    ).transpose()

    out_path = f"../reports/MultinomialNB/{save_name}_{category}_{word_level}.csv"
    # to_csv does not create missing directories; without this the scores of a
    # finished cross-validation run are lost to an OSError.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    table.to_csv(out_path)

# Main

In [None]:
def run_and_save_results(save_name):
    """Run every (category, word level) experiment and save the reports.

    For each label category and each text column the data is prepared with
    TF-IDF features, scored with ``train_multinomialNB`` and written out via
    ``save_results_table``. A summary CSV with the weighted F1 of every
    combination is written at the end.

    Parameters
    ----------
    save_name : str
        Prefix for every CSV written under ``../reports/MultinomialNB/``.

    Reads the module-level DataFrame ``df``; it is copied, never modified.
    """
    import os  # local import: only needed to make sure the report folder exists

    categories = ["supergenre_160424", "superperiod_160424", "superprovenience_160424"]
    word_levels = ["lemm", "norm", "seg_uni", "unseg_uni"]

    # Labels removed before training, per category (filters out the small
    # examples in those categories). Categories not listed keep every row.
    excluded_labels = {
        "superperiod_160424": ["Unknown", "First Millennium"],
        "superprovenience_160424": ["East", "Unknown"],
    }

    summary_path = f"../reports/MultinomialNB/{save_name}_WeightedF1.csv"
    # Create the report folder before any training starts, so a missing
    # directory cannot abort the run after the expensive work is done.
    os.makedirs(os.path.dirname(summary_path), exist_ok=True)

    results = []
    for cat in categories:
        dropped = excluded_labels.get(cat)
        if dropped:
            filtered_df = df[~df[cat].isin(dropped)].copy()
        else:
            filtered_df = df.copy()

        for level in word_levels:
            print("------------------------------")
            print(f"current job: {cat, level}")
            X, y, classes = prepare_data(filtered_df, cat, level, tfidf=True)
            print("started scoring")
            scores = train_multinomialNB(X, y)
            save_results_table(scores, classes, cat, level, save_name)
            # scores[-1] is the mean weighted F1 of this combination.
            results.append((cat, level, scores[-1]))

    results_df = pd.DataFrame(results, columns=["Category", "Word Level", "Weighted F1"])
    results_df.to_csv(summary_path)

In [None]:
# Run the full experiment. The only argument is a name used as the filename
# prefix of every CSV written to the reports folder; choose a unique one so
# that the results of earlier runs are not overwritten.
run_and_save_results("unique_id_for_saving_results")