In [None]:
import pandas as pd
import glob
import sklearn
from utils import grab_sections
from collections import Counter
import random
import numpy as np
import spacy
import string
from nltk.corpus import stopwords as stop_words
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import tqdm

In [None]:
# Shared spaCy English pipeline; clean_docs() below uses it to tokenise and lemmatise notes.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list.
# clean_docs() reads this module-level `stopwords` list when filtering lemmas.
nltk.download('stopwords')
stopwords = list(stop_words.words('english'))

In [None]:
def clean_docs(notes: list[str]) -> list[str]:
    """Normalise, lemmatise and filter a batch of notes.

    For every note, runs of whitespace are collapsed to single spaces, the text
    is run through the module-level spaCy pipeline ``nlp``, and each token is
    replaced by its lemma. Lemmas that are a single ``string.punctuation``
    character or that appear in the module-level ``stopwords`` list are dropped.

    Args:
        notes: Raw note texts.

    Returns:
        One space-joined string of surviving lemmas per input note, in the
        same order as ``notes``.
    """
    # A single lookup set replaces two separate membership tests per token.
    excluded = set(string.punctuation) | set(stopwords)
    # Collapse all whitespace (newlines, tabs, repeated spaces) before parsing.
    normalised = (' '.join(note.split()) for note in notes)
    clean_notes = []
    # nlp.pipe streams the texts through spaCy in batches, which is faster than
    # calling nlp(text) once per note and yields the docs in input order.
    for spacy_doc in nlp.pipe(normalised):
        lemmas = [tok.lemma_ for tok in spacy_doc if tok.lemma_ not in excluded]
        clean_notes.append(' '.join(lemmas))
    return clean_notes

In [None]:
def undersample(train_docs, train_labels, rng=None):
    """Balance a labelled corpus by randomly down-sampling the larger classes.

    Every class is reduced to the size of the rarest class. The rarest class is
    kept whole, in its original order, and comes first in the output. The other
    classes follow (from less to more frequent), each as a random sample of
    ``minority_count`` items. For a two-class input this reproduces the
    original layout: all minority items, then the sampled majority items.

    Inputs with zero or one class are returned unchanged (as new lists) instead
    of raising or duplicating the data.

    Args:
        train_docs: Sequence of documents.
        train_labels: Sequence of hashable labels aligned with ``train_docs``.
        rng: Optional ``random.Random`` instance used for shuffling. Defaults
            to the module-level ``random`` generator, so ``random.seed(...)``
            keeps controlling the sample.

    Returns:
        A ``(docs, labels)`` tuple of two new lists; the inputs are not modified.

    Raises:
        ValueError: If ``train_docs`` and ``train_labels`` differ in length.
    """
    docs = list(train_docs)
    labels = list(train_labels)
    if len(docs) != len(labels):
        raise ValueError(
            f'train_docs and train_labels must have the same length '
            f'({len(docs)} != {len(labels)})'
        )
    shuffler = random if rng is None else rng

    label_counter = Counter(labels)
    print('Original Train Label Counts: ')
    print(label_counter)

    balanced_docs, balanced_labels = [], []
    if label_counter:
        ranked = label_counter.most_common()  # most frequent first
        minority_class, minority_count = ranked[-1]

        # Group row positions by class in a single pass over the labels.
        positions = {cls: [] for cls, _ in ranked}
        for idx, lab in enumerate(labels):
            positions[lab].append(idx)

        # Rarest class first, most frequent class last.
        for cls, _ in reversed(ranked):
            chosen = positions[cls]
            if cls != minority_class:
                # Shuffling positions consumes the RNG exactly like shuffling
                # the (doc, label) pairs themselves, so seeded runs are unchanged.
                shuffler.shuffle(chosen)
                chosen = chosen[:minority_count]
            balanced_docs.extend(docs[i] for i in chosen)
            balanced_labels.extend(labels[i] for i in chosen)

    new_counter = Counter(balanced_labels)
    print('New Train Label Counts: ')
    print(new_counter)
    return balanced_docs, balanced_labels

In [None]:
# One seed for the whole notebook: it seeds Python's `random` (used by
# undersample's shuffle) and NumPy's global RNG, and is passed as
# `random_state` to the classifiers further down.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)

## Insurance Classification

In [None]:
# Task configuration for the insurance classifier.
lab2int = {'low':0, 'reg':1}  # raw label string -> integer class id
int2lab = {0:'low', 1:'reg'}  # inverse mapping: class id -> label string
label = 'insurance'  # DataFrame column that holds the label

Load in data

In [None]:
# Locations of the pre-split insurance CSVs (train / dev / test).
train_file, dev_file, test_file = (
    './insurance_data/insurance_train_phys.csv',
    './insurance_data/insurance_dev_phys.csv',
    './insurance_data/insurance_test_phys.csv',
)
# Read each split as UTF-8 into its own DataFrame.
train_df, dev_df, test_df = [
    pd.read_csv(split_path, encoding='utf8')
    for split_path in (train_file, dev_file, test_file)
]

Preprocess: sectionize, lowercase, tokenize, lemmatize, remove punctuation and stop words

In [None]:
# Every split goes through the same steps: grab_sections(..., token_len=300)
# -> lowercase -> clean_docs, and the raw label strings are mapped to ints
# through lab2int.
# NOTE(review): token_len is interpreted by utils.grab_sections — presumably a
# length cap; confirm against that helper.
train_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in train_df['text'].to_list()]
)
train_labels = [lab2int[raw] for raw in train_df[label].to_list()]

dev_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in dev_df['text'].to_list()]
)
dev_labels = [lab2int[raw] for raw in dev_df[label].to_list()]

test_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in test_df['text'].to_list()]
)
test_labels = [lab2int[raw] for raw in test_df[label].to_list()]

In [None]:
# Balance the training split only; dev and test keep their original class mix.
# NOTE: this overwrites train_notes/train_labels, so re-running the cell
# re-samples from the already balanced data rather than from the full split.
train_notes, train_labels = undersample(train_notes, train_labels)

In [None]:
# Bag-of-words counts followed by TF-IDF weighting. Both transformers are
# fitted on the training notes only; dev and test are projected with the
# already-fitted vocabulary and IDF weights.
bow_converter, tf_transformer = CountVectorizer(), TfidfTransformer()
train_x = tf_transformer.fit_transform(bow_converter.fit_transform(train_notes))
dev_x = tf_transformer.transform(bow_converter.transform(dev_notes))
test_x = tf_transformer.transform(bow_converter.transform(test_notes))

In [None]:
# Logistic-regression model trained with SGD on the TF-IDF features.
# (SGDClassifier is already imported in the notebook's import cell.)
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
# removed the old spelling, so use whichever name the installed version accepts.
sk_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
sgd_loss = 'log_loss' if sk_version >= (1, 1) else 'log'
clf_sgd = SGDClassifier(loss=sgd_loss, penalty='l2', max_iter=1000, random_state=seed_val, learning_rate='optimal')
clf_sgd.fit(train_x, train_labels)

In [None]:
# Score the SGD classifier on the dev split and print scikit-learn's classification report.
preds = clf_sgd.predict(dev_x)
print(classification_report(dev_labels, preds))

In [None]:
# Gradient-boosted trees on the same TF-IDF features.
# The loss argument is left at its default: that default is the logistic loss
# formerly spelled 'deviance' (renamed 'log_loss' in scikit-learn 1.1, old name
# removed in 1.3), so the model is unchanged and the cell runs on old and new versions.
clf_gb = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1000, random_state=seed_val)
clf_gb.fit(train_x, train_labels)

In [None]:
# Score the gradient-boosting classifier on the dev split.
gb_preds = clf_gb.predict(dev_x)
print(classification_report(dev_labels, gb_preds))

#### Inference

In [None]:
# Evaluate the SGD classifier on the test split.
test_preds = clf_sgd.predict(test_x)
print(classification_report(test_labels, test_preds))

In [None]:
# Gradient Boosting: evaluate the boosted model on the test split.
gb_test_preds = clf_gb.predict(test_x)
print(classification_report(test_labels, gb_test_preds))

## Race / Ethnicity Classification

In [None]:
# Task configuration for the race / ethnicity classifier.
# NOTE(review): the preprocessing cell below derives labels with an explicit
# `== 'White_NonHispanic'` test rather than through lab2int — confirm these
# mappings are still needed.
lab2int = {'NonWhite':0, 'White_NonHispanic':1}  # raw label string -> integer class id
int2lab = {0:'NonWhite', 1:'White_NonHispanic'}  # inverse mapping: class id -> label string
label = 'Race_group'  # DataFrame column that holds the label

Load in data

In [None]:
# Locations of the pre-split race / ethnicity CSVs (train / dev / test).
train_file, dev_file, test_file = (
    './race_eth_data/race_train_phys.csv',
    './race_eth_data/race_dev_phys.csv',
    './race_eth_data/race_test_phys.csv',
)
# Read each split as UTF-8 into its own DataFrame.
train_df, dev_df, test_df = [
    pd.read_csv(split_path, encoding='utf8')
    for split_path in (train_file, dev_file, test_file)
]

Preprocess: sectionize, lowercase, tokenize, lemmatize, remove punctuation and stop words

In [None]:
# Training notes: grab_sections(..., token_len=300) -> lowercase -> clean_docs.
train_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in train_df['text'].to_list()]
)


In [None]:
# Binary target: 1 for 'White_NonHispanic', 0 for every other Race_group value.
# NOTE(review): lab2int is bypassed here, so any unexpected or missing value
# also becomes 0 — confirm this is intended.
train_labels = [int(raw == 'White_NonHispanic') for raw in train_df[label].to_list()]

# Dev and test notes get the same text preprocessing as the training notes.
dev_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in dev_df['text'].to_list()]
)
dev_labels = [int(raw == 'White_NonHispanic') for raw in dev_df[label].to_list()]

test_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in test_df['text'].to_list()]
)
test_labels = [int(raw == 'White_NonHispanic') for raw in test_df[label].to_list()]

In [None]:
# Balance the training split only; dev and test keep their original class mix.
# NOTE: this overwrites train_notes/train_labels, so re-running the cell
# re-samples from the already balanced data rather than from the full split.
train_notes, train_labels = undersample(train_notes, train_labels)

In [None]:
# Bag-of-words counts followed by TF-IDF weighting. Both transformers are
# fitted on the training notes only; dev and test are projected with the
# already-fitted vocabulary and IDF weights.
bow_converter, tf_transformer = CountVectorizer(), TfidfTransformer()
train_x = tf_transformer.fit_transform(bow_converter.fit_transform(train_notes))
dev_x = tf_transformer.transform(bow_converter.transform(dev_notes))
test_x = tf_transformer.transform(bow_converter.transform(test_notes))

In [None]:
# Logistic-regression model trained with SGD on the TF-IDF features.
# (SGDClassifier is already imported in the notebook's import cell.)
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
# removed the old spelling, so use whichever name the installed version accepts.
sk_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
sgd_loss = 'log_loss' if sk_version >= (1, 1) else 'log'
clf_sgd = SGDClassifier(loss=sgd_loss, penalty='l2', max_iter=1000, random_state=seed_val, learning_rate='optimal')
clf_sgd.fit(train_x, train_labels)

In [None]:
# Score the SGD classifier on the dev split and print scikit-learn's classification report.
preds = clf_sgd.predict(dev_x)
print(classification_report(dev_labels, preds))

In [None]:
# Gradient-boosted trees on the same TF-IDF features.
# The loss argument is left at its default: that default is the logistic loss
# formerly spelled 'deviance' (renamed 'log_loss' in scikit-learn 1.1, old name
# removed in 1.3), so the model is unchanged and the cell runs on old and new versions.
clf_gb = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1000, random_state=seed_val)
clf_gb.fit(train_x, train_labels)

In [None]:
# Score the gradient-boosting classifier on the dev split.
gb_preds = clf_gb.predict(dev_x)
print(classification_report(dev_labels, gb_preds))

#### Inference

In [None]:
# Evaluate the SGD classifier on the test split.
test_preds = clf_sgd.predict(test_x)
print(classification_report(test_labels, test_preds))

In [None]:
# Gradient Boosting: evaluate the boosted model on the test split.
gb_test_preds = clf_gb.predict(test_x)
print(classification_report(test_labels, gb_test_preds))

## Gender Classification

In [None]:
# Task configuration for the gender classifier.
lab2int = {'Female':0, 'Male':1}  # raw label string -> integer class id
int2lab = {0:'Female', 1:'Male'}  # inverse mapping: class id -> label string
label = 'Gender'  # DataFrame column that holds the label

Load in data

In [None]:
# Locations of the pre-split gender CSVs (train / dev / test).
train_file, dev_file, test_file = (
    './gender_data/gen_train_phys.csv',
    './gender_data/gen_dev_phys.csv',
    './gender_data/gen_test_phys.csv',
)
# Read each split as UTF-8 into its own DataFrame.
train_df, dev_df, test_df = [
    pd.read_csv(split_path, encoding='utf8')
    for split_path in (train_file, dev_file, test_file)
]

Preprocess: sectionize, lowercase, tokenize, lemmatize, remove punctuation and stop words

In [None]:
# Every split goes through the same steps: grab_sections(..., token_len=300)
# -> lowercase -> clean_docs, and the raw label strings are mapped to ints
# through lab2int.
# NOTE(review): token_len is interpreted by utils.grab_sections — presumably a
# length cap; confirm against that helper.
train_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in train_df['text'].to_list()]
)
train_labels = [lab2int[raw] for raw in train_df[label].to_list()]

dev_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in dev_df['text'].to_list()]
)
dev_labels = [lab2int[raw] for raw in dev_df[label].to_list()]

test_notes = clean_docs(
    [grab_sections(text, token_len=300).lower() for text in test_df['text'].to_list()]
)
test_labels = [lab2int[raw] for raw in test_df[label].to_list()]

In [None]:
# Balance the training split only; dev and test keep their original class mix.
# NOTE: this overwrites train_notes/train_labels, so re-running the cell
# re-samples from the already balanced data rather than from the full split.
train_notes, train_labels = undersample(train_notes, train_labels)

In [None]:
# Bag-of-words counts followed by TF-IDF weighting. Both transformers are
# fitted on the training notes only; dev and test are projected with the
# already-fitted vocabulary and IDF weights.
bow_converter, tf_transformer = CountVectorizer(), TfidfTransformer()
train_x = tf_transformer.fit_transform(bow_converter.fit_transform(train_notes))
dev_x = tf_transformer.transform(bow_converter.transform(dev_notes))
test_x = tf_transformer.transform(bow_converter.transform(test_notes))

In [None]:
# Logistic-regression model trained with SGD on the TF-IDF features.
# (SGDClassifier is already imported in the notebook's import cell.)
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
# removed the old spelling, so use whichever name the installed version accepts.
sk_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
sgd_loss = 'log_loss' if sk_version >= (1, 1) else 'log'
clf_sgd = SGDClassifier(loss=sgd_loss, penalty='l2', max_iter=1000, random_state=seed_val, learning_rate='optimal')
clf_sgd.fit(train_x, train_labels)

In [None]:
# Score the SGD classifier on the dev split and print scikit-learn's classification report.
preds = clf_sgd.predict(dev_x)
print(classification_report(dev_labels, preds))

In [None]:
# Gradient-boosted trees on the same TF-IDF features.
# The loss argument is left at its default: that default is the logistic loss
# formerly spelled 'deviance' (renamed 'log_loss' in scikit-learn 1.1, old name
# removed in 1.3), so the model is unchanged and the cell runs on old and new versions.
clf_gb = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1000, random_state=seed_val)
clf_gb.fit(train_x, train_labels)

In [None]:
# Score the gradient-boosting classifier on the dev split.
gb_preds = clf_gb.predict(dev_x)
print(classification_report(dev_labels, gb_preds))

#### Inference

In [None]:
# Evaluate the SGD classifier on the test split.
test_preds = clf_sgd.predict(test_x)
print(classification_report(test_labels, test_preds))

In [None]:
# Gradient Boosting: evaluate the boosted model on the test split.
gb_test_preds = clf_gb.predict(test_x)
print(classification_report(test_labels, gb_test_preds))