# Military Service Identification Tool

Please see the README file for more information.


In [None]:
#--env setup
import pandas as pd
import numpy as np
import re
import csv
from collections import Counter

#--env setup - import sklearn for different machine learning algorithms 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from nltk.stem import WordNetLemmatizer
from sklearn import metrics

#--env setup - more friendly plotting
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

#--env setup - we need to import nltk elements if not already installed
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
        

## Define functions 

This section declares the helper functions used by the Tool. 

In [None]:
#--paths
# Base location of the tool's input files. The code in this notebook builds
# file names by plain string concatenation (e.g. base_tool_dir + 'dataset.csv'),
# so the value must end with a path separator.
# NOTE(review): 'PATH' looks like a placeholder to be replaced locally - confirm.
base_tool_dir = 'PATH'

def documentCleaning(document):
    """Normalise a free-text document to reduce noise before vectorisation.

    The steps are applied in this order: non-word characters become spaces,
    isolated single letters are dropped, runs of whitespace are collapsed to
    one space, and the result is lower-cased. Leading/trailing spaces are
    not stripped.

    Args:
        document: The raw text to clean (a ``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    # Replace every non-word character (punctuation, symbols) with a space
    document = re.sub(r'\W', ' ', document)

    # Remove single letters that are surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start of the document. The caret
    # must be an unescaped anchor: an escaped '\^' would look for a literal
    # '^', which cannot occur after the non-word substitution above.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substitute multiple whitespace characters with a single space
    document = re.sub(r'\s+', ' ', document)

    # Convert to lowercase
    return document.lower()
    
def wordRemoval(document):
    """Remove the annotator-excluded terms from a document.

    The terms are read from the first column of ``excluded_terms.csv``
    (located via ``base_tool_dir``) on every call and removed in file order.
    This master keyword file was developed by a team of annotators and is
    not publicly accessible.

    Each term is passed to ``re.sub`` as a pattern, so any regular-expression
    metacharacters in the file are interpreted as regex syntax.

    Args:
        document: The text to filter.

    Returns:
        The text with every match of every excluded term removed.
    """
    # newline='' is the csv module's recommended way to open its input files
    with open(base_tool_dir + 'excluded_terms.csv', newline='') as csvDataFile:
        # csv.reader yields [] for blank lines; skip them so row[0] cannot
        # raise IndexError.
        excluded_terms = [row[0] for row in csv.reader(csvDataFile) if row]

    for term in excluded_terms:
        document = re.sub(term, '', document)

    return document

def stopword(string):
    """Remove the NLTK English stop words from a string.

    The input is split on whitespace; tokens that appear in
    ``stopwords.words('english')`` are dropped and the remaining tokens are
    re-joined with single spaces. Matching is exact, so the comparison is
    case-sensitive.

    Args:
        string: The text to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Load the stop-word list once per call (not once per token) and keep it
    # in a set so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(token for token in string.split() if token not in english_stopwords)

def milatryTermCheck(document):
    """Report whether a document mentions at least one known military term.

    The terms are read from the first column of the master keyword file on
    every call. A term counts as present when it occurs anywhere in
    ``document`` as a substring.

    Args:
        document: The text to inspect.

    Returns:
        1 if any term occurs in ``document``, otherwise 0.
    """
    # NOTE(review): getMilTerms() reads 'military_terms.csv' directly under
    # base_tool_dir, while this function reads it from 'data//data//' -
    # confirm which location is intended.
    with open(base_tool_dir + 'data//data//military_terms.csv', newline='') as csvDataFile:
        # Skip blank lines (row[0] would raise IndexError) and empty terms
        # ('' is a substring of every document, so it would always match).
        military_terms = [row[0] for row in csv.reader(csvDataFile) if row and row[0]]

    return 1 if any(term in document for term in military_terms) else 0
    
def getMilTerms():
    """Return the list of known military terms.

    The terms are the first-column values of ``military_terms.csv`` (located
    via ``base_tool_dir``), in file order. This data is not publicly
    accessible.

    Returns:
        A list of term strings.
    """
    # newline='' is the csv module's recommended way to open its input files
    with open(base_tool_dir + 'military_terms.csv', newline='') as csvDataFile:
        # csv.reader yields [] for blank lines; skip them so row[0] cannot
        # raise IndexError.
        return [row[0] for row in csv.reader(csvDataFile) if row]
    
def milatryWeights(tdif, training_features):
    """Apply a minor boost to the feature columns of annotated terms.

    For every first-column entry of ``weight_terms.csv`` the term is looked
    up in ``tdif.vocabulary_``; when present, the matching column of
    ``training_features`` is multiplied by 1.1 in place. Terms missing from
    the vocabulary are skipped. Each term and its outcome are printed.

    Args:
        tdif: Object exposing a ``vocabulary_`` mapping of term to column
            index (the notebook passes its fitted TfidfVectorizer).
        training_features: Feature matrix indexed as ``[:, column]``; it is
            modified in place.

    Returns:
        The same ``training_features`` object that was passed in.
    """
    weight_file = base_tool_dir + 'weight_terms.csv'
    with open(weight_file) as handle:
        for record in csv.reader(handle):
            term = record[0]
            print(term)
            try:
                column = tdif.vocabulary_[term]
                training_features[:, column] *= 1.1
                print('Term present')
            except KeyError:
                # The term is not part of the fitted vocabulary
                print('No term')

    return training_features


# Building the pipeline

This section builds the machine learning model for the detection of military service in our cohort. 

In [None]:
# Data read in. We export data from CRIS which has been annotated into a .csv file
# This data represents free text medical notes written by a doctor
df_attachment_data = pd.read_csv(base_tool_dir + 'dataset.csv')
df_attachment_data = df_attachment_data.rename(columns={'Attachment_Text': 'Text'})
df_attachment_data = df_attachment_data.dropna(subset=['Text'])

# Add internal reference, derived from each row's index label
df_attachment_data['annotation_loc'] = [
    'att-' + str(index) + ".txt" for index in df_attachment_data.index
]

# Keep only annotated data. The explicit copy makes this an independent frame,
# so the column assignments below never write to a view of the unfiltered data.
df_attachment_data = df_attachment_data[df_attachment_data['annotated'] == 1].copy()
display(df_attachment_data['flag_veteran'].value_counts()) # count the labels

# Clean and process data into usable format, in three passes:
#   1. documentCleaning - normalise the raw text
#   2. wordRemoval      - remove words of confusion
#   3. stopword         - remove stop words
df_attachment_data['TextCleaned'] = (
    df_attachment_data['Text']
    .apply(documentCleaning)
    .apply(wordRemoval)
    .apply(stopword)
)

display(df_attachment_data.head(5))

# Generate label sets ready for training the classifier
is_veteran = df_attachment_data['flag_veteran'] == 1
df_attachment_data['labels'] = np.where(is_veteran, 'Veteran', 'Civilian')
# Use fixed ids (Civilian = 0, Veteran = 1) instead of factorize(), which numbers
# labels by order of first appearance. The post-validation step compares
# predictions against 1 and writes milatryTermCheck's 1/0 result back, so
# 'Veteran' must always be 1 regardless of row order.
df_attachment_data['category_id'] = np.where(is_veteran, 1, 0)
category_id_df = df_attachment_data[['labels', 'category_id']].drop_duplicates().sort_values('category_id')
id_to_category = dict(category_id_df[['category_id', 'labels']].values)


# Model Development 

In this section we train a classical decision tree based on a fitted matrix of TF-IDF features. 

In [None]:
# Training and testing set generation. The six-way unpacking is wrapped in
# parentheses so it can legally span several lines.
(X_train, X_test, y_train, y_test,
 indices_train, indices_test) = train_test_split(
    df_attachment_data['TextCleaned'],
    df_attachment_data['category_id'],
    df_attachment_data.index,
    random_state=0, test_size=0.40, shuffle=True)

# Obtain the TF-IDF vectors
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10, max_df=0.5, stop_words='english', max_features=500)

# Fit our data (training)
training_features = tfidf.fit_transform(X_train)
training_features = milatryWeights(tfidf, training_features)
# NOTE(review): the term weights are applied to the training matrix only; the
# test matrix below is left unweighted - confirm this is intended.
testing_features = tfidf.transform(X_test)

# Show the vocabulary used by the TF-IDF vectoriser and save it to disk
print(tfidf.vocabulary_)
with open("feature.pkl", "wb") as vocabulary_file:
    pickle.dump(tfidf.vocabulary_, vocabulary_file)

# Model training - important that you may want to consider fine tuning these parameters
model = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=0)
model.fit(training_features, y_train)

# Save the model to disk
with open('finalized_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)


# Test the model

This section tests the machine learning model performance. 

In [None]:
# Predict on the held-out set and print per-class precision / recall / F1
y_pred = model.predict(testing_features)
print(metrics.classification_report(y_test, y_pred))

# Confusion matrix heat map (rows = actual class, columns = predicted class)
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.labels.values, yticklabels=category_id_df.labels.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Report every off-diagonal (misclassified) cell that holds at least one example
for predicted in category_id_df.category_id:
    for actual in category_id_df.category_id:
        if predicted != actual and conf_mat[actual, predicted] >= 1:
            print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))

# Store the predictions against the test rows. A single .loc[rows, column]
# call is used because chained indexing (df[column].loc[rows] = ...) may
# assign to a temporary copy and leave the frame unchanged.
df_attachment_data['model_outcome'] = ''
df_attachment_data.loc[indices_test, 'model_outcome'] = y_pred


# Post validation

We add an extra layer of checks. All predictions are checked against a known list of military terms and phrases. If a record does not contain at least one of these known terms, it is classed as not a military record. 

In [None]:
# Work on a copy so the raw model predictions in y_pred are not overwritten
y_pred_rule = y_pred.copy()

# Re-check every positive prediction against the known military terms.
# NOTE: this assumes category id 1 means 'Veteran' and 0 means 'Civilian',
# matching the 1/0 values returned by milatryTermCheck.
for idx, val in enumerate(y_pred_rule):
    if val == 1:
        y_pred_rule[idx] = milatryTermCheck(df_attachment_data.loc[indices_test[idx], 'TextCleaned'])

# Store the rule-adjusted predictions against the test rows. A single
# .loc[rows, column] call is used because chained indexing may assign to a
# temporary copy and leave the frame unchanged.
df_attachment_data['rule_outcome'] = ''
df_attachment_data.loc[indices_test, 'rule_outcome'] = y_pred_rule

# Confusion matrix heat map (rows = actual class, columns = predicted class)
conf_mat = confusion_matrix(y_test, y_pred_rule)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.labels.values, yticklabels=category_id_df.labels.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Report every off-diagonal (misclassified) cell that holds at least one example
for predicted in category_id_df.category_id:
    for actual in category_id_df.category_id:
        if predicted != actual and conf_mat[actual, predicted] >= 1:
            print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))

#--final decisions
df_attachment_data.to_csv('output.csv')
