## Pain Change Classifier

In [109]:
#Download and import relevant libraries

import warnings
warnings.filterwarnings('ignore')

import nltk

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

#Import dataframe libraries
import pandas as pd
import numpy as np
import scipy

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
#from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Import models
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix

from nltk.stem import WordNetLemmatizer

In [110]:
#function to compute graded precision, recall and f-measure

def precision_recall_f_compute(predictions, y_test):
    fn=0
    fp=0
    tp=0
    for pred, real in zip(predictions, y_test):
        if pred == real:
            tp = tp + 1
        elif pred < real:
            fn = fn + 1
            #fn = fn + (real - pred)
        elif pred > real:
            fp = fp + 1
            #fp = fp + (pred - real)
    precision = float(tp)/float((fp+tp))
    recall = float(tp)/(float(fn+tp))
    f = 2.0*precision*recall/(precision+recall)

    return precision, recall, f

In [111]:
#Store patient data in file
file = 'patient_data1.xlsx'

#Store patient data in Pandas dataframe
data = pd.read_excel(file)
data.columns = ['Pain_Relevance', 'Pain_Change', 'Raw_Text'] #Rename columns
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424 entries, 0 to 423
Data columns (total 3 columns):
Pain_Relevance    424 non-null object
Pain_Change       287 non-null object
Raw_Text          400 non-null object
dtypes: object(3)
memory usage: 10.1+ KB


Unnamed: 0,Pain_Relevance,Pain_Change,Raw_Text
0,yes,increase,Patient. Arrived to VDH for pain management an...
1,yes,not sure,Patient will continue on pain control regimen ...
2,yes,not sure,Patient will continue to take pain medications...
3,no,,Patient received from CHC via wheelchair on ro...
4,yes,increase,Patient remains in 8-10/10 pain. PCA initiated.


In [112]:
#Display number of values for pain relevance and pain irrelevance
print("Number of pain relevant data points: ", (data['Pain_Relevance'] == 'yes').sum())
print("Number of pain irrelevant data points: ", (data['Pain_Relevance'] == 'no').sum())
print(data['Raw_Text'][0])

Number of pain relevant data points:  391
Number of pain irrelevant data points:  33
Patient. Arrived to VDH for pain management and IVF while awaiting admission. Pain was 7/10 on arrival in chest bilaterally as documented. Patient. Somewhat anxious as portrayed by irregular breathing pattern with talking however able to tolerate eating. All VSS including pulse ox at 100% on room air. Patient. Given oral valium and IV pain meds. Patient. Appears more calm after valium administration. Will transfer to 5100 via wheelchair with mother at side. ACRN


In [113]:
#Change newline character(s) in Pain Change column to "not sure"
data.loc[data['Pain_Change'] == '\n', 'Pain_Change'] = 'not sure'

#Change null Pain Change values where Pain Relevance is true to "not sure"
data.loc[(data['Pain_Relevance'] == 'yes') & (data['Pain_Change'] == np.NaN), 'Pain_Change'] = 'not sure'

#Drop rows with null Pain Relevance or null Raw Text fields
data = data.dropna(subset=['Pain_Relevance', 'Raw_Text', 'Pain_Change'])
data.info()
data.head()
print("Pain relevant data points with no null values: ", (data['Pain_Relevance'] == 'yes').sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 280 entries, 0 to 422
Data columns (total 3 columns):
Pain_Relevance    280 non-null object
Pain_Change       280 non-null object
Raw_Text          280 non-null object
dtypes: object(3)
memory usage: 8.8+ KB
Pain relevant data points with no null values:  280


In [114]:
#Print unique values in Pain Relevance column
print(pd.unique(data['Pain_Relevance']))
print(data['Pain_Relevance'].value_counts())

['yes']
yes    280
Name: Pain_Relevance, dtype: int64


In [115]:
#Print unique values in Pain Change column
print(pd.unique(data['Pain_Change']))
print(data['Pain_Change'].value_counts())

['increase' 'not sure' 'decrease' 'no change']
decrease     145
increase      51
no change     51
not sure      33
Name: Pain_Change, dtype: int64


In [116]:
#Change text to lower case, remove stop words and punctuation
data['Raw_Text'] = data['Raw_Text'].str.lower()

stop_words = set(stopwords.words("english")) 
#ps = PorterStemmer()
ps = WordNetLemmatizer()

def preprocess(row):
    text = row['Raw_Text']
    #print(text) #Test statement
    meaningful_words = [w for w in nltk.tokenize.word_tokenize(text) if not w in stop_words]  
    #print(" ".join(meaningful_words)) #Test statement
    
    proc_words = []
    for w in meaningful_words:
        #proc_words.append(ps.stem(w))
        proc_words.append(ps.lemmatize(w))
    #print(" ".join(proc_words)) #Test statement
    return " ".join(proc_words)

data['Raw_Text'] = data.apply(preprocess, axis=1)

data.head()

Unnamed: 0,Pain_Relevance,Pain_Change,Raw_Text
0,yes,increase,patient . arrived vdh pain management ivf awai...
1,yes,not sure,patient continue pain control regimen 0/10 pain .
2,yes,not sure,patient continue take pain medication without ...
4,yes,increase,patient remains 8-10/10 pain . pca initiated .
5,yes,decrease,patient continues rate pain 8/10 morphine pca ...


In [117]:
# Convert text to string type
data['Str_Text'] = data.Raw_Text.astype(str) 
data.head(10)

Unnamed: 0,Pain_Relevance,Pain_Change,Raw_Text,Str_Text
0,yes,increase,patient . arrived vdh pain management ivf awai...,patient . arrived vdh pain management ivf awai...
1,yes,not sure,patient continue pain control regimen 0/10 pain .,patient continue pain control regimen 0/10 pain .
2,yes,not sure,patient continue take pain medication without ...,patient continue take pain medication without ...
4,yes,increase,patient remains 8-10/10 pain . pca initiated .,patient remains 8-10/10 pain . pca initiated .
5,yes,decrease,patient continues rate pain 8/10 morphine pca ...,patient continues rate pain 8/10 morphine pca ...
6,yes,no change,pain remains 8/10 overnight pt appeared comfor...,pain remains 8/10 overnight pt appeared comfor...
7,yes,increase,"continuous setting increased , patient continu...","continuous setting increased , patient continu..."
8,yes,increase,patient pain increased 8/10 9/10 chest . reach...,patient pain increased 8/10 9/10 chest . reach...
9,yes,decrease,patient report feel like `` getting better .,patient report feel like `` getting better .
10,yes,decrease,patient denies pain shift .,patient denies pain shift .


In [118]:
encoder = LabelEncoder()
encoded_pain = list()
score = 0
for i,row in data.iterrows():
    if row[1] == 'decrease':
        score = 0
    elif row[1] == 'no change':
        score = 1
    elif row[1] == 'not sure':
        score = 2
    elif row[1] == 'increase':
        score = 3
    encoded_pain.append(score)

data['Pain_Change'] = encoded_pain

In [119]:
data.head(10)

Unnamed: 0,Pain_Relevance,Pain_Change,Raw_Text,Str_Text
0,yes,3,patient . arrived vdh pain management ivf awai...,patient . arrived vdh pain management ivf awai...
1,yes,2,patient continue pain control regimen 0/10 pain .,patient continue pain control regimen 0/10 pain .
2,yes,2,patient continue take pain medication without ...,patient continue take pain medication without ...
4,yes,3,patient remains 8-10/10 pain . pca initiated .,patient remains 8-10/10 pain . pca initiated .
5,yes,0,patient continues rate pain 8/10 morphine pca ...,patient continues rate pain 8/10 morphine pca ...
6,yes,1,pain remains 8/10 overnight pt appeared comfor...,pain remains 8/10 overnight pt appeared comfor...
7,yes,3,"continuous setting increased , patient continu...","continuous setting increased , patient continu..."
8,yes,3,patient pain increased 8/10 9/10 chest . reach...,patient pain increased 8/10 9/10 chest . reach...
9,yes,0,patient report feel like `` getting better .,patient report feel like `` getting better .
10,yes,0,patient denies pain shift .,patient denies pain shift .


In [120]:
len(encoded_pain)

280

In [121]:
data.head(10)

Unnamed: 0,Pain_Relevance,Pain_Change,Raw_Text,Str_Text
0,yes,3,patient . arrived vdh pain management ivf awai...,patient . arrived vdh pain management ivf awai...
1,yes,2,patient continue pain control regimen 0/10 pain .,patient continue pain control regimen 0/10 pain .
2,yes,2,patient continue take pain medication without ...,patient continue take pain medication without ...
4,yes,3,patient remains 8-10/10 pain . pca initiated .,patient remains 8-10/10 pain . pca initiated .
5,yes,0,patient continues rate pain 8/10 morphine pca ...,patient continues rate pain 8/10 morphine pca ...
6,yes,1,pain remains 8/10 overnight pt appeared comfor...,pain remains 8/10 overnight pt appeared comfor...
7,yes,3,"continuous setting increased , patient continu...","continuous setting increased , patient continu..."
8,yes,3,patient pain increased 8/10 9/10 chest . reach...,patient pain increased 8/10 9/10 chest . reach...
9,yes,0,patient report feel like `` getting better .,patient report feel like `` getting better .
10,yes,0,patient denies pain shift .,patient denies pain shift .


In [174]:
#Remove punctuation and vectorize text
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv_n1 = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
cv_n2 = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,2),tokenizer = token.tokenize)
cv_n3 = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,3),tokenizer = token.tokenize)
vectorized_data_n1 = cv_n1.fit_transform(data['Str_Text'].copy())
vectorized_data_n2 = cv_n2.fit_transform(data['Str_Text'].copy())
vectorized_data_n3 = cv_n3.fit_transform(data['Str_Text'].copy())

#encoder = LabelEncoder()
#data['Pain_Change'] = encoder.fit_transform(data['Pain_Change'].astype(str))

#Assign labels
y = data[['Pain_Change']]
y = np.ravel(y)

#Select k best features
kx1 = SelectKBest(chi2, k=10)
kx2 = SelectKBest(chi2, k=10)
kx3 = SelectKBest(chi2, k=10)

#Get k best features and assign to feature vectors
x1 = kx1.fit_transform(vectorized_data_n1, y)
x2 = kx2.fit_transform(vectorized_data_n2, y)
x3 = kx3.fit_transform(vectorized_data_n3, y)

x1.shape
x2.shape
x3.shape

(280, 10)

In [175]:
#Display all features for each n-gram model
feature_names_n1 = cv_n1.get_feature_names()
bow_features_n1 = pd.DataFrame(vectorized_data_n1.toarray(), columns = feature_names_n1)
#print(bow_features_n1)

feature_names_n2 = cv_n2.get_feature_names()
bow_features_n2 = pd.DataFrame(vectorized_data_n2.toarray(), columns = feature_names_n2)
#print(bow_features_n2)

feature_names_n3 = cv_n3.get_feature_names()
bow_features_n3 = pd.DataFrame(vectorized_data_n3.toarray(), columns = feature_names_n3)
#print(bow_features_n3)

In [176]:
#Show k-best features for each n-gram model

mask = kx1.get_support(indices=True)
new_features1 = bow_features_n1.columns[mask]
#print(new_features1)

mask = kx2.get_support(indices=True)
new_features2 = bow_features_n2.columns[mask]
#print(new_features2)

mask = kx3.get_support(indices=True)
new_features3 = bow_features_n3.columns[mask]
#print(new_features3)

In [203]:
#Set test ratio and get training and testing sets
test_ratio = 0.3
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y, test_size=test_ratio, shuffle=True, random_state=1)
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y, test_size=test_ratio, shuffle=True, random_state=1)
x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y, test_size=test_ratio, shuffle=True, random_state=1)

print("Number of unigram features: ", x1_train.shape)
print("Number of bigram features: ", x2_train.shape)
print("Number of trigram features: ", x3_train.shape)

Number of unigram features:  (196, 10)
Number of bigram features:  (196, 10)
Number of trigram features:  (196, 10)


In [184]:
#Train and evaluate neural network model---Ordinal classification
mlp = MLPClassifier(hidden_layer_sizes=(5,5), max_iter=10000, random_state=42)
#mlp = MLPClassifier(hidden_layer_sizes=(5,5), max_iter=10000)
mlp = mlp.fit(x1_train, y1_train)
train_pred = mlp.predict(x1_train)
test_pred = mlp.predict(x1_test)

#evaluate PRF for Ordinal Classification using MLP
print(precision_recall_f_compute(test_pred, y1_test))

(0.8055555555555556, 0.5918367346938775, 0.6823529411764706)


In [185]:
#Train and evaluate neural network model---Ordinal classification
mlp = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=10000, random_state=42)
#mlp = MLPClassifier(hidden_layer_sizes=(5,5), max_iter=10000)
mlp = mlp.fit(x1_train, y1_train)
train_pred = mlp.predict(x1_train)
test_pred = mlp.predict(x1_test)

#evaluate PRF for Ordinal Classification using MLP
print(precision_recall_f_compute(test_pred, y1_test))

(0.7692307692307693, 0.6382978723404256, 0.6976744186046512)


In [186]:
#Train evaluate an ordinal logistic classification model
#Train a Logistic Regression Classifier
param_grid_lr = {
         'C': [1.0, 1e2, 1e3, 5e3, 1e4, 5e4, 1e5],
         'solver': ('newton-cg', 'lbfgs', 'sag', 'saga')
          }
lr = GridSearchCV(LogisticRegression(random_state=42, multi_class='multinomial', max_iter=40000), param_grid_lr, cv=5)
#lr = GridSearchCV(LogisticRegression(multi_class='multinomial', max_iter=40000), param_grid_lr, cv=5)
lr = lr.fit(x1_train, y1_train)
lr_train_pred = lr.predict(x1_train)
lr_test_pred = lr.predict(x1_test)

#evaluate PRF for Ordinal Classification using MLP
print(precision_recall_f_compute(lr_test_pred, y1_test))

(0.7837837837837838, 0.6041666666666666, 0.6823529411764706)


In [187]:
#Decision Tree classifier for ordinal classification

dt = DecisionTreeClassifier(random_state=42)
#dt = DecisionTreeClassifier()
dt = dt.fit(x1_train, y1_train)
dt_train_pred = dt.predict(x1_train)
dt_test_pred = dt.predict(x1_test)

#evaluate PRF for Ordinal Classification using decision tree
print(precision_recall_f_compute(dt_test_pred, y1_test))

(0.8378378378378378, 0.62, 0.7126436781609196)


In [188]:
#Train a Random Forest Classifier---Ordinal classification
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=42)
#rf = RandomForestClassifier(n_estimators=100, max_depth=2)
rf = rf.fit(x1_train, y1_train)
rf_train_pred = rf.predict(x1_train)
rf_test_pred = rf.predict(x1_test)

#evaluate PRF for Ordinal Classification using decision trees
print(precision_recall_f_compute(rf_test_pred, y1_test))

(0.8484848484848485, 0.5490196078431373, 0.6666666666666667)


# Using LDA topics as features to ML models

In [204]:
#Working with LDA topics as features to ML

from nltk.corpus import stopwords
import gensim
import pickle
import gensim
import pyLDAvis
import pyLDAvis.gensim
#import spacy
import pandas as pd
import nltk; nltk.download('stopwords')
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
import warnings
from pprint import pprint
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
%config InlineBackend.figure_formats = ['retina']
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score
import matplotlib.pyplot as plt

stop_words = stopwords.words('english')
stop_words.extend(['come','order','try','go','get','make','drink','plate','dish','restaurant','place',
                  'would','really','like','great','service','came','got'])


def strip_newline(series):
    return [str(review).replace('\n','') for review in series]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod
    
def get_corpus(df):
    df['text'] = strip_newline(df.text)
    words = list(sent_to_words(df.text))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

rev_train = data.copy()


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amanuel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [205]:
rev_train = rev_train.rename(columns={"Str_Text": "text"})

In [206]:
train_corpus4, train_id2word4, bigram_train4 = get_corpus(rev_train)

In [207]:
lda_train4 = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus4,
                           num_topics=2,
                           id2word=train_id2word4,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)

In [208]:
lda_train4.print_topics(2,num_words=10)

[(0,
  '0.184*"pca" + 0.150*"cope_supported" + 0.089*"shift" + 0.070*"collaborate" + 0.069*"continues" + 0.067*"dose" + 0.054*"chest" + 0.050*"demand" + 0.050*"remains" + 0.044*"rate"'),
 (1,
  '0.145*"medication" + 0.081*"intervention" + 0.081*"see_emar" + 0.081*"satisfied" + 0.064*"knowledge_deficit" + 0.063*"state_carry" + 0.063*"method" + 0.062*"met_continue" + 0.052*"regarding" + 0.051*"knowledge"')]

In [209]:
train_vecs = []
for i in range(len(rev_train)):
    top_topics = lda_train4.get_document_topics(train_corpus4[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(2)]
    #topic_vec.extend([rev_train.iloc[i].count()]) # counts of notes
    topic_vec.extend([len(rev_train.iloc[i].text)]) # length of notes
    train_vecs.append(topic_vec)

In [210]:
train_vecs[1]

[0.5, 0.5, 49]

In [211]:
X = np.array(train_vecs)

In [212]:
x1_array = csr_matrix(x1, dtype=np.int8).toarray()

In [213]:
X_cumulative = list()
for i,j in zip(x1_array,X):
    X_cumulative.append(np.concatenate((i, j), axis=0))

In [214]:
x_cum_train, x_cum_test, y1_train, y1_test = train_test_split(X_cumulative, y, test_size=test_ratio, shuffle=True, random_state=1)

In [215]:
print(y1_test)

[3 0 0 0 0 2 0 0 1 0 2 1 3 1 3 0 0 3 2 1 1 0 3 0 0 0 2 1 0 1 3 1 3 0 0 0 1
 0 3 3 0 1 1 0 0 0 1 3 0 2 0 2 0 3 0 2 1 0 3 1 2 1 0 0 1 1 2 1 2 0 2 3 2 2
 0 0 0 0 0 1 1 1 0 1]


In [216]:
#For bigram based analysis
x2_array = csr_matrix(x2, dtype=np.int8).toarray()

X_cumulative = list()
for i,j in zip(x2_array,X):
    X_cumulative.append(np.concatenate((i, j), axis=0))

x_cum_train, x_cum_test, y2_train, y2_test = train_test_split(X_cumulative, y, test_size=test_ratio, shuffle=True, random_state=1)


## Train evaluate a Logistic Regression Classifier for concatenated n-grams and lda topics as features

In [150]:
#Train evaluate a Logistic Regression Classifier for concatenated n-grams and lda topics as features
param_grid_lr = {
         'C': [1.0, 1e2, 1e3, 5e3, 1e4, 5e4, 1e5],
         'solver': ('newton-cg', 'lbfgs', 'sag', 'saga')
          }
lr = GridSearchCV(LogisticRegression(random_state=42, multi_class='multinomial', max_iter=40000), param_grid_lr, cv=5)
#lr = GridSearchCV(LogisticRegression(multi_class='multinomial', max_iter=40000), param_grid_lr, cv=5)
lr = lr.fit(x_cum_train, y1_train)
lr_train_pred = lr.predict(x_cum_train)
lr_test_pred = lr.predict(x_cum_test)


#evaluate PRF for Ordinal Classification using logistic regression
print(precision_recall_f_compute(lr_test_pred, y1_test))

(0.7547169811320755, 0.5633802816901409, 0.6451612903225807)


## Train evaluate a Decision Tree Classifier for concatenated n-grams and lda topics as features

In [217]:
#Train evaluate a Decision Tree Classifier for concatenated n-grams and lda topics as features
dt = DecisionTreeClassifier(random_state=42)
#dt = DecisionTreeClassifier()
dt = dt.fit(x_cum_train, y1_train)
dt_train_pred = dt.predict(x_cum_train)
dt_test_pred = dt.predict(x_cum_test)

#evaluate PRF for Ordinal Classification using decision tree
print(precision_recall_f_compute(dt_test_pred, y1_test))

(0.6551724137931034, 0.59375, 0.6229508196721311)


## Train evaluate a Random Forest for concatenated n-grams and lda topics as features

In [169]:
#Train evaluate a Random Forest for concatenated n-grams and lda topics as features
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=42)
#rf = RandomForestClassifier(n_estimators=100, max_depth=2)
rf = rf.fit(x_cum_train, y2_train)
rf_train_pred = rf.predict(x_cum_train)
rf_test_pred = rf.predict(x_cum_test)

#evaluate PRF for Ordinal Classification using random forest
print(precision_recall_f_compute(rf_test_pred, y1_test))

(0.8444444444444444, 0.4935064935064935, 0.6229508196721312)


## Train evaluate a Feedforward Network for concatenated n-grams and lda topics as features

In [170]:
#Train evaluate a Feedforward Network for concatenated n-grams and lda topics as features
mlp = MLPClassifier(hidden_layer_sizes=(5,5), max_iter=10000, random_state=42)
#mlp = MLPClassifier(hidden_layer_sizes=(5,5), max_iter=10000)
mlp = mlp.fit(x_cum_train, y1_train)
train_pred = mlp.predict(x_cum_train)
test_pred = mlp.predict(x_cum_test)

#evaluate PRF for Ordinal Classification using random forest
print(precision_recall_f_compute(test_pred, y1_test))

(0.8222222222222222, 0.4868421052631579, 0.6115702479338843)


## Working with ML models using only lda topics as features

In [171]:
#traing ML models with lda topics as features
encoder = LabelEncoder()
rev_train['Pain_Change'] = encoder.fit_transform(data['Pain_Change'].astype(str))

#y = data[['Pain_Relevance']]
y = np.array(rev_train.Pain_Change)

### lda features as input for ordinal classification

In [172]:
#lda features as input for ordinal classification

classes = ['decrease','no change', 'not sure', 'increase']

kf = KFold(5, shuffle=True, random_state=42)
cv_lr_f1, cv_lrsgd_f1, cv_svcsgd_f1, cv_dt_f1, cv_mlp_f1, cv_rf_f1  = [], [], [], [], [], []
cv_lr_precision, cv_lrsgd_precision, cv_svcsgd_precision, cv_dt_precision, cv_mlp_precision, cv_rf_precision = [], [], [], [],[], []
cv_lr_recall, cv_lrsgd_recall, cv_svcsgd_recall, cv_dt_recall, cv_mlp_recall, cv_rf_recall = [], [], [], [],[], []


for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    
    
    cv_lr_f1.append(precision_recall_f_compute(y_pred, y_val)[2])
    cv_lr_precision.append(precision_recall_f_compute(y_pred, y_val)[0])
    cv_lr_recall.append(precision_recall_f_compute(y_pred, y_val)[1])
    
    
    # Logistic Regression Mini-Batch SGD
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)
    
    y_pred = sgd.predict(X_val_scale)
    
    cv_lrsgd_f1.append(precision_recall_f_compute(y_pred, y_val)[2])
    cv_lrsgd_precision.append(precision_recall_f_compute(y_pred, y_val)[0])
    cv_lrsgd_recall.append(precision_recall_f_compute(y_pred, y_val)[1])
    
    # Decision tree classifier
    dt = DecisionTreeClassifier(random_state=42).fit(X_train_scale, y_train)
    
    y_pred = dt.predict(X_val_scale)
    #cv_dt_f1.append(f1_score(y_val, y_pred, average='weighted'))
    #cv_dt_precision.append(precision_score(y_val, y_pred, average='weighted'))
    #cv_dt_recall.append(recall_score(y_val, y_pred, average='weighted'))
    
    
    cv_dt_f1.append(precision_recall_f_compute(y_pred, y_val)[2])
    cv_dt_precision.append(precision_recall_f_compute(y_pred, y_val)[0])
    cv_dt_recall.append(precision_recall_f_compute(y_pred, y_val)[1])
    
    
    # MLP Classifier
    mlp = MLPClassifier(
        hidden_layer_sizes=(50,50), max_iter=10000, random_state=42).fit(X_train_scale, y_train)
    y_pred = dt.predict(X_val_scale)
    #cv_mlp_f1.append(f1_score(y_val, y_pred, average='weighted'))
    #cv_mlp_precision.append(precision_score(y_val, y_pred, average='weighted'))
    #cv_mlp_recall.append(recall_score(y_val, y_pred, average='weighted'))
    
    cv_mlp_f1.append(precision_recall_f_compute(y_pred, y_val)[2])
    cv_mlp_precision.append(precision_recall_f_compute(y_pred, y_val)[0])
    cv_mlp_recall.append(precision_recall_f_compute(y_pred, y_val)[1])
    
    
    #Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=42).fit(X_train_scale, y_train)
    
    y_pred = rf.predict(X_val_scale)
    #cv_rf_f1.append(f1_score(y_val, y_pred, average='weighted'))
    #cv_rf_precision.append(precision_score(y_val, y_pred, average='weighted'))
    #cv_rf_recall.append(recall_score(y_val, y_pred, average='weighted'))
    
    cv_rf_f1.append(precision_recall_f_compute(y_pred, y_val)[2])
    cv_rf_precision.append(precision_recall_f_compute(y_pred, y_val)[0])
    cv_rf_recall.append(precision_recall_f_compute(y_pred, y_val)[1])
    

print(f'Logistic Regression Val f1: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
print(f'Logisitic Regression SGD Val f1: {np.mean(cv_lrsgd_f1):.3f} +- {np.std(cv_lrsgd_f1):.3f}')
#print(f'SVM Huber Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}')
print(f'Decision Tree Val f1: {np.mean(cv_dt_f1):.3f} +- {np.std(cv_dt_f1):.3f}')
print(f'MLP Val f1: {np.mean(cv_mlp_f1):.3f} +- {np.std(cv_mlp_f1):.3f}')
print(f'Random Forest Val f1: {np.mean(cv_rf_f1):.3f} +- {np.std(cv_rf_f1):.3f}')

print('\n')


print(f'Logistic Regression Val precision: {np.mean(cv_lr_precision):.3f} +- {np.std(cv_lr_precision):.3f}')
print(f'Logisitic Regression SGD Val precision: {np.mean(cv_lrsgd_precision):.3f} +- {np.std(cv_lrsgd_precision):.3f}')
#print(f'SVM Huber Val precision: {np.mean(cv_svcsgd_precision):.3f} +- {np.std(cv_svcsgd_precision):.3f}')
print(f'Decision Tree Val precision: {np.mean(cv_dt_precision):.3f} +- {np.std(cv_dt_precision):.3f}')
print(f'MLP Val precision: {np.mean(cv_mlp_precision):.3f} +- {np.std(cv_mlp_precision):.3f}')
print(f'Random Forest Val precision: {np.mean(cv_rf_precision):.3f} +- {np.std(cv_rf_precision):.3f}')

print('\n')
print(f'Logistic Regression Val recall: {np.mean(cv_lr_recall):.3f} +- {np.std(cv_lr_recall):.3f}')
print(f'Logisitic Regression SGD Val recall: {np.mean(cv_lrsgd_recall):.3f} +- {np.std(cv_lrsgd_recall):.3f}')
#print(f'SVM Huber Val recall: {np.mean(cv_svcsgd_recall):.3f} +- {np.std(cv_svcsgd_recall):.3f}')
print(f'Decision Tree Val recall: {np.mean(cv_dt_recall):.3f} +- {np.std(cv_dt_recall):.3f}')
print(f'MLP Val recall: {np.mean(cv_mlp_recall):.3f} +- {np.std(cv_mlp_recall):.3f}')
print(f'Random Forest Val recall: {np.mean(cv_rf_recall):.3f} +- {np.std(cv_rf_recall):.3f}')


Logistic Regression Val f1: 0.592 +- 0.083
Logisitic Regression SGD Val f1: 0.380 +- 0.097
Decision Tree Val f1: 0.693 +- 0.076
MLP Val f1: 0.693 +- 0.076
Random Forest Val f1: 0.678 +- 0.074


Logistic Regression Val precision: 0.648 +- 0.110
Logisitic Regression SGD Val precision: 0.323 +- 0.111
Decision Tree Val precision: 0.717 +- 0.058
MLP Val precision: 0.717 +- 0.058
Random Forest Val precision: 0.987 +- 0.016


Logistic Regression Val recall: 0.553 +- 0.091
Logisitic Regression SGD Val recall: 0.494 +- 0.080
Decision Tree Val recall: 0.674 +- 0.104
MLP Val recall: 0.674 +- 0.104
Random Forest Val recall: 0.522 +- 0.081
