In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk


In [24]:
# Loading our data
control = pd.read_csv('control_filtered.csv')
cases = pd.read_csv('cases_filtered.csv')

In [25]:
### Strange - number of control patients does not equal what was expected
control['mrn'].nunique()

2300

In [26]:
### Strange - number of cases patients does not equal what was expected
cases['mrn'].nunique()

201

### joining all notes by patient 

In [29]:
# Fist we will combined all teh control and case notes notes by patient into one note by patient
control = control.groupby(['mrn'], as_index = False).agg({'note_text' : '\n'.join})
cases = cases.groupby(['mrn'], as_index = False).agg({'note_text' : '\n'.join})

In [31]:
# Add a target column to the dataframes to note them as ctcl pateints or controls
cases['target'] = "ctcl"
control['target'] = "other"

In [33]:
# Combining cases and control into one dataframe
all_data = pd.concat([control, cases], axis = 0)

In [35]:
# Lets save this as a csv file in case we need it for future reference 
all_data.to_csv('nlp_all_data.csv')

### preprocessing

In [36]:
### Make sure the items in the notes column are string values
all_data['note_text'] = all_data['note_text'].astype(str)

In [38]:
### set everything to lower case
all_data['note_text'] = all_data['note_text'].str.lower()

In [40]:
### Remove punctuations
punct_to_remove = string.punctuation

def remove_punct(text):
    return text.translate(str.maketrans('', '', punct_to_remove))

all_data['note_text'] = all_data['note_text'].apply(lambda text: remove_punct(text))

In [41]:
all_data

Unnamed: 0,mrn,note_text,target
0,1000005204,final history of present illness syl...,other
1,1000009671,final referring provider dr orli eting...,other
2,1000016467,amended final history of present illness ...,other
3,1000020839,final history of present illness co ...,other
4,1000026299,final history of present illness rog...,other
...,...,...,...
196,1400439110,follow up visit note 1132022 chief c...,ctcl
197,1400442752,history of present illness chris ward is a 62...,ctcl
198,1400480255,i saw and evaluated the patient in the presenc...,ctcl
199,1400604108,history of present illness vishnu pushpakaran...,ctcl


In [44]:
### Removing stop words
from nltk.corpus import stopwords

# This is the list of words thhat will be removed
", ".join(stopwords.words('english'))


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [47]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

all_data['note_text'] = all_data['note_text'].apply(lambda text: remove_stopwords(text))
all_data

Unnamed: 0,mrn,note_text,target
0,1000005204,final history present illness sylvia fuhrman 9...,other
1,1000009671,final referring provider dr orli etingin 21274...,other
2,1000016467,amended final history present illness melissa ...,other
3,1000020839,final history present illness co flare rosacea...,other
4,1000026299,final history present illness roger altman 66 ...,other
...,...,...,...
196,1400439110,follow visit note 1132022 chief complaint 1 sk...,ctcl
197,1400442752,history present illness chris ward 62 year old...,ctcl
198,1400480255,saw evaluated patient presence resident provid...,ctcl
199,1400604108,history present illness vishnu pushpakaran pil...,ctcl


In [48]:
### saving pre_processed data for future use
all_data.to_csv('nlp_all_data_preprocessed.csv')