In [None]:
import pandas as pd
import numpy as np
import string
import re
import nltk

In [None]:
# Loading our data: the filtered case and control note tables, read from files
# addressed relative to the working directory.
# NOTE(review): the cases filename ends in '.cvs' while the controls filename ends
# in '.csv' — confirm this matches the file on disk and is not a typo.
cases = pd.read_csv('22_12_8_cases_filtered.cvs')
control = pd.read_csv('22_12_8_control_filtered.csv')

In [None]:
### number of distinct mrn_int values (unique patient identifiers) in the controls
control['mrn_int'].nunique()

In [None]:
### number of distinct mrn_int values (unique patient identifiers) in the cases
cases['mrn_int'].nunique()

# Cases

In [None]:
# Keep only the patient identifier and the note text. `.copy()` makes the result an
# independent frame, so adding columns to `cases` in later cells does not trigger
# pandas' SettingWithCopyWarning.
cases = cases[['mrn_int', 'note_text']].copy()

In [None]:
# display the cases frame (at this point only the mrn_int and note_text columns)
cases

In [None]:
### We want to split the note_texts so that history of present illness is separate
### from the physical exam info

# n=1 splits at the first 'Physical Exam' marker only, so a note that repeats the
# marker still yields exactly two parts (everything after the first marker stays
# in the exam part). reindex pads the second part with NaN when no note contains
# the marker, so the result always has exactly two columns.
note_parts = (
    cases['note_text']
    .str.split('Physical Exam', n=1, expand=True)
    .reindex(columns=range(2))
)

# new columns; assign builds a new frame instead of writing into a slice of the
# loaded data
cases = cases.assign(history=note_parts[0], exam=note_parts[1])

## Cases - original text notes combined

In [None]:
# Collapse the cases to one row per mrn_int, joining each patient's full
# original notes with newlines.
# NOTE(review): '\n'.join fails on non-string values — presumably note_text has
# no missing entries; confirm.
cases_original_combined = (
    cases
    .groupby('mrn_int', as_index=False)
    .agg({'note_text': '\n'.join})
)

In [None]:
# Label every combined case row with the "ctcl" target (CTCL patients)
cases_original_combined = cases_original_combined.assign(target="ctcl")

In [None]:
# display the per-mrn_int combined original notes for the cases
cases_original_combined

## Cases - only history

In [None]:
# Collapse the cases to one row per mrn_int, joining each patient's history
# sections with newlines.
cases_history_combined = (
    cases
    .groupby('mrn_int', as_index=False)
    .agg({'history': '\n'.join})
)

In [None]:
# Label every combined case row with the "ctcl" target (CTCL patients)
cases_history_combined = cases_history_combined.assign(target="ctcl")

In [None]:
# display the per-mrn_int combined history text for the cases
cases_history_combined

## Cases - only exam notes

In [None]:
# Keep only the rows whose exam part is present (notes without an exam part
# have a missing value in `exam`).
exam_data = cases[['mrn_int', 'exam']].dropna(subset=['exam'])

# One row per mrn_int, with that patient's exam sections joined by newlines.
cases_exam_combined = (
    exam_data
    .groupby('mrn_int', as_index=False)
    .agg({'exam': '\n'.join})
)

In [None]:
# Label every combined case row with the "ctcl" target
cases_exam_combined = cases_exam_combined.assign(target="ctcl")

In [None]:
# display the per-mrn_int combined exam text for the cases
cases_exam_combined

# Controls

In [None]:
# Keep only the patient identifier and the note text. `.copy()` makes the result an
# independent frame, so adding columns to `control` in later cells does not trigger
# pandas' SettingWithCopyWarning.
control = control[['mrn_int', 'note_text']].copy()

In [None]:
# display the control frame (at this point only the mrn_int and note_text columns)
control

In [None]:
### We want to split the note_texts so that history of present illness is separate
### from the physical exam info

# n=2 caps the split at three parts: text before the first 'Physical Exam' marker,
# text between the first and second marker, and everything after the second marker.
# reindex pads missing parts with NaN, so the result always has exactly three
# columns regardless of how often the marker appears in the notes.
# NOTE(review): the cases split targets two columns (history, exam) while controls
# use a third 'extra' column — confirm this asymmetry is intended.
note_parts = (
    control['note_text']
    .str.split('Physical Exam', n=2, expand=True)
    .reindex(columns=range(3))
)

# new columns; assign builds a new frame instead of writing into a slice of the
# loaded data
control = control.assign(
    history=note_parts[0],
    exam=note_parts[1],
    extra=note_parts[2],
)

In [None]:
# display the control frame, now including the history, exam and extra columns
control

## Control original notes

In [None]:
# One row per mrn_int: join each control patient's full original notes with
# newlines, then tag every row with the "other" target.
control_original_combined = (
    control
    .groupby('mrn_int', as_index=False)
    .agg({'note_text': '\n'.join})
    .assign(target="other")
)

In [None]:
# display the per-mrn_int combined original notes for the controls
control_original_combined

## Control only history notes

In [None]:
# One row per mrn_int: join each control patient's history sections with
# newlines, then tag every row with the "other" target.
control_history_combined = (
    control
    .groupby('mrn_int', as_index=False)
    .agg({'history': '\n'.join})
    .assign(target="other")
)

In [None]:
# display the per-mrn_int combined history text for the controls
control_history_combined

## Control only exam notes

In [None]:
# Keep only the rows whose exam part is present (notes without an exam part
# have a missing value in `exam`).
exam_data = control[['mrn_int', 'exam']].dropna(subset=['exam'])

# One row per mrn_int: join each control patient's exam sections with newlines,
# then tag every row with the "other" target.
control_exam_combined = (
    exam_data
    .groupby('mrn_int', as_index=False)
    .agg({'exam': '\n'.join})
    .assign(target="other")
)

In [None]:
# display the per-mrn_int combined exam text for the controls
control_exam_combined

# Merging cases and controls and performing all preprocessing needed

In [None]:
# functions to be used:
### Remove punctuations
punct_to_remove = string.punctuation

# Build the deletion table once: remove_punct is applied to every row, and the
# set of characters to strip does not change between calls.
_PUNCT_TABLE = str.maketrans('', '', punct_to_remove)


def remove_punct(text):
    """Return `text` with every character listed in `punct_to_remove` deleted.

    Characters are removed outright, not replaced by a space, so "a-b" becomes
    "ab". `text` must be a str.
    """
    return text.translate(_PUNCT_TABLE)

In [None]:
### Removing stop words
from nltk.corpus import stopwords

# The stop-word corpus is a separate NLTK download. Accessing it raises LookupError
# when it is not installed, so fetch it on demand; this lets the notebook run on a
# fresh environment.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# This is the list of words that will be removed
", ".join(stopwords.words('english'))


In [None]:
### function to remove stop words
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in STOPWORDS dropped.

    The input is converted with str() first, tokens are compared exactly
    (case-sensitive), and the surviving tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)

## Original notes

In [None]:
# Combining cases and control into one dataframe.
# ignore_index=True gives the stacked rows a fresh 0..n-1 index; both inputs come
# from groupby(as_index=False), so their own indexes each start at 0 and would
# otherwise leave duplicate index labels in the combined frame.
original_combined = pd.concat(
    [control_original_combined, cases_original_combined],
    axis=0,
    ignore_index=True,
)

In [None]:
### Normalise the note text in one pass:
###   1. force every entry to a string
###   2. set everything to lower case
###   3. remove punctuation
###   4. remove stop words
# NOTE(review): punctuation is stripped before the stop-word pass, so a token such
# as "don't" reaches remove_stopwords as "dont" — confirm STOPWORDS still matches
# such tokens.
original_combined['note_text'] = (
    original_combined['note_text']
    .astype(str)
    .str.lower()
    .apply(remove_punct)
    .apply(remove_stopwords)
)
original_combined

In [None]:
### saving pre_processed data for future use
# written relative to the working directory; index=False leaves the row index out of the file
original_combined.to_csv('22_12_08_original_data_preprocessed.csv', index = False)

## History notes

In [None]:
# Combining cases and control into one dataframe.
# ignore_index=True gives the stacked rows a fresh 0..n-1 index; both inputs come
# from groupby(as_index=False), so their own indexes each start at 0 and would
# otherwise leave duplicate index labels in the combined frame.
history_combined = pd.concat(
    [control_history_combined, cases_history_combined],
    axis=0,
    ignore_index=True,
)

In [None]:
# display the stacked control + case history rows before text preprocessing
history_combined

In [None]:
### Normalise the history text in one pass:
###   1. force every entry to a string
###   2. set everything to lower case
###   3. remove punctuation
###   4. remove stop words
# NOTE(review): punctuation is stripped before the stop-word pass, so a token such
# as "don't" reaches remove_stopwords as "dont" — confirm STOPWORDS still matches
# such tokens.
history_combined['history'] = (
    history_combined['history']
    .astype(str)
    .str.lower()
    .apply(remove_punct)
    .apply(remove_stopwords)
)
history_combined

In [None]:
### saving pre_processed data for future use
# written relative to the working directory; index=False leaves the row index out of the file
history_combined.to_csv('22_12_08_history_data_preprocessed.csv', index = False)

## Exam notes

In [None]:
# Combining cases and control into one dataframe.
# ignore_index=True gives the stacked rows a fresh 0..n-1 index; both inputs come
# from groupby(as_index=False), so their own indexes each start at 0 and would
# otherwise leave duplicate index labels in the combined frame.
exam_combined = pd.concat(
    [control_exam_combined, cases_exam_combined],
    axis=0,
    ignore_index=True,
)
exam_combined

In [None]:
### Normalise the exam text in one pass:
###   1. force every entry to a string
###   2. set everything to lower case
###   3. remove punctuation
###   4. remove stop words
# NOTE(review): punctuation is stripped before the stop-word pass, so a token such
# as "don't" reaches remove_stopwords as "dont" — confirm STOPWORDS still matches
# such tokens.
exam_combined['exam'] = (
    exam_combined['exam']
    .astype(str)
    .str.lower()
    .apply(remove_punct)
    .apply(remove_stopwords)
)
exam_combined

In [None]:
### saving pre_processed data for future use
# written relative to the working directory; index=False leaves the row index out of the file
exam_combined.to_csv('22_12_08_exam_data_preprocessed.csv', index = False)