In [4]:
# Import dependencies
import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [113]:
# Read in file
#file_path = Path('/Tables/free_text_df.csv')
df = pd.read_csv('Tables/free_text_df.csv')

In [97]:
df.head(10)

Unnamed: 0,Rank,OrgStudyId,WhyStopped,EnrollmentCount,PrimaryOutcomeMeasure,FlowDropWithdrawType
0,1,BTX-BCI-016-PRT,,3000,To determine BCI test performance by evaluatin...,
1,2,2018-TJ-BCD,,2300,Diagnostic potential of SEMA4C as a biomarker ...,
2,3,Breast cancer,,80,Role of SORCIN in patients with breast cancer,
3,4,BC-BOMET,,30,SENP1 expression,
4,5,241391,,600,Performance of the Syantra DX Breast Cancer te...,
5,6,IL-TM-B1-01,,200,This study is intended to evaluate the sensiti...,
6,7,FH-Risk 2.0 Research Protocol,,271,To explore how much new risk models change bre...,
7,8,ID-RPSBC-01-20201012,,316,Absolute risk difference between breast cancer...,
8,9,IRST174.22,,60000,To compare the cumulative incidence of stage 2...,
9,10,ANILERGİNN,,300,breast cancer incidence after laparoscopic sle...,


In [12]:
df['WhyStopped'].unique()

array([nan, 'Pandemic situation',
       'Technical problem with plasma blood samples obtained from the patients',
       'study did not start and is currently on pause',
       'Principal investigator left the study institution.',
       'The study was stopped prematurely due to insufficient recruitment',
       'No participants enrolled',
       'PI no longer working at Indiana University;',
       'Temporarily paused per study team for interim data review.',
       'Study classified as out of scope by the Ethics Committee (not a project involving human person).',
       'Enrollment into AWARE cohorts1-4 have concluded and the primary objective and core goals for the study were met.',
       'Slow recruitment rate',
       'Sponsor decision to prematurely stop the study, not linked to any safety concern',
       'One participant was accrued, and the study was stopped due to new safety data from the company for M7824 and slow accrual.',
       'The researcher who was able to recruit t

In [114]:
# Extract WhyStopped column and drop null values
df_text = (df['WhyStopped'])
df_text = df_text.dropna()
df_text

32                                     Pandemic situation
47      Technical problem with plasma blood samples ob...
50          study did not start and is currently on pause
54      Principal investigator left the study institut...
84      The study was stopped prematurely due to insuf...
                              ...                        
4967    Technical problem with blood plasma samples ob...
4968    Evolving data with Ipatasertib that changes th...
4970    Study is part of PhD trajectory and currently ...
4981                                           no funding
4986              sponsor on campus training restrictions
Name: WhyStopped, Length: 320, dtype: object

In [115]:
# Tokenize words from WhyStopped responses
from nltk.tokenize import sent_tokenize, word_tokenize
words = str(df_text)
stopped_text = word_tokenize(words)
print(stopped_text)

['32', 'Pandemic', 'situation', '47', 'Technical', 'problem', 'with', 'plasma', 'blood', 'samples', 'ob', '...', '50', 'study', 'did', 'not', 'start', 'and', 'is', 'currently', 'on', 'pause', '54', 'Principal', 'investigator', 'left', 'the', 'study', 'institut', '...', '84', 'The', 'study', 'was', 'stopped', 'prematurely', 'due', 'to', 'insuf', '...', '...', '4967', 'Technical', 'problem', 'with', 'blood', 'plasma', 'samples', 'ob', '...', '4968', 'Evolving', 'data', 'with', 'Ipatasertib', 'that', 'changes', 'th', '...', '4970', 'Study', 'is', 'part', 'of', 'PhD', 'trajectory', 'and', 'currently', '...', '4981', 'no', 'funding', '4986', 'sponsor', 'on', 'campus', 'training', 'restrictions', 'Name', ':', 'WhyStopped', ',', 'Length', ':', '320', ',', 'dtype', ':', 'object']


In [103]:
# Import stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alejandra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [116]:
stopWords = set(stopwords.words('english'))

In [117]:
# Loop through tokenized words to filter out stopwords and append to new list
filtered_text = []

for w in stopped_text:
    if w not in stopWords:
        filtered_text.append(w)

filtered_text

['32',
 'Pandemic',
 'situation',
 '47',
 'Technical',
 'problem',
 'plasma',
 'blood',
 'samples',
 'ob',
 '...',
 '50',
 'study',
 'start',
 'currently',
 'pause',
 '54',
 'Principal',
 'investigator',
 'left',
 'study',
 'institut',
 '...',
 '84',
 'The',
 'study',
 'stopped',
 'prematurely',
 'due',
 'insuf',
 '...',
 '...',
 '4967',
 'Technical',
 'problem',
 'blood',
 'plasma',
 'samples',
 'ob',
 '...',
 '4968',
 'Evolving',
 'data',
 'Ipatasertib',
 'changes',
 'th',
 '...',
 '4970',
 'Study',
 'part',
 'PhD',
 'trajectory',
 'currently',
 '...',
 '4981',
 'funding',
 '4986',
 'sponsor',
 'campus',
 'training',
 'restrictions',
 'Name',
 ':',
 'WhyStopped',
 ',',
 'Length',
 ':',
 '320',
 ',',
 'dtype',
 ':',
 'object']

In [109]:
from sklearn.feature_extraction.text import CountVectorizer

In [110]:
vec = CountVectorizer()
X = vec.fit_transform(filtered_text)

In [111]:
df_filtered = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df_filtered.head()

Unnamed: 0,32,320,47,4967,4968,4970,4981,4986,50,54,...,sponsor,start,stopped,study,technical,th,the,training,trajectory,whystopped
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
df_text_stopped = df_text_stopped.reset_index(drop=True)
df_text_stopped

0                                     Pandemic situation
1      Technical problem with plasma blood samples ob...
2          study did not start and is currently on pause
3      Principal investigator left the study institut...
4      The study was stopped prematurely due to insuf...
                             ...                        
315    Technical problem with blood plasma samples ob...
316    Evolving data with Ipatasertib that changes th...
317    Study is part of PhD trajectory and currently ...
318                                           no funding
319              sponsor on campus training restrictions
Name: WhyStopped, Length: 320, dtype: object

In [70]:
df_text_stopped.dtypes

dtype('O')

In [71]:
# Change datatype to string
df_text_stopped = str(df_text_stopped)
print(type(df_text_stopped))

<class 'str'>


In [72]:
# Lowercase
print(df_text.lower())

                                             whystopped
32                                   pandemic situation
47    technical problem with plasma blood samples ob...
50        study did not start and is currently on pause
54    principal investigator left the study institut...
84    the study was stopped prematurely due to insuf...
...                                                 ...
4967  technical problem with blood plasma samples ob...
4968  evolving data with ipatasertib that changes th...
4970  study is part of phd trajectory and currently ...
4981                                         no funding
4986            sponsor on campus training restrictions

[320 rows x 1 columns]


In [89]:
from collections import Counter
import re

text = str(df_text_stopped)
 
words = re.findall('\w+', text)
freq = Counter(words).most_common()
for word,count in freq:
    print(f'{word:<4} {"-->":^4} {count:>4}')


with -->     3
study -->     3
Technical -->     2
problem -->     2
plasma -->     2
blood -->     2
samples -->     2
ob   -->     2
and  -->     2
is   -->     2
currently -->     2
on   -->     2
32   -->     1
Pandemic -->     1
situation -->     1
47   -->     1
50   -->     1
did  -->     1
not  -->     1
start -->     1
pause -->     1
54   -->     1
Principal -->     1
investigator -->     1
left -->     1
the  -->     1
institut -->     1
84   -->     1
The  -->     1
was  -->     1
stopped -->     1
prematurely -->     1
due  -->     1
to   -->     1
insuf -->     1
4967 -->     1
4968 -->     1
Evolving -->     1
data -->     1
Ipatasertib -->     1
that -->     1
changes -->     1
th   -->     1
4970 -->     1
Study -->     1
part -->     1
of   -->     1
PhD  -->     1
trajectory -->     1
4981 -->     1
no   -->     1
funding -->     1
4986 -->     1
sponsor -->     1
campus -->     1
training -->     1
restrictions -->     1
Name -->     1
WhyStopped -->     1
Length --

In [45]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [51]:
stop_words = nltk.corpus.stopwords.words('english')

In [54]:
filtered_list = []

In [58]:
# Tokenize the sentence
words = word_tokenize(df_text)
for w in words:
    if w.lower() not in stop_words:
        filtered_list.append(w)
        
filtered_list

['WhyStopped',
 '32',
 'Pandemic',
 'situation',
 '47',
 'Technical',
 'problem',
 'plasma',
 'blood',
 'samples',
 'ob',
 '...',
 '50',
 'study',
 'start',
 'currently',
 'pause',
 '54',
 'Principal',
 'investigator',
 'left',
 'study',
 'institut',
 '...',
 '84',
 'study',
 'stopped',
 'prematurely',
 'due',
 'insuf',
 '...',
 '...',
 '...',
 '4967',
 'Technical',
 'problem',
 'blood',
 'plasma',
 'samples',
 'ob',
 '...',
 '4968',
 'Evolving',
 'data',
 'Ipatasertib',
 'changes',
 'th',
 '...',
 '4970',
 'Study',
 'part',
 'PhD',
 'trajectory',
 'currently',
 '...',
 '4981',
 'funding',
 '4986',
 'sponsor',
 'campus',
 'training',
 'restrictions',
 '[',
 '320',
 'rows',
 'x',
 '1',
 'columns',
 ']',
 'WhyStopped',
 '32',
 'Pandemic',
 'situation',
 '47',
 'Technical',
 'problem',
 'plasma',
 'blood',
 'samples',
 'ob',
 '...',
 '50',
 'study',
 'start',
 'currently',
 'pause',
 '54',
 'Principal',
 'investigator',
 'left',
 'study',
 'institut',
 '...',
 '84',
 'study',
 'stopped',


In [59]:
filtered_list = pd.DataFrame(filtered_list)
filtered_list.head()

Unnamed: 0,0
0,WhyStopped
1,32
2,Pandemic
3,situation
4,47


In [30]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()  
vects = vect.fit_transform(df_text)


In [32]:
td = pd.DataFrame(vects.todense()).iloc[:5]  
td.columns = vect.get_feature_names()
term_document_matrix = td.T
term_document_matrix.columns = ['Doc '+str(i) for i in range(1)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

In [33]:
term_document_matrix = term_document_matrix.sort_values(by ='total_count',ascending=False)[:25] 


In [34]:
print(term_document_matrix.drop(columns=['total_count']).head(10))

            Doc 0
whystopped      1


In [None]:
# Reduce dimensionality/complexity


In [None]:
# Remove punctuation

In [None]:
# Remove stop words

In [None]:
# Create equivalence classes(lemmatize/stem)

In [None]:
# Filter by Frequency

In [None]:
# Create the document feature matrix