In [13]:
import pandas as pd
import re

In [4]:
df = pd.read_csv ('./clinical_trial_data.csv')
print(df.head())

     NCTNumber                          LeadSponsorName  \
0  NCT06036238  University of California, San Francisco   
1  NCT05882916                            Sue Napierala   
2  NCT05862857  University of California, San Francisco   
3  NCT05845619  University of California, San Francisco   
4  NCT05842122            Fred Hutchinson Cancer Center   

                             InterventionDescription  \
0  The Healthy Living Intervention (HLI) is a bri...   
1  Oral fluid-based HIV self-test kits for second...   
2  Patrons and employees of drinking venues that ...   
3  The pilot intervention will include the follow...   
4  Services delivered: 1) behavioral HIV risk ass...   

  BaselineMeasurePopulationDescription  \
0                                  NaN   
1                                  NaN   
2                                  NaN   
3                                  NaN   
4                                  NaN   

                  DesignInterventionModelDescription  \

In [72]:
df_description = df[['NCTNumber','InterventionDescription']]
print(df_description.head())

     NCTNumber                            InterventionDescription
0  NCT06036238  The Healthy Living Intervention (HLI) is a bri...
1  NCT05882916  Oral fluid-based HIV self-test kits for second...
2  NCT05862857  Patrons and employees of drinking venues that ...
3  NCT05845619  The pilot intervention will include the follow...
4  NCT05842122  Services delivered: 1) behavioral HIV risk ass...


In [73]:
df_description.shape

(160, 2)

In [74]:
df_description.isnull().sum()

NCTNumber                   0
InterventionDescription    12
dtype: int64

In [75]:
df_description_clean = df_description.dropna()
df_description_clean.shape

(148, 2)

In [76]:
df_description_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 0 to 159
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   NCTNumber                148 non-null    object
 1   InterventionDescription  148 non-null    object
dtypes: object(2)
memory usage: 3.5+ KB


In [77]:
df_description_clean.loc[:,'InterventionDescription'] = df_description_clean['InterventionDescription'].astype('str')
df_description_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 0 to 159
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   NCTNumber                148 non-null    object
 1   InterventionDescription  148 non-null    object
dtypes: object(2)
memory usage: 3.5+ KB


In [78]:
## Identifying Noise through Regular expression

RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity (text, min_len=10):
    if text == None or len(text) < min_len:
        return 0
    else:
        #return bool(RE_SUSPICIOUS.search(text))
        return len(RE_SUSPICIOUS.findall(text))/len(text)

In [79]:
df_description_clean.loc[:,'impurity'] = df_description_clean.loc[:,'InterventionDescription'].apply(impurity,min_len=10)
print(df_description_clean.head())


df_description_clean[['InterventionDescription', 'impurity']].sort_values(by='impurity', ascending=False).head(3)

     NCTNumber                            InterventionDescription  impurity
0  NCT06036238  The Healthy Living Intervention (HLI) is a bri...  0.000000
1  NCT05882916  Oral fluid-based HIV self-test kits for second...  0.000000
2  NCT05862857  Patrons and employees of drinking venues that ...  0.000000
3  NCT05845619  The pilot intervention will include the follow...  0.001138
4  NCT05842122  Services delivered: 1) behavioral HIV risk ass...  0.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_description_clean.loc[:,'impurity'] = df_description_clean.loc[:,'InterventionDescription'].apply(impurity,min_len=10)


Unnamed: 0,InterventionDescription,impurity
8,Tu'Washindi consists of 3 primary components:\...,0.010142
126,When a participant comes for a regular clinic ...,0.00565
128,Economic incentives are given to the study par...,0.004695


In [107]:
def clean(text):
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    # remove brackets
    text = re.sub(r'[:;()]', '', text)
    # remove numbers
    text = re.sub(r'[0-9]','', text)
    return text.strip()

In [123]:
df_description_clean.loc[:,'clean_text'] = df_description_clean.loc[:,'InterventionDescription'].map(clean)
print(df_description_clean.head())
df_description_clean.info()

     NCTNumber                            InterventionDescription  impurity  \
0  NCT06036238  The Healthy Living Intervention (HLI) is a bri...  0.000000   
1  NCT05882916  Oral fluid-based HIV self-test kits for second...  0.000000   
2  NCT05862857  Patrons and employees of drinking venues that ...  0.000000   
3  NCT05845619  The pilot intervention will include the follow...  0.001138   
4  NCT05842122  Services delivered: 1) behavioral HIV risk ass...  0.000000   

                                          clean_text  
0  The Healthy Living Intervention HLI is a brief...  
1  Oral fluid-based HIV self-test kits for second...  
2  Patrons and employees of drinking venues that ...  
3  The pilot intervention will include the follow...  
4  Services delivered  behavioral HIV risk assess...  
<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 0 to 159
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   ------------

In [124]:
df_description_clean.to_csv("Intervention.csv",index=False)

In [103]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [122]:
count_para_vectorizer = TfidfVectorizer(stop_words=stopwords, min_df=5,max_df=0.7)

count_para_vectors = count_para_vectorizer.fit_transform(df_description_clean["clean_text"])

InvalidParameterError: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'go', 'our', 'just', 'during', 'where', 'also', 'being', 'i', 'eleven', 'across', 'your', 'always', '’ve', 'own', 'herein', 'seemed', 'do', 'amount', 'beforehand', 'either', 'eight', 'both', 'fifty', '’m', 'call', 'onto', 'my', 'so', 'sometimes', 'then', 'by', 'top', 'very', 'for', 'take', 'otherwise', 'somehow', 'did', 'it', 'yet', 'using', 'everywhere', 'with', 'please', "'d", 'show', 'there', 'mostly', 'yours', 'done', 'something', 'per', 'them', 'put', 'together', 'twelve', '‘re', 'him', 'me', 'each', 'due', 'was', 'front', 'would', 'whether', 'n‘t', 'nevertheless', 'her', 'among', 'part', 'as', 'several', 'serious', 'about', 'himself', 'everything', 'ten', 'thereafter', 'thence', 'various', "'ve", 'myself', 'this', 'have', 'hereby', 'any', 'we', 'beyond', 'sixty', 'am', 'afterwards', 'will', 'therefore', 'almost', 'three', 'even', 'be', 'too', 'or', 'most', 'must', 'until', 'only', 'they', 'below', 'make', 'third', 'such', 'thereby', 'however', 'mine', 'which', 'in', 'really', 'every', 'whereas', 'sometime', 'first', 'four', 'except', 'least', 'anything', 'may', 'that', 'whereafter', 'say', 'towards', 'who', 'many', 'off', 'unless', 'whatever', 'amongst', 'nowhere', 'of', 'others', 'hereupon', 'cannot', "'re", 'n’t', 'forty', 'can', 'alone', 'indeed', 'she', 'and', 'one', 'give', '‘ve', 'again', 'his', 'between', 'a', 'when', "'s", 'whole', 'full', 'does', 'throughout', 'ca', 'after', 'nor', 'he', 'out', "'ll", 'hence', 'elsewhere', 'hundred', 'whom', 'doing', 'perhaps', 'empty', 'two', 'the', 'anyway', 'hereafter', 'nine', 'someone', 'is', 'while', 'against', '‘ll', 'could', 'not', 'some', 'side', 'whenever', 'on', 'seem', 'become', 'whereupon', 'already', 'us', 'here', 'those', 'few', 'over', 'because', "n't", 'up', 'through', 'are', 'down', 'before', 'no', 'keep', 'wherein', 'noone', 'well', 'everyone', 'via', 'herself', 'next', 'nobody', 'seems', 'though', '’s', 'upon', 'thus', '‘s', 'becomes', 'still', 'more', 'nothing', 'since', 'within', 'thereupon', 'move', 'somewhere', 'six', 'should', 'an', 'these', 'fifteen', 'back', 'quite', 'last', 'around', 'further', 'formerly', 'yourself', 'along', '‘m', 'became', 'whence', 'at', 'whoever', 'how', 'under', 'their', 'been', 'rather', 'wherever', '‘d', 'ourselves', 'themselves', 'get', 'therein', 'all', 'anywhere', 're', 'without', '’re', 'although', 'now', 'once', 'five', 'whereby', 'another', 'else', 'hers', 'moreover', 'whither', 'yourselves', 'whose', 'into', 'much', 'above', 'other', 'beside', 'to', 'made', 'why', 'namely', 'seeming', "'m", 'neither', 'bottom', 'but', 'anyhow', 'might', 'none', 'used', 'former', 'if', 'latter', 'were', 'name', 'its', 'behind', 'regarding', 'never', 'latterly', 'same', 'has', 'toward', 'anyone', 'less', 'itself', 'than', 'often', 'see', '’d', 'enough', '’ll', 'had', 'thru', 'ever', 'meanwhile', 'besides', 'you', 'twenty', 'from', 'ours', 'becoming', 'what'} instead.