In [78]:
import ir_datasets
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


In [79]:

# Load the dataset
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")


In [80]:

# Initialize stemmer, lemmatizer, and stop words
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function to tokenize, stem, lemmatize, remove stop words and punctuation
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    # Remove stop words and punctuation
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words and token not in punctuation]
    return ' '.join(filtered_tokens)

# Preprocess and concatenate text data for vectorization
preprocessed_texts = []
for doc in dataset.docs_iter():
    concatenated_text = ' '.join([
        preprocess_text(doc.title),
        preprocess_text(doc.summary),
        preprocess_text(doc.detailed_description),
        preprocess_text(doc.eligibility)
    ])
    preprocessed_texts.append(concatenated_text)
    if len(preprocessed_texts) >= 1000:
        break

print("Preprocessing complete.")


Preprocessing complete.


In [81]:

# Vectorize the preprocessed text data
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_texts)

# Create a DataFrame with the TF-IDF features using the sparse matrix
df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=vectorizer.get_feature_names_out(), index=[doc.doc_id for doc in dataset.docs_iter()][:1000])

print(df)


              00      000  002  003  004  005b  006  007  007a  009  ...  \
NCT00000102  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00000104  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00000105  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00000106  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00000107  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
...          ...      ...  ...  ...  ...   ...  ...  ...   ...  ...  ...   
NCT00001126  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00001127  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00001128  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00001129  0.0  0.02865  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   
NCT00001130  0.0  0.00000  0.0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  ...   

             zone  zost  zoster  zosteriform  zovirax  zucker  zyban  zyprexa  \
NCT000

In [82]:
# Second way
import ir_datasets
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")
for doc in dataset.docs_iter():
    doc # namedtuple<doc_id, title, condition, summary, detailed_description, eligibility>

In [83]:
# Define a function to print a subset of documents
def print_subset(dataset, start, end):
    for i, doc in enumerate(dataset.docs_iter()):
        if start <= i < end:
            print(f"Document ID: {doc.doc_id}")
            print(f"Title: {doc.title}")
            print(f"Condition: {doc.condition}")
            print(f"Summary: {doc.summary}")
            print(f"Detailed description: {doc.detailed_description}")
            print(f"Eligibility: {doc.eligibility}")
            print("\n")  # Add a newline for readability
        elif i >= end:
            break
print_subset(dataset, 0, 100)

Document ID: NCT00000102
Title: Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets
Condition: 
Summary: 
    
      This study will test the ability of extended release nifedipine (Procardia XL), a blood
      pressure medication, to permit a decrease in the dose of glucocorticoid medication children
      take to treat congenital adrenal hyperplasia (CAH).
    
  
Detailed description: 
    
      This protocol is designed to assess both acute and chronic effects of the calcium channel
      antagonist, nifedipine, on the hypothalamic-pituitary-adrenal axis in patients with
      congenital adrenal hyperplasia. The multicenter trial is composed of two phases and will
      involve a double-blind, placebo-controlled parallel design. The goal of Phase I is to examine
      the ability of nifedipine vs. placebo to decrease adrenocorticotropic hormone (ACTH) levels,
      as well as to begin to assess the dose-dependency of nifedipine effects. The goal of Phase II
   

In [84]:

for doc in dataset.docs_iter():
    if(len(doc.condition)!=0):
              print("condition is not empty")
              print(doc.condition)
print("finish check")

finish check


In [85]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import pos_tag


In [86]:
from nltk.tokenize import word_tokenize
from typing import List  # Import the List type from the typing module

def correct_sentence_spelling(tokens: List[str]) -> List[str]:
    spell = SpellChecker()
    misspelled = spell.unknown(tokens)
    for i, token in enumerate(tokens):
        if token in misspelled:
            corrected = spell.correction(token)
            if corrected is not None:
                tokens[i] = corrected
    return tokens

In [87]:
# Initialize stemmer, lemmatizer, and stop words
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function to tokenize, stem, lemmatize, remove stop words and punctuation
def preprocess_text(text):
    # Tokenize text
    str(text)
    tokens = word_tokenize(text.lower())
    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # # Part of speech
    # tagged_tokens = pos_tag(tokens)
    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    # #  Spell checking
    # correct_sentence_spelling(tokens)
    # Remove stop words and punctuation
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words and token not in punctuation]
    return filtered_tokens



In [88]:
# Apply preprocessing to each document in the dataset
preprocessed_data = []
counter = 0

for doc in dataset.docs_iter():
    if counter < 1000:
        preprocessed_doc = {
            'doc_id': doc.doc_id,
            'title': preprocess_text(doc.title),
            'summary': preprocess_text(doc.summary),
            'detailed_description': preprocess_text(doc.detailed_description),
            'eligibility': preprocess_text(doc.eligibility)
        }
        preprocessed_data.append(preprocessed_doc)
        counter += 1
    else:
        break

print("Preprocessing complete.")
# Now `preprocessed_data` contains the preprocessed text for the first 1000 documents


Preprocessing complete.


In [89]:
print(preprocessed_data[0])

{'doc_id': 'NCT00000102', 'title': ['congenit', 'adren', 'hyperplasia', 'calcium', 'channel', 'therapeut', 'target'], 'summary': ['thi', 'studi', 'test', 'abil', 'extend', 'releas', 'nifedipin', 'procardia', 'xl', 'blood', 'pressur', 'medic', 'permit', 'decreas', 'dose', 'glucocorticoid', 'medic', 'child', 'take', 'treat', 'congenit', 'adren', 'hyperplasia', 'cah'], 'detailed_description': ['thi', 'protocol', 'design', 'ass', 'acut', 'chronic', 'effect', 'calcium', 'channel', 'antagonist', 'nifedipin', 'hypothalamic-pituitary-adren', 'axi', 'patient', 'congenit', 'adren', 'hyperplasia', 'multicent', 'trial', 'compos', 'two', 'phase', 'involv', 'double-blind', 'placebo-control', 'parallel', 'design', 'goal', 'phase', 'examin', 'abil', 'nifedipin', 'vs.', 'placebo', 'decreas', 'adrenocorticotrop', 'hormon', 'acth', 'level', 'well', 'begin', 'ass', 'dose-depend', 'nifedipin', 'effect', 'goal', 'phase', 'ii', 'evalu', 'long-term', 'effect', 'nifedipin', 'attenu', 'acth', 'releas', 'nifedip

In [90]:
print(preprocessed_data[1])

{'doc_id': 'NCT00000104', 'title': ['doe', 'lead', 'burden', 'alter', 'neuropsycholog', 'develop'], 'summary': ['inner', 'citi', 'child', 'increas', 'risk', 'lead', 'overburden', 'thi', 'turn', 'affect', 'cognit', 'function', 'howev', 'underli', 'neuropsycholog', 'effect', 'lead', 'overburden', 'age-specif', 'effect', 'well', 'delin', 'thi', 'studi', 'part', 'larger', 'studi', 'effect', 'lead', 'overburden', 'develop', 'attent', 'memori', 'larger', 'studi', 'use', 'multi-model', 'approach', 'studi', 'effect', 'lead', 'overburden', 'effect', 'includ', 'event-rel', 'potenti', 'erp', 'electrophysiolog', 'measur', 'attent', 'memori', 'studi', 'everi', 'eight', 'month', 'total', 'three', 'session', 'subject', 'complet', 'erp', 'measur', 'attent', 'memori', 'requir', 'watch', 'variou', 'comput', 'imag', 'wear', 'scalp', 'electrod', 'record', '11', 'site', 'thi', 'test', 'go', 'crc', '30', 'lead', 'overburden', 'child', 'recruit', 'larger', 'studi', 'particip', 'erp', 'studi', 'crc', '30', 'c

In [91]:
print(preprocessed_data[133])

{'doc_id': 'NCT00000248', 'title': ['dose', 'respons', 'trial', 'pergolid', 'cocain', 'depend', '1'], 'summary': ['purpos', 'thi', 'studi', 'evalu', 'efficaci', 'safeti', 'pergolid', 'treatment', 'cocain', 'depend'], 'detailed_description': [], 'eligibility': ['plea', 'contact', 'site', 'inform']}


In [92]:
for element in preprocessed_data[:10]:  # Slicing to get the first 10 elements
    print(element)

{'doc_id': 'NCT00000102', 'title': ['congenit', 'adren', 'hyperplasia', 'calcium', 'channel', 'therapeut', 'target'], 'summary': ['thi', 'studi', 'test', 'abil', 'extend', 'releas', 'nifedipin', 'procardia', 'xl', 'blood', 'pressur', 'medic', 'permit', 'decreas', 'dose', 'glucocorticoid', 'medic', 'child', 'take', 'treat', 'congenit', 'adren', 'hyperplasia', 'cah'], 'detailed_description': ['thi', 'protocol', 'design', 'ass', 'acut', 'chronic', 'effect', 'calcium', 'channel', 'antagonist', 'nifedipin', 'hypothalamic-pituitary-adren', 'axi', 'patient', 'congenit', 'adren', 'hyperplasia', 'multicent', 'trial', 'compos', 'two', 'phase', 'involv', 'double-blind', 'placebo-control', 'parallel', 'design', 'goal', 'phase', 'examin', 'abil', 'nifedipin', 'vs.', 'placebo', 'decreas', 'adrenocorticotrop', 'hormon', 'acth', 'level', 'well', 'begin', 'ass', 'dose-depend', 'nifedipin', 'effect', 'goal', 'phase', 'ii', 'evalu', 'long-term', 'effect', 'nifedipin', 'attenu', 'acth', 'releas', 'nifedip

In [93]:
for element in preprocessed_data[:3]:  # Slicing to get the first 3 elements
    print(element)

{'doc_id': 'NCT00000102', 'title': ['congenit', 'adren', 'hyperplasia', 'calcium', 'channel', 'therapeut', 'target'], 'summary': ['thi', 'studi', 'test', 'abil', 'extend', 'releas', 'nifedipin', 'procardia', 'xl', 'blood', 'pressur', 'medic', 'permit', 'decreas', 'dose', 'glucocorticoid', 'medic', 'child', 'take', 'treat', 'congenit', 'adren', 'hyperplasia', 'cah'], 'detailed_description': ['thi', 'protocol', 'design', 'ass', 'acut', 'chronic', 'effect', 'calcium', 'channel', 'antagonist', 'nifedipin', 'hypothalamic-pituitary-adren', 'axi', 'patient', 'congenit', 'adren', 'hyperplasia', 'multicent', 'trial', 'compos', 'two', 'phase', 'involv', 'double-blind', 'placebo-control', 'parallel', 'design', 'goal', 'phase', 'examin', 'abil', 'nifedipin', 'vs.', 'placebo', 'decreas', 'adrenocorticotrop', 'hormon', 'acth', 'level', 'well', 'begin', 'ass', 'dose-depend', 'nifedipin', 'effect', 'goal', 'phase', 'ii', 'evalu', 'long-term', 'effect', 'nifedipin', 'attenu', 'acth', 'releas', 'nifedip

In [94]:
data_map = {}

# Iterate through each element in preprocessed_data
for element in preprocessed_data:
    doc_id = element['doc_id']
    # Store the other data (excluding doc_id) as the value
    data_map[doc_id] = {
        'title': element['title'],
        'summary': element['summary'],
        'detailed_description': element['detailed_description'],
        'eligibility': element['eligibility']
    }

In [95]:
corpus=data_map
documents = list(corpus.values())

# TODO : ADD YOUR OWN TOKENIZER & PREPROCESSOR !
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents
tfidf_matrix = vectorizer.fit_transform(documents)

df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=corpus.keys())

df

AttributeError: 'dict' object has no attribute 'lower'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus=data_map
documents = list(corpus.values())


from nltk.corpus import stopwords
import string

# Define a preprocessor function
def preprocess_text(text):
    # Convert to lowercase
    str(text)
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Define a tokenizer function
def tokenize_text(text):
    return word_tokenize(text)

# Now, pass these functions to the TfidfVectorizer  tokenizer=tokenize_text, preprocessor=preprocess_text
vectorizer = TfidfVectorizer()


# Fit the vectorizer to the documents
tfidf_matrix = vectorizer.fit_transform(documents)

df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=corpus.keys())

df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming data_map is a dictionary where values are the documents
corpus = data_map
documents = [str(doc) for doc in corpus.values()]  # Convert all values to strings

# TODO: ADD YOUR OWN TOKENIZER & PREPROCESSOR!
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Create a DataFrame with the TF-IDF results
df = pd.DataFrame(tfidf_matrix, columns=vectorizer.get_feature_names_out(), index=corpus.keys())

print(df)


In [None]:
csv_file_path = 'dataset.csv'
df = pd.read_csv(csv_file_path)
df

In [None]:
print(df.head())


In [None]:
print(df.tail())


In [None]:
import ir_datasets
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the dataset
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")

# Initialize stemmer, lemmatizer, and stop words
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function to tokenize, stem, lemmatize, remove stop words and punctuation
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    # Remove stop words and punctuation
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words and token not in punctuation]
    return ' '.join(filtered_tokens)

# Preprocess and concatenate text data for vectorization
preprocessed_texts = []
for doc in dataset.docs_iter():
    concatenated_text = ' '.join([
        preprocess_text(doc.title),
        preprocess_text(doc.summary),
        preprocess_text(doc.detailed_description),
        preprocess_text(doc.eligibility)
    ])
    preprocessed_texts.append(concatenated_text)
    if len(preprocessed_texts) >= 1000:
        break

print("Preprocessing complete.")

# Vectorize the preprocessed text data
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_texts)

# Create a DataFrame with the TF-IDF features using the sparse matrix
df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=vectorizer.get_feature_names_out(), index=[doc.doc_id for doc in dataset.docs_iter()][:1000])

print(df)
