In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer to the medical transcript and transform it into BoW representation
bow_matrix = vectorizer.fit_transform(medical_transcript)

# Convert the BoW matrix to an array for better readability
bow_array = bow_matrix.toarray()

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Print the BoW representation
print("Bag of Words (BoW) Representation:")
print("Features:", feature_names)
print("BoW Matrix:\n", bow_array)


Bag of Words (BoW) Representation:
Features: ['and' 'antibiotics' 'as' 'breath' 'bronchitis' 'confirmed' 'cough'
 'diagnosis' 'elevated' 'examination' 'inhaler' 'of' 'patient' 'physical'
 'prescribed' 'presented' 'revealed' 'shortness' 'symptoms' 'temperature'
 'wheezing' 'with']
BoW Matrix:
 [[1 0 0 1 0 0 1 0 0 0 0 2 1 0 0 1 0 1 1 0 0 1]
 [1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1 1 0]
 [1 1 1 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0]]


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer to the medical transcript and transform it into TF-IDF representation
tfidf_matrix = tfidf_vectorizer.fit_transform(medical_transcript)

# Convert the TF-IDF matrix to an array for better readability
tfidf_array = tfidf_matrix.toarray()

# Get the feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the TF-IDF representation
print("TF-IDF Representation:")
print("Features:", feature_names)
print("TF-IDF Matrix:\n", tfidf_array)

TF-IDF Representation:
Features: ['and' 'antibiotics' 'as' 'breath' 'bronchitis' 'confirmed' 'cough'
 'diagnosis' 'elevated' 'examination' 'inhaler' 'of' 'patient' 'physical'
 'prescribed' 'presented' 'revealed' 'shortness' 'symptoms' 'temperature'
 'wheezing' 'with']
TF-IDF Matrix:
 [[0.17531933 0.         0.         0.29684142 0.         0.
  0.29684142 0.         0.         0.         0.         0.59368285
  0.29684142 0.         0.         0.29684142 0.         0.29684142
  0.29684142 0.         0.         0.29684142]
 [0.2344005  0.         0.         0.         0.         0.
  0.         0.         0.39687454 0.39687454 0.         0.
  0.         0.39687454 0.         0.         0.39687454 0.
  0.         0.39687454 0.39687454 0.        ]
 [0.21786941 0.36888498 0.36888498 0.         0.36888498 0.36888498
  0.         0.36888498 0.         0.         0.36888498 0.
  0.         0.         0.36888498 0.         0.         0.
  0.         0.         0.         0.        ]]


In [3]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Tokenize the medical transcript
tokenized_transcript = [word_tokenize(sentence.lower()) for sentence in medical_transcript]

# Train Word2Vec model on the tokenized medical transcript
word2vec_model = Word2Vec(tokenized_transcript, vector_size=100, window=5, min_count=1, sg=1)

# Print the vector representation of each word
print("Word Embeddings:")
for word in word2vec_model.wv.index_to_key:
    print(word, ":", word2vec_model.wv[word])


ModuleNotFoundError: No module named 'gensim'

In [7]:
import gensim.downloader as api
import numpy as np

# Load pre-trained GloVe word vectors
glove_model = api.load("glove-wiki-gigaword-100")

# Example medical transcript
medical_transcript = "Patient presented with symptoms of fever and cough. Diagnosis revealed pneumonia."

# Tokenize the medical transcript
tokens = medical_transcript.lower().split()

# Initialize an empty list to store embeddings
embeddings = []

# Iterate through tokens and get GloVe embeddings
for token in tokens:
    try:
        embedding = glove_model[token]
        embeddings.append(embedding)
    except KeyError:
        # Handle out-of-vocabulary words
        print(f"Word '{token}' not found in GloVe vocabulary.")

# Convert the list of embeddings to a numpy array
embeddings = np.array(embeddings)

# Calculate the mean embedding
mean_embedding = np.mean(embeddings, axis=0)

print("Mean Embedding:", mean_embedding)


Word 'cough.' not found in GloVe vocabulary.
Word 'pneumonia.' not found in GloVe vocabulary.
Mean Embedding: [-1.02584451e-01  7.92255998e-03  3.10465693e-01  1.03223354e-01
 -1.36028349e-01  2.85505563e-01  2.39101991e-01  1.67928442e-01
 -2.67617822e-01 -1.03004426e-01 -2.67057031e-01 -5.42732747e-03
  4.66193825e-01  2.11903751e-01  6.56169236e-01 -9.69854370e-02
  6.66448921e-02 -2.61384994e-01 -1.22886449e-01 -1.60737187e-01
 -2.75147647e-01 -3.15886050e-01 -2.05300465e-01  1.46979541e-01
  4.58371118e-02  1.38336673e-01  4.18594331e-01 -3.73859525e-01
 -1.19682997e-01 -2.34338894e-01  1.60960123e-01  3.93518895e-01
 -1.06725112e-01  1.61731556e-01 -1.46150663e-01  1.59684867e-01
  1.51260555e-01  1.25550210e-01 -2.95739532e-01  6.16921037e-02
 -3.28458577e-01  1.24426335e-01  3.54154408e-02 -4.29866672e-01
  6.43779933e-02  1.58677876e-01  9.24467742e-02 -2.92243838e-01
 -3.62777710e-03 -4.24950004e-01  3.06643337e-01 -1.36291683e-01
  4.96131144e-02  8.28071177e-01  1.04464954e

In [8]:
import gensim.downloader as api
import numpy as np

# Load pre-trained FastText word vectors
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

# Example medical transcript
medical_transcript = "Patient presented with symptoms of fever and cough. Diagnosis revealed pneumonia."

# Tokenize the medical transcript
tokens = medical_transcript.lower().split()

# Initialize an empty list to store embeddings
embeddings = []

# Iterate through tokens and get FastText embeddings
for token in tokens:
    try:
        embedding = fasttext_model[token]
        embeddings.append(embedding)
    except KeyError:
        # Handle out-of-vocabulary words
        print(f"Word '{token}' not found in FastText vocabulary.")

# Convert the list of embeddings to a numpy array
embeddings = np.array(embeddings)

# Calculate the mean embedding
mean_embedding = np.mean(embeddings, axis=0)

print("Mean Embedding:", mean_embedding)

Word 'cough.' not found in FastText vocabulary.
Word 'pneumonia.' not found in FastText vocabulary.
Mean Embedding: [-1.15479492e-02  1.09176766e-02  6.12921081e-03  2.10567117e-02
 -1.70978010e-02 -1.76176019e-02 -7.24244025e-03 -1.05920002e-01
  1.07433334e-04  5.74176665e-03 -2.19144449e-02 -4.38010879e-02
  4.31474783e-02 -3.20506990e-02 -1.56641100e-02 -1.58629343e-02
  7.13680312e-02  1.67137105e-02  8.18797722e-02  1.85287651e-02
 -2.26281881e-02 -5.41232247e-03 -5.98284416e-03  7.47917742e-02
 -1.34712793e-02 -1.67029221e-02 -2.20218785e-02  3.46966349e-02
  7.08832443e-02  2.07872465e-02 -8.33849981e-03  2.12269053e-02
  1.70863513e-02 -4.29130457e-02  1.48584531e-03  2.18419880e-02
 -3.90627645e-02  1.13719329e-02  2.87994351e-02  1.65983085e-02
 -6.79734442e-03 -3.78390364e-02 -4.23700130e-03 -4.55022231e-03
  1.62924430e-03  1.81227550e-02  2.74424013e-02  1.77063867e-02
 -1.49371773e-02  3.79516929e-03  1.16935940e-02 -1.09610008e-02
 -2.04605982e-02 -2.99371779e-02 -5.161

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Define the range of n-grams (e.g., 2-grams to 5-grams)
min_n = 2
max_n = 5

# Initialize the CountVectorizer with character n-grams
vectorizer = CountVectorizer(analyzer='char', ngram_range=(min_n, max_n))

# Fit the vectorizer to the medical transcript and transform it into character n-gram representation
char_ngram_matrix = vectorizer.fit_transform(medical_transcript)

# Convert the character n-gram matrix to an array for better readability
char_ngram_array = char_ngram_matrix.toarray()

# Get the feature names (character n-grams)
feature_names = vectorizer.get_feature_names_out()

# Print the character n-gram representation
print("Character n-gram Representation:")
print("Features:", feature_names)
print("Character n-gram Matrix:\n", char_ngram_array)


Character n-gram Representation:
Features: [' a' ' an' ' and' ' and ' ' ant' ' anti' ' as' ' as ' ' as b' ' b' ' br'
 ' bre' ' brea' ' bro' ' bron' ' c' ' co' ' con' ' conf' ' cou' ' coug'
 ' e' ' el' ' ele' ' elev' ' ex' ' exa' ' exam' ' i' ' in' ' inh' ' inha'
 ' o' ' of' ' of ' ' of b' ' of c' ' p' ' pr' ' pre' ' pres' ' r' ' re'
 ' rev' ' reve' ' s' ' sh' ' sho' ' shor' ' sy' ' sym' ' symp' ' t' ' te'
 ' tem' ' temp' ' w' ' wh' ' whe' ' whee' ' wi' ' wit' ' with' ', ' ', p'
 ', pr' ', pre' 'ag' 'agn' 'agno' 'agnos' 'al' 'al ' 'al e' 'al ex' 'ale'
 'aled' 'aled ' 'aler' 'aler.' 'am' 'ami' 'amin' 'amina' 'an' 'and' 'and '
 'and i' 'and s' 'and w' 'ant' 'anti' 'antib' 'as' 'as ' 'as b' 'as br'
 'at' 'ate' 'ated' 'ated ' 'ath' 'ath.' 'ati' 'atie' 'atien' 'atio'
 'ation' 'atu' 'atur' 'ature' 'be' 'bed' 'bed ' 'bed a' 'bi' 'bio' 'biot'
 'bioti' 'br' 'bre' 'brea' 'breat' 'bro' 'bron' 'bronc' 'ca' 'cal' 'cal '
 'cal e' 'ch' 'chi' 'chit' 'chiti' 'co' 'con' 'conf' 'confi' 'cou' 'coug'
 'coug

In [10]:
import re
from collections import Counter

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Concatenate all sentences into a single string
text = ' '.join(medical_transcript)

# Define the number of iterations for BPE algorithm
num_iterations = 10

# Perform Byte Pair Encoding
for _ in range(num_iterations):
    # Calculate character frequencies
    char_freq = Counter(text)
    
    # Find the most common character pair
    most_common_pair = max(char_freq, key=char_freq.get)
    
    # Replace the most common character pair with a new symbol
    new_symbol = most_common_pair.replace(' ', '_')
    text = re.sub(re.escape(most_common_pair), new_symbol, text)

# Print the result of Byte Pair Encoding
print("Byte Pair Encoding Result:")
print(text)


Byte Pair Encoding Result:
Patient_presented_with_symptoms_of_cough_and_shortness_of_breath._Physical_examination_revealed_elevated_temperature_and_wheezing._Diagnosis_confirmed_as_bronchitis,_prescribed_antibiotics_and_inhaler.


In [11]:
from sklearn.feature_extraction.text import HashingVectorizer

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Initialize the HashingVectorizer
vectorizer = HashingVectorizer(n_features=1000, alternate_sign=False)

# Transform the medical transcript into a hashed vector representation
hashed_vector = vectorizer.transform(medical_transcript)

# Convert the hashed vector to an array for better readability
hashed_array = hashed_vector.toarray()

# Print the hashed vector representation
print("Hashed Vector Representation:")
print(hashed_array)


Hashed Vector Representation:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Tokenize the medical transcript
tokenized_transcript = [word_tokenize(sentence.lower()) for sentence in medical_transcript]

# Tag each tokenized sentence with a unique ID
tagged_data = [TaggedDocument(words=words, tags=[str(i)]) for i, words in enumerate(tokenized_transcript)]

# Initialize and train the Doc2Vec model
model = Doc2Vec(vector_size=100, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Infer vectors for the medical transcript
inferred_vector = model.infer_vector(tokenized_transcript[0])

# Print the inferred vector
print("Inferred Vector for the First Sentence:")
print(inferred_vector)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siva7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Inferred Vector for the First Sentence:
[-1.1136020e-02 -4.8494600e-03  1.6627514e-03  7.4614370e-03
  6.6040161e-06 -7.5934301e-03 -5.9609051e-04  2.8972214e-03
 -8.3302148e-03  1.2111573e-03  2.9504893e-03 -1.8201616e-03
 -6.3358005e-03 -7.2267842e-03  3.1805965e-03 -3.8422627e-04
  4.0237317e-03  3.5190799e-03 -7.6869172e-03 -6.6963811e-03
  5.5732892e-04 -2.4530129e-03 -8.0813887e-04  9.1940360e-03
 -5.4831645e-03  2.4649317e-03 -1.3437697e-02  1.5908031e-03
 -9.8421471e-04 -1.8603043e-03  1.5207157e-04  3.4204770e-03
 -2.6636729e-03 -4.3324213e-03 -8.9364303e-03  6.1663757e-03
 -1.4795577e-03 -2.2712150e-03 -9.2690848e-03 -4.3508477e-04
  1.0315078e-02  2.3778703e-03  3.6113393e-03 -4.3615983e-03
  2.5333001e-03 -2.8603885e-03  2.1185563e-03  3.3065800e-03
  9.8981638e-04 -8.1872474e-03 -4.1686646e-03 -4.0235552e-03
 -1.0311445e-02 -9.6029993e-03 -5.7981513e-03  1.1725518e-02
  4.0487880e-03  5.4513095e-03 -4.0713158e-03  2.8072281e-03
  3.2607757e-03  1.1349788e-02  9.6196206e-03

In [13]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Tokenize the medical transcript
tokenized_transcript = [word_tokenize(sentence.lower()) for sentence in medical_transcript]

# Create a dictionary from the tokenized transcript
dictionary = Dictionary(tokenized_transcript)

# Create a bag of words representation of the transcript
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_transcript]

# Initialize and train the LDA model
lda_model = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=2, passes=10)

# Print the topics and their top words
print("LDA Topics:")
for topic_id, topic_words in lda_model.print_topics():
    print(f"Topic {topic_id + 1}: {topic_words}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siva7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LDA Topics:
Topic 1: 0.075*"." + 0.075*"and" + 0.074*"temperature" + 0.074*"revealed" + 0.074*"wheezing" + 0.074*"elevated" + 0.074*"examination" + 0.074*"physical" + 0.026*"of" + 0.025*"shortness"
Topic 2: 0.076*"and" + 0.076*"." + 0.076*"of" + 0.045*"as" + 0.045*"antibiotics" + 0.045*"bronchitis" + 0.045*"," + 0.045*"prescribed" + 0.045*"diagnosis" + 0.045*"inhaler"
