In [1]:
import numpy as np

In [2]:
# Read the whole MEDLINE collection into memory in one go.
with open('medline/MED.ALL', 'r') as f:
    raw_collection = f.read()

# Records are delimited by a newline followed by ".I "; the pieces are kept
# in `abstracts_1` in file order.
abstracts_1 = raw_collection.split('\n.I ')

# Within each record keep the text that follows the ".W" line, lower-cased.
abstracts = [record.split('\n.W\n')[1].lower() for record in abstracts_1]

In [3]:
import nltk
# Build the stopword list: one entry per non-empty line of smart.txt, with
# apostrophes stripped and the result Porter-stemmed.
stemmer = nltk.stem.PorterStemmer()
with open('smart.txt', 'r') as f:
    stopword_lines = f.read().split('\n')

stopwords = [
    stemmer.stem(line.replace('\'', ''))
    for line in stopword_lines
    if line  # skip blank lines
]

In [4]:
import re

def get_Words(abstract):
    """Turn a piece of text into a list of stemmed, stopword-free terms.

    The text is lower-cased and split on whitespace and on runs of the
    punctuation characters handled by the regular expression below. For each
    resulting token, apostrophes are removed, tokens that are not purely
    alphabetic (including empty ones) are dropped, the remainder is
    Porter-stemmed, and stems present in the module-level ``stopwords``
    collection are discarded.

    Parameters
    ----------
    abstract : str
        Text to tokenize.

    Returns
    -------
    list of str
        Surviving stems in their order of appearance; duplicates are kept.
    """
    stemmer = nltk.stem.PorterStemmer()
    # Build a set once per call so every membership test is a hash lookup
    # instead of a linear scan over the stopword list.
    stop_stems = frozenset(stopwords)

    words = []
    for token in re.split(r'\s+|[-().,;!?\"]+', abstract.lower()):
        token = token.replace('\'', '')
        # ''.isalpha() is False, so empty pieces produced by the split (or by
        # stripping a lone apostrophe) are dropped here as well.
        if not token.isalpha():
            continue
        stem = stemmer.stem(token)
        if stem not in stop_stems:
            words.append(stem)
    return words

In [5]:
# Raw query text; Q2 ends up holding the processed term list for the query.
query_text = " the relationship of blood and cerebrospinal fluid oxygen concentrations or partial pressures.  a method of interest is polarography."
Q2 = get_Words(query_text)
print(Q2)

['relationship', 'blood', 'cerebrospin', 'fluid', 'oxygen', 'concentr', 'partial', 'pressur', 'method', 'interest', 'polarographi']


In [6]:
# Row 0 of `data` is the query's term list; the abstracts follow in order.
data = [Q2] + [get_Words(abstract) for abstract in abstracts]

# Vocabulary of the abstracts only (the query's terms are not added here).
unique_words = set().union(*data[1:])
print(len(unique_words))
unique_words = list(unique_words)

8573


In [7]:
# tf-idf with sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer takes strings, so join every term list back into one
# space-separated document.
data_asStr = list(map(' '.join, data))

# Learn the vocabulary and compute the tf-idf weights in a single step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data_asStr)

# Column labels of X, in column order.
unique_words = vectorizer.get_feature_names_out()

# (rows, vocabulary size)
print(X.shape)


(1034, 8573)


In [8]:
# Singular value decomposition of the dense tf-idf matrix.
# numpy returns the right singular vectors as the ROWS of V.
# full_matrices=False asks for the reduced factorization: with the default
# (True) V would be a square matrix over the whole vocabulary, although
# get_k_approx only ever reads its leading rows. The singular values and the
# leading singular vectors are the same either way.
U, s, V = np.linalg.svd(X.toarray(), full_matrices=False)
print(s)

[4.88995983 3.37401632 3.18145602 ... 0.35583243 0.29032049 0.11891892]


In [9]:
def get_k_approx(k):
    """Return the rank-``k`` truncated-SVD approximation of the tf-idf matrix.

    Reads the module-level factors ``U``, ``s`` and ``V`` produced by
    ``np.linalg.svd`` and computes ``sum_{j<k} s[j] * U[:, j] * V[j, :]`` as a
    single matrix product over the leading ``k`` components, rather than
    accumulating ``k`` full-size outer products one at a time.

    Parameters
    ----------
    k : int
        Number of leading singular triplets to keep, ``0 <= k <= len(s)``.

    Returns
    -------
    numpy.matrix
        Dense approximation with one row per row of ``U`` and one column per
        column of ``V``. A matrix (not a plain ndarray) is returned on
        purpose: callers index single rows as ``A[i, :]`` and pass the result
        to a routine that needs 2-D input, and only matrix row-indexing keeps
        that second dimension.

    Raises
    ------
    IndexError
        If ``k`` is negative or exceeds the number of singular values.
    """
    if k < 0 or k > len(s):
        raise IndexError(
            f"k must be between 0 and {len(s)} (number of singular values), got {k}"
        )
    # Scale the first k columns of U by their singular values (broadcast
    # across rows), then multiply by the first k rows of V.
    X_k = (U[:, :k] * s[:k]) @ V[:k, :]
    return np.asmatrix(X_k)


In [10]:
# Low-rank approximations keeping the leading 100 and 500 singular triplets.
A_100, A_500 = (get_k_approx(rank) for rank in (100, 500))

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
# Row indices treated as the relevant set for the query; test_performance
# checks each document row index against this list.
relevant_documents = [
    80, 90, 162, 187,
    236, 237, 258, 289,
    290, 292, 293, 294,
    296, 300, 301, 303,
]

def test_performance(A_test, tol):
    """Count retrieval outcomes for the query row against all document rows.

    Row 0 of ``A_test`` is taken as the query and rows ``1..n-1`` as the
    documents. A document counts as retrieved when its cosine similarity to
    the query is strictly greater than ``tol``, and as relevant when its row
    index appears in the module-level ``relevant_documents``.

    Parameters
    ----------
    A_test : scipy sparse matrix, numpy.matrix or array-like
        Matrix with the query in row 0 and one document per following row.
    tol : float
        Similarity threshold.

    Returns
    -------
    tuple of int
        ``(true_positives, false_positives, false_negatives, true_negatives)``.
        All zeros when there are no document rows.
    """
    # Explicit type dispatch instead of a bare try/except: sparse inputs stay
    # sparse (CSR supports row slicing); everything else, including
    # numpy.matrix, becomes a plain 2-D ndarray.
    if hasattr(A_test, "tocsr"):
        rows = A_test.tocsr()
    else:
        rows = np.atleast_2d(np.asarray(A_test))

    n_docs = rows.shape[0] - 1
    if n_docs < 1:
        return 0, 0, 0, 0

    # One batched call: similarity of the query to every document row.
    sims = np.asarray(cosine_similarity(rows[:1], rows[1:])).ravel()

    # Document i lives in row i, so row indices start at 1.
    doc_ids = np.arange(1, n_docs + 1)
    is_relevant = np.isin(doc_ids, relevant_documents)
    is_retrieved = sims > tol

    true_positives = int(np.count_nonzero(is_relevant & is_retrieved))
    false_positives = int(np.count_nonzero(~is_relevant & is_retrieved))
    false_negatives = int(np.count_nonzero(is_relevant & ~is_retrieved))
    true_negatives = int(np.count_nonzero(~is_relevant & ~is_retrieved))
    return true_positives, false_positives, false_negatives, true_negatives

In [12]:
# Matrices under comparison, their display labels, and the similarity
# thresholds evaluated for each of them.
matricies = [X, A_100, A_500]
matrix_names = ['full', 'A_100', 'A_500']
tols = [0.1, 0.25, 0.55]

for idx, matrix_name in enumerate(matrix_names):
    matrix = matricies[idx]
    print(matrix_name)
    for tol in tols:
        print("for tollerance:",tol)
        tp, fp, fn, tn = test_performance(matrix, tol)
        print(tp, fp, fn, tn)
        # Precision is reported as 0 when nothing was retrieved.
        retrieved = tp + fp
        precision = tp / retrieved if retrieved > 0 else 0
        recall = tp / (tp + fn)
        print('precision:', precision)
        print('recall:', recall)
        print()
    # Four blank lines between matrices.
    print('\n\n\n')

full
for tollerance: 0.1
8 16 8 1001
precision: 0.3333333333333333
recall: 0.5

for tollerance: 0.25
3 1 13 1016
precision: 0.75
recall: 0.1875

for tollerance: 0.55
0 0 16 1017
precision: 0
recall: 0.0





A_100
for tollerance: 0.1
15 137 1 880
precision: 0.09868421052631579
recall: 0.9375

for tollerance: 0.25
12 27 4 990
precision: 0.3076923076923077
recall: 0.75

for tollerance: 0.55
3 3 13 1014
precision: 0.5
recall: 0.1875





A_500
for tollerance: 0.1
11 43 5 974
precision: 0.2037037037037037
recall: 0.6875

for tollerance: 0.25
5 5 11 1012
precision: 0.5
recall: 0.3125

for tollerance: 0.55
1 0 15 1017
precision: 1.0
recall: 0.0625







In [22]:
# Inspect one row of the tf-idf matrix: first its sparse representation,
# then each stored weight next to the term its column stands for.
vec = X[1,:]
print(vec)
for col, weight in zip(vec.indices, vec.data):
    print(unique_words[col], weight)

  (0, 1600)	0.24862625419346504
  (0, 4600)	0.4618998241996097
  (0, 2749)	0.466329236892405
  (0, 5841)	0.175171065027332
  (0, 4295)	0.37734970437268756
  (0, 3112)	0.2784605812407015
  (0, 2931)	0.06333259780664943
  (0, 2711)	0.07007515434072176
  (0, 75)	0.05115687206200372
  (0, 1375)	0.17899098371833566
  (0, 1956)	0.05158773654742772
  (0, 2756)	0.28546691402236163
  (0, 1403)	0.07849580037280529
  (0, 1880)	0.24918325446043196
  (0, 7012)	0.0490378356382711
  (0, 5275)	0.05087599392674935
  (0, 7061)	0.06598361254690573
  (0, 7079)	0.09303660158372126
  (0, 6521)	0.07495469064825185
  (0, 4336)	0.06749610888644275
  (0, 7340)	0.08533198077937452
  (0, 1913)	0.13923029062035075
  (0, 7078)	0.07698330403326828
correl 0.24862625419346504
matern 0.4618998241996097
fetal 0.466329236892405
plasma 0.175171065027332
level 0.37734970437268756
glucos 0.2784605812407015
free 0.06333259780664943
fatti 0.07007515434072176
acid 0.05115687206200372
coeffici 0.17899098371833566
determin 0.051