In [3]:
import numpy as np

In [4]:
# load in data
abstracts = []
with open('medline/MED.ALL', 'r') as f:
    # split by abstracts
    abstracts_1 = f.read().split('\n.I ')
    # remove W.
    abstracts = [abstr.split('\n.W\n')[1] for abstr in abstracts_1]
    
#normalize
abstracts = [abstr.lower() for abstr in abstracts]

In [5]:
import nltk
# get stopwords
stopwords = []
with open('smart.txt', 'r') as f:
    stopwords = f.read().split('\n')
    # remove empty strings
    stopwords = [word for word in stopwords if word]
    # remove apostrophes in words
    stopwords = [word.replace('\'', '') for word in stopwords]
    # stem words
    stemmer = nltk.stem.PorterStemmer()
    stopwords = [stemmer.stem(word) for word in stopwords]

In [6]:
import re

def get_Words(abstract):
    words = re.split(r'\s+|[-\[\](){}<>.,;:!?\'\"]+', abstract.lower())
    # remove empty strings
    words = [word for word in words if word]
    # remove apostrophes in words
    words = [word.replace('\'', '') for word in words]
    # remove words with non-alphabetic characters
    words = [word for word in words if word.isalpha()]
    # stem words
    stemmer = nltk.stem.PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    # remove stopwords
    words = [word for word in words if word not in stopwords]
    return words

In [7]:
Q2 =" the relationship of blood and cerebrospinal fluid oxygen concentrations or partial pressures.  a method of interest is polarography."
Q2 = get_Words(Q2)
print(Q2)

['relationship', 'blood', 'cerebrospin', 'fluid', 'oxygen', 'concentr', 'partial', 'pressur', 'method', 'interest', 'polarographi']


In [8]:
unique_words = set()
data = []
data.append(Q2)
for abstract in abstracts:
    words = get_Words(abstract)
    data.append(words)
    unique_words.update(words)
print(len(unique_words))
unique_words = list(unique_words)

8576


In [9]:
# tf-idf with sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(unique_words)
# encode document
data = [' '.join(words) for words in data]
X = vectorizer.transform(data)
# summarize encoded vector
print(X.shape)


(1034, 8576)


In [10]:
# singluar values of X
U, s, V = np.linalg.svd(X.toarray())
print(s)

[6.77106271 4.85606775 4.05078193 ... 0.26425233 0.21799523 0.08695066]


In [11]:
def get_k_approx(k):
    X_k = np.array(X)*0.0
    for j in range(k):
        u = (U[:,j] * s[j]).reshape(-1, 1)
        v = V[j,:].reshape(1, -1)
        X_k += u @ v
    return X_k


In [12]:
A_100 = get_k_approx(100)
A_500 = get_k_approx(500)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
relevant_documents = [80 ,90 ,162,187,236,237,258,289,290,292,293,294,296,300,301,303]

def test_performance(A_test, tol):
    Q2 = A_test[0,:]
    try:
        Q2_dense = Q2.toarray()
    except:
        Q2_dense = np.asarray(Q2)
    True_positives = 0
    False_positives = 0
    False_negatives = 0
    True_negatives = 0
    for i in range(1, A_test.shape[0]):
        try:
            A_test_i_dense = A_test[i,:].toarray()
        except:
            A_test_i_dense = np.asarray(A_test[i,:])
        sim = cosine_similarity(Q2_dense, A_test_i_dense)[0][0]
        if i in relevant_documents:
            if sim > tol:
                True_positives += 1
            else:
                False_negatives += 1
        else:
            if sim > tol:
                False_positives += 1
            else:
                True_negatives += 1
    return True_positives, False_positives, False_negatives, True_negatives

In [32]:
matricies = [X, A_100, A_500]
matrix_names = ['full', 'A_100', 'A_500']
tols = [0.1, 0.25, 0.55]

for matrix, matrix_name in zip(matricies, matrix_names):
    print(matrix_name)
    for tol in tols:
        print("for tollerance:",tol)
        True_positives, False_positives, False_negatives, True_negatives = test_performance(matrix, tol)
        print(True_positives, False_positives, False_negatives, True_negatives)
        precision = 0
        if True_positives + False_positives > 0:
            precision = True_positives / (True_positives + False_positives)
        recall = True_positives / (True_positives + False_negatives)
        print('precision:', precision)
        print('recall:', recall)
        if precision + recall > 0:
            print('f1:', 2 * precision * recall / (precision + recall))
        else:
            print('f1:', 0)
        print('accuracy:', (True_positives + True_negatives) / (True_positives + True_negatives + False_positives + False_negatives))
        print()
    print()
    print()
    print()
    print()

full
for tollerance: 0.1
12 51 4 966
precision: 0.19047619047619047
recall: 0.75
f1: 0.3037974683544304
accuracy: 0.9467570183930301

for tollerance: 0.25
4 3 12 1014
precision: 0.5714285714285714
recall: 0.25
f1: 0.34782608695652173
accuracy: 0.9854791868344628

for tollerance: 0.55
0 0 16 1017
precision: 0
recall: 0.0
f1: 0
accuracy: 0.9845111326234269





A_100
for tollerance: 0.1


TypeError: np.matrix is not supported. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html