In [12]:
import os
import numpy as np
import string
import scipy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
import PySimpleGUI as sg

### 1. Przygotuj duży (> 1000 elementów) zbiór dokumentów tekstowych w języku angielskim 

### 2. Określ słownik słów kluczowych potrzebny do wyznaczenia wektorów cech bag-of-words 

In [13]:
def preprocessWords(language,words):
    sno = nltk.stem.SnowballStemmer(language)
    cachedStopWords = stopwords.words(language)
    result = []
    words = words.split() #spliting line

    for w in words:
        punctuation = str.maketrans(dict.fromkeys(string.punctuation)) 
        w = w.translate(punctuation) #getting rid of punctuation
        w = w.lower() #making all letters lower
        w = sno.stem(w) #stemming
        if w not in cachedStopWords:
            result.append(w)
    return result

In [14]:
def prepareAllWords(language,directory):
    allWords = {}
    files = os.listdir(directory)
    files.sort()
    for i in range(len(files)):
        fileName = directory + files[i]
        f = open(fileName,"r", encoding="utf-8")
        for line in f:
            words = preprocessWords(language,line)
            for w in words:
                if w not in allWords:
                    allWords[w] = len(allWords) 
                    
        if (i % 200 == 0):
            print("{:.2f}".format(i/len(files)*100), end="\r")

    print("Done")
    return allWords

### 4.Zbuduj rzadką macierz wektorów cech term-by-document matrix w której wektory cech ułożone są kolumnowo Am×n = [d1|d2| . . . |dn] (m jest liczbą termów w słowniku, a n liczbą dokumentów)

In [15]:
def prepareTermByDocumentMatrix(language,directory,allWords):
    files = os.listdir(directory)
    files.sort()

    A = [[0 for _ in range (len(files))] for _ in range (len(allWords))]
    wordsCount = 0
    
    for i in range(len(files)):
        fileName = directory + files[i]
        f = open(fileName,"r", encoding="utf-8")
        for line in f:
            words = preprocessWords(language,line)
            for w in words:
                wordsCount += 1
                if w in allWords:
                    A[allWords[w]][i] += 1
                    
        for j in range (len(allWords)):
            if(A[j][i] != 0):
                A[j][i] = A[j][i]/wordsCount
        
        if (i % 200 == 0):
            print("{:.2f}".format(i/len(files)*100), end="\r")


    print("Done")
    return np.array(A)

### 5. Przetwórz wstępnie otrzymany zbiór danych mnożąc elementy bag-of-words przez inverse document frequency

IDF(w) = log N/nw ~ gdzie nw jest liczbą dokumentów, w których występuje słowo w, a N jest całkowitą liczbą dokumentów.

In [16]:
def IDF(matrix):
    N = len(matrix[0])
    idf = []
    for i in range (len(matrix)):
        nw = len(matrix[i][matrix[i]>0])        
        idf.append(np.log(N/nw))
        
    return idf

In [17]:
def changeMatrixIDF(matrix,idf):
    result = [[0 for _ in range (len(matrix[0]))] for _ in range (len(allWords))]
    for i in range (len(result)):
        for j in range (len(result[0])):
            result[i][j] = matrix[i][j] * idf[i]
            
        if (i % 200 == 0):
            print("{:.2f}".format(i/len(result)*100), end="\r")

    print("Done")
    return np.array(result)

### 6. Napisz program pozwalający na wprowadzenie zapytania (w postaci sekwencji słów) przekształcanego następnie do reprezentacji wektorowej q (bag-of-words). Program ma zwrócić k dokumentów najbardziej zbliżonych do podanego zapytania q. Użyj korelacji między wektorami jako miary podobieństwa

In [18]:
def search(query, numberOfResults, matrix):
    files = os.listdir('./wiki/')
    files.sort()
    
    textList = preprocessWords('english',query)
    text = ""  

    queryVector = [0 for _ in range (len(allWords))]

    for w in textList:
        if w in allWords:
            text += w + " "
            queryVector[allWords[w]] += 1
    
    queryVector = np.array(queryVector)
    queryNorm = np.linalg.norm(queryVector)
    
    result = []
    
    for i in range(len(matrix[0])):
        product = queryVector.T @ matrix[:,i]
        divider = queryNorm * np.linalg.norm(matrix[:,i])
        cosTheta = product / divider
        result.append((cosTheta, files[i]))
    
    result.sort(key = lambda tup: tup[0], reverse=True)
        
    return result[:numberOfResults]

In [19]:
def normalizeVec(vector):
    return vector/np.linalg.norm(vector)

In [20]:
def searchNorm(query, numberOfResults, matrix):
    files = os.listdir('./wiki/')
    files.sort()
    
    textList = preprocessWords('english',query)
    text = ""  

    queryVector = [0 for _ in range (len(allWords))]

    for w in textList:
        if w in allWords:
            text += w + " "
            queryVector[allWords[w]] += 1
    
    queryVector = np.array(queryVector)
    queryVectorNorm = normalizeVec(queryVector)
    
    result = []
    
    for i in range(len(matrix[0])):
        documentNorm = normalizeVec(matrix[:,i])
        product = queryVectorNorm.T @ documentNorm
        cosTheta = np.abs(product)
        result.append((cosTheta, files[i]))
    
    result.sort(key = lambda tup: tup[0], reverse=True)
        
    return result[:numberOfResults]

### 8. W celu usunięcia szumu z macierzy A zastosuj SVD i low rank approximation

In [21]:
def lowRankAprox(matrix,k):
    u, s, v = scipy.sparse.linalg.svds(matrix, k=k)
    return u @ np.diag(s) @ v

### 9. Porównaj działanie programu bez usuwania szumu i z usuwaniem szumu. Dla jakiej wartości k wyniki wyszukiwania są najlepsze (subiektywnie). Zbadaj wpływ przekształcenia IDF na wyniki wyszukiwania.


In [22]:
#preparation
allWords = prepareAllWords('english','./wiki/')  
A = prepareTermByDocumentMatrix('english','./wiki/',allWords)
idf = IDF(A)
matrixIDF = changeMatrixIDF(A,idf)

Done1
Done1
Done2


In [23]:
svds = []

svds.append(lowRankAprox(matrixIDF,10))
svds.append(lowRankAprox(matrixIDF,50))
svds.append(lowRankAprox(matrixIDF,100))
svds.append(lowRankAprox(matrixIDF,200))

In [24]:
sg.theme('LightGray1')
while True:
    resultButtons = []
    layout = [[sg.Input(do_not_clear=True)],
              [sg.Button('Search'),sg.Exit()]]
            
    searchWindow = sg.Window('Search').Layout(layout)
    event, query = searchWindow.Read()
    searchWindow.Close()
    if event is 'Exit' or event is None:
        break
    else:
        results = searchNorm(query[0],5, matrixIDF)
        resultButtons = []
        for doc in results:
            resultButtons.append([sg.Button(doc[1])])
        resultsWindow = sg.Window('Results for your query').Layout(resultButtons)
        openEvent = resultsWindow.Read()
        if openEvent[0] is not None:       
            f = open('wiki/' + openEvent[0], 'r', encoding="utf-8")
            sg.Popup(openEvent[0], f.read())


In [26]:
query = "australia biggest state"

print("without IDF")
results = searchNorm(query,5,A)
print("Search results for \"{}\":".format(query))
for document in results:
    print(document[1],"| Correlation:", "{:.2f}".format(document[0]))
    
print("\nwith IDF")
results = searchNorm(query,5,matrixIDF)
print("Search results for \"{}\":".format(query))
for document in results:
    print(document[1],"| Correlation:", "{:.2f}".format(document[0]))

print("\nsvds: 10,50,100,200")
for matrix in svds:
    print()
    results = search(query,3, matrix)
    print("Search results for \"{}\":".format(query))
    for document in results:
        print(document[1],"| Correlation:", "{:.2f}".format(document[0]))

without IDF
Search results for "australia biggest state":
Sovereign_state.txt | Correlation: 0.47
State.txt | Correlation: 0.46
South_Australia.txt | Correlation: 0.44
Western_Australia.txt | Correlation: 0.43
Australia.txt | Correlation: 0.39

with IDF
Search results for "australia biggest state":
Western_Australia.txt | Correlation: 0.32
Australia.txt | Correlation: 0.31
State.txt | Correlation: 0.29
Sovereign_state.txt | Correlation: 0.28
South_Australia.txt | Correlation: 0.21

svds: 10,50,100,200

Search results for "australia biggest state":
Rowing.txt | Correlation: 0.04
Malaysia.txt | Correlation: 0.04
Sydney.txt | Correlation: 0.04

Search results for "australia biggest state":
Australia.txt | Correlation: 0.30
Prime_Minister_of_Australia.txt | Correlation: 0.29
Australian_Labor_Party.txt | Correlation: 0.29

Search results for "australia biggest state":
Australia.txt | Correlation: 0.31
Western_Australia.txt | Correlation: 0.30
Prime_Minister_of_Australia.txt | Correlation: 0

In [27]:
print(idf[allWords['state']])
print(idf[allWords['australia']])
print(idf[allWords['biggest']])

1.1317978376537638
1.8127497451198908
2.684588714423212
