In [364]:
import wikipediaapi as wa
import os.path
import string
import scipy.sparse as sparse
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
import json

# nltk.download('punkt')
# nltk.download('stopwords')

wiki_wiki = wa.Wikipedia('en')
max_depth = 2
res_dir = "res"
dir = res_dir + os.sep + "stoner"
ps = PorterStemmer()


def valid_title(title):
    words = {"Wikipedia", "Help", "Category", ":", "\"", "/", "?", "List", "*", "<", ">", "|"}
    for word in words:
        if word in title:
            return False
    return True


def visit_links(page, rec_level):
    if rec_level > max_depth:
        return

    links = page.links
    for key in sorted(links.keys()):
        title = links[key].title
        if valid_title(title):
            visit_page(title, rec_level + 1)
            

def print_categorymembers(categorymembers, level=0, max_level=1):
    for c in categorymembers.values():
        print("%s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
        if c.ns == wa.Namespace.CATEGORY and level < max_level:
            print_categorymembers(c.categorymembers, level=level + 1, max_level=max_level)


def visit_page(page, rec_level):
    
    page_py = wiki_wiki.page(page)
    file_name = dir + page_py.title + ".txt"
    
    if os.path.exists(file_name):
        return
    
    words = page_py.text.split()[:200]
    if len(words) > 100:
        
        with open(dir + os.sep + page_py.title + ".txt", 'w+', encoding="utf-8") as file:
            file.write(page_py.title+"\n")
            for word in words:
                file.write(word+' ')
        visit_links(page_py, rec_level)


def get_files():
    categories = wiki_wiki.page("Category:Stoner rock musical groups")

    for member in categories.categorymembers:
        visit_page(member, 0)

## Dane

Aby pozyskać dokumenty wykorzystano Wikipedia API. Za jego pomocą pobrano około 50 tysięcy artykułów z Wikipedii (zapisano pierwsze 200 słów każdego z artykułów wraz z ich tytułem). Artykuły pobierano w następujący sposób: rozpoczęto od listy zespołów z kategorii Stoner rock musical groups, a następnie rekurencyjnie w każdym arykule wchodzono do kolejnych odnośników do artykułów Wikipedii. Ominięto artykuły mające mało treści, a także takie, które były kolejnymi listami. Uzyskane w ten sposób dokumenty nie są spójne tematycznie. Z maksymalnym poziomem zagłębienia rekurencji równym 2 uzyskano już różnorodną tematykę otrzymanych tekstów.

Link do wykorzystanego API:
https://github.com/martin-majlis/Wikipedia-API

In [365]:
def get_terms(dir):
    terms = set({})
    document_count = 0
    documents_dict = {}
    i = 0
    for subdir, dirs, files in os.walk(dir):
        for file in files:
            documents_dict[file] = i
            i += 1
            file_path = dir + os.sep + file
            document_count+=1
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
                words = text_to_words(text)
                for word in words:
                    terms.add(word)
    return np.sort(list(terms)), documents_dict

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

def text_to_words(text):
    text = clear_text(text)
    words = word_tokenize(text)
    words = remove_stop_words(words)
    return words


def clear_text(text):
    text = "".join([c for c in text if c not in string.punctuation])
    return text.lower()


def remove_stop_words(words):
    stop_words = set(stopwords.words('english'))
    words = [ps.stem(w) for w in words if (w not in stop_words and is_ascii(w))]
    return words

Powyższy kod odpowiada za wstępne przetworzenie dokumentów i uzyskanie słownika termów. Słownik ten został uzyskany jako unia wszystkich słów występujących we wszystkich dokumentach. Usunięto znaki interpunkcyjne, wszystkie litery zamieniono na małe i zastosowano stemming (PorterStemmer z bilioteki nltk). Usunięto takze stop words (listę stop words uzyskano z biblioteki nltk). W funkcji get_terms stworzono także słownik dokumentów, aby w późniejszych etapach mieć łatwy dostęp do indeksu danego dokumentu.

In [366]:
terms, documents_dict = get_terms(dir)
print(terms)

['0' '00' '000' ... 'zzap64' 'zzebra' 'zzzzz']


In [367]:
print(len(terms))
terms_dict = {}
for i in range(len(terms)):
    terms_dict[terms[i]] = i

197360


Za pomocą słownika zaindeksowano posortowane termy. Następnie oba słowniki zapisano do plików o rozszerzeniu .json.

In [368]:
print(len(terms_dict))
print(len(documents_dict))

197360
50293


In [369]:
json.dump(terms_dict, open( res_dir + os.sep + "terms_dict.json", 'w' ))
json.dump(documents_dict, open( res_dir + os.sep + "documents_dict.json", 'w' ))

In [370]:
def load_dicts():
    terms_dict = json.load( open( res_dir + os.sep + "terms_dict.json" ) )
    documents_dict = json.load( open( res_dir + os.sep + "documents_dict.json" ) )
    return terms_dict, documents_dict

## Obliczenie bag of words i term by document matrix

In [377]:
def get_term_by_document_matrix(terms_dict, dir, documents_dict):
    
    terms_count = len(terms_dict)
    documents_count = len(documents_dict)
    
    tbd = sparse.lil_matrix((terms_count, documents_count))
    for subdir, dirs, files in os.walk(dir):
        for file in files:
            file_path = dir + os.sep + file
            with open(file_path, encoding="utf-8") as f:
                for word in text_to_words(f.read()):
                    tbd[terms_dict[word], documents_dict[file]] += 1
    return sparse.csc_matrix(tbd)

In [378]:
tbd = get_term_by_document_matrix(terms_dict, dir, documents_dict)
sparse.save_npz(res_dir + os.sep + 'tbd_no_idf.npz', tbd)

In [479]:
print(tbd.shape)

(197360, 50293)


Utworzono macierz term by document poprzez wyznaczenie dla każdego dokumentu wektora bag of words. Do utworzenia macierzy term by document wykorzystano macierz lil z scipy.sparse.

In [504]:
def apply_idf(tbd, terms_count, documents_count):
    idf = []
    tbd_csr = sparse.csr_matrix(tbd)
    i = 0
    for row in tbd_csr:
        doc_count = row.count_nonzero()
        idf_i = np.log(documents_count/doc_count)
        idf.append(idf_i)
        row = row * idf_i
    return idf, tbd_csr

In [505]:
idf, tbd_csr = apply_idf(tbd, len(terms_dict), len(documents_dict))

In [506]:
tbd_csc = sparse.csc_matrix(tbd_csr)
sparse.save_npz(res_dir + os.sep + 'tbd.npz', tbd_csc)

Następnie wyznaczono inverse document frequency dla każdego dokumentu. Liczbę dokumentów, w której wystąpiło dane słowo obliczno jako liczbę niezerowych komórek w rzędzie macierzy term by document. Tak przetworzoną macierz term by document zapisano do pliku.

In [382]:
def get_col_norms(tbd, documents_count):
    
    col_norms = []
    
    for i in range(documents_count):
        col_norm = sparse.linalg.norm(tbd.getcol(i))
        col_norms.append(col_norm)
    return sparse.csc_matrix(col_norms)

In [383]:
col_norms = get_col_norms(tbd_csc, len(documents_dict))

In [384]:
sparse.save_npz(res_dir + os.sep + 'col_norms.npz', col_norms)

In [385]:
def print_results(results):
    for res in results:
        file_path = dir+os.sep+res[1]
        with open(file_path, encoding="utf-8") as f:
            text = f.read()
            print(text)
            print("\n")

In [386]:
def query_to_bow(query, terms_dict):

    query_words = text_to_words(query)
    query_words = clean_query(query_words, terms_dict)

    if not query_words:
        return []

    query_bow = sparse.lil_matrix((len(terms_dict), 1))

    for word in query_words:
        query_bow[terms_dict[word], 0] += 1
    return query_bow

def clean_query(query_words, terms_dict):
    query_words = [word for word in query_words if word in terms_dict]
    return query_words

Powyższe funkcje, używane we wszystkich wariantach wyszukiwarki przekształca zapytanie w wektor bag of words. Najpierw poddaje tekst tym samym przekształceniom, co dokumenty (usunięcie interpunkcji i stop words, stemming), usuwa z zapytania słowa niebędące w słowniku, a następnie tworzy wektor bag of words w postaci macierzy rzadkiej.

## Wyszukiwanie bez wcześniejszej normalizacji macierzy

In [387]:
def find_documents_not_normalized(query, k):
    
    terms_dict, documents_dict = load_dicts()
    
    query_bow = query_to_bow(query, terms_dict)
    
    if query_bow == []:
        return []
    
    query_bow_t = query_bow.transpose()
    
    tbd = sparse.load_npz(res_dir + os.sep + 'tbd.npz')
    col_norms = sparse.load_npz(res_dir + os.sep + 'col_norms.npz')
    results = []
    
    query_norm = sparse.linalg.norm(query_bow)
    
    for document in documents_dict.keys():
        i = documents_dict[document]
        col = tbd.getcol(i)
        res = query_bow_t.dot(col)[0,0] / (col_norms.getcol(i)[0,0] * query_norm)
        if i < k:
            results.append((res, document))
        else:
            results = sorted(results)
            if res > results[0][0]:
                results[0] = (res, document)
                
    return results

In [388]:
results = find_documents_not_normalized("songwriter of Porcupine Tree", 10)
print_results(results)

Screaming Trees
Screaming Trees were an American rock band formed in Ellensburg, Washington in 1985 by vocalist Mark Lanegan, guitarist Gary Lee Conner, bass player Van Conner and drummer Mark Pickerel. Pickerel had been replaced by Barrett Martin by the time the band reached its most successful period. Although widely associated with grunge, the band's sound incorporated hard rock and psychedelic elements. During Screaming Trees' existence the band released seven studio albums, five EPs, and three compilations. Screaming Trees is known as one of the pioneers of grunge along with the Melvins, Mudhoney, U-Men, Skin Yard, Soundgarden, Green River, and Malfunkshun. Screaming Trees rose to fame as part of the grunge movement of the early 1990s, along with bands such as Alice in Chains, Pearl Jam, Nirvana, and Soundgarden and was one of the most successful underground music acts of the 1990s. The band achieved one top ten single on the Modern Rock Tracks charts. Screaming Trees were plagued

Powyższa funkcja wyszukuje k dokumentów najbardziej podobnych do podanego zapytania. Wczytywana macierz term by document ma nieznormalizowane kolumny. Przy obliczaniu korelacji kolumna macierzy i wektor bag of words wprowadzonego zapytania dzielone są przez ich normy. Funkcję przetestowano dla zapytania "songwriter of Porcupine Tree". Artykuł o Stevenie Wilsonie, którego szukano, był czwartym wynikiem wyszukiwania. Pozostałe wyniki to artykuły, w których często występują słowa "songwriter" lub "tree".

## Wyszukiwanie wykorzystując znormalizowaną macierz term by document

In [507]:
def normalize_tbd(tbd_csc):
    return normalize(tbd_csc, axis = 0)

In [508]:
tbd_csc = normalize_tbd(tbd_csc)

In [509]:
sparse.save_npz(res_dir + os.sep + 'tbd_normalized.npz', tbd_csc)

In [392]:
def find_documents_normalized(query, k):
    
    terms_dict, documents_dict = load_dicts()
    
    query_bow = query_to_bow(query, terms_dict)
    
    if query_bow == []:
        return []
    
    query_bow = normalize(query_bow, axis = 1)
    query_bow_t = query_bow.transpose()
    
    tbd = sparse.load_npz(res_dir + os.sep + 'tbd_normalized.npz')
    
    results = []
    
    for document in documents_dict.keys():
        i = documents_dict[document]
        col = tbd.getcol(i)
        res = query_bow_t.dot(col)[0,0]
        if i < k:
            results.append((res, document))
        else:
            results = sorted(results)
            if res > results[0][0]:
                results[0] = (res, document)
                
    return results

In [393]:
results = find_documents_normalized("songwriter of Porcupine Tree", 10)
print_results(results)

Screaming Trees
Screaming Trees were an American rock band formed in Ellensburg, Washington in 1985 by vocalist Mark Lanegan, guitarist Gary Lee Conner, bass player Van Conner and drummer Mark Pickerel. Pickerel had been replaced by Barrett Martin by the time the band reached its most successful period. Although widely associated with grunge, the band's sound incorporated hard rock and psychedelic elements. During Screaming Trees' existence the band released seven studio albums, five EPs, and three compilations. Screaming Trees is known as one of the pioneers of grunge along with the Melvins, Mudhoney, U-Men, Skin Yard, Soundgarden, Green River, and Malfunkshun. Screaming Trees rose to fame as part of the grunge movement of the early 1990s, along with bands such as Alice in Chains, Pearl Jam, Nirvana, and Soundgarden and was one of the most successful underground music acts of the 1990s. The band achieved one top ten single on the Modern Rock Tracks charts. Screaming Trees were plagued

W przypadku powyższej funkcji macierz term by document wczytywana z pliku jest macierzą o znormalizowanych kolumnach. Dzięki temu wyszukiwanie przebiega szybciej. Wynik zapytania dla hasła "songwriter of Procupine Tree" pokrywa się z wynikiem uzyskanym przy zastosowaniu poprzedniej funkcji.

## Wyszukiwanie z użyciem Latent Semantic Indexing

In [518]:
def apply_svd(tbd, k):
    svd = TruncatedSVD(n_components=k).fit(tbd)
    tbd = svd.transform(tbd)
    return tbd, svd

In [522]:
tbd = sparse.load_npz(res_dir + os.sep + 'tbd_normalized.npz')
tbd_trans, svd = apply_svd(tbd, 500)
sparse.save_npz(res_dir + os.sep + 'tbd_trans.npz', sparse.csc_matrix(tbd_trans))
sparse.save_npz(res_dir + os.sep + 'svd_compoments_.npz', sparse.csc_matrix(svd.components_))

KeyboardInterrupt: 

Powyższa funkcja apply_svd stosuje svd i low rand approximation aby ograniczyć szum w macierzy term by document matrix.
Do zastosowania svd użyto biblioteki sklearn. Po zastosowaniu svd do plików zapisywane są macierze tbd po transformacji i svd.components_.

In [None]:
def find_documents_svd(query, k):
    
    terms_dict, documents_dict = load_dicts()
    
    query_bow = query_to_bow(query, terms_dict)
    
    if query_bow == []:
        return []
    
    query_bow = normalize(query_bow, axis = 1)
    query_bow_t = query_bow.transpose()
    query_bow_t = np.array(query_bow_t)
    tbd_trans = sparse.load_npz(res_dir + os.sep + 'tbd_trans.npz')
    svd_components = sparse.load_npz(res_dir + os.sep + 'svd_compoments_.npz')
    print(tbd_trans)
    
    similarities = query_bow_t.dot(tbd_trans)
    similarities = similarities.dot(svd_components).getrow(0).toarray()[0]
    
    i = 0
    results = []
    for document in documents_dict.keys():
        i = documents_dict[document]
        res = similarities[i]
        if i < k:
            results.append((res, document))
        else:
            results = sorted(results)
            if res > results[0][0]:
                results[0] = (res, document)
        
    return results

In [None]:
results = find_documents_svd("Steven John Wilson (born 3 November 1967) is an English musician, singer, songwriter and record producer, most closely associated with the progressive rock genre", 10)
print_results(results)

In [None]:
results = find_documents_svd("songwriter of Porcupine Tree", 10)
print_results(results)

In [None]:
results = find_documents_svd("singer and songwriter of Porcupine Tree", 10)
print_results(results)

In [None]:
results = find_documents_svd("Steven Wilson", 10)
print_results(results)

In [None]:
results = find_documents_svd("cat", 10)
print_results(results)

In [None]:
results = find_documents_svd("(Felis catus) is a domestic species of small carnivorous mammal", 10)
print_results(results)

In [None]:
results = find_documents_svd("it has a strong flexible body, quick reflexes, sharp teeth and retractable claws adapted to killing small prey", 10)
print_results(results)

In [441]:
results = find_documents_normalized("it has a strong flexible body, quick reflexes, sharp teeth and retractable claws adapted to killing small prey", 10)
print_results(results)

Melanism
The term melanism refers to black pigment and is derived from the Greek: μελανός. Melanism is the increased development of the dark-colored pigment melanin in the skin or hair. Pseudomelanism, also called abundism, is another variant of pigmentation, identifiable by dark spots or enlarged stripes, which cover a large part of the body of the animal, making it appear melanistic.The morbid deposition of black matter, often of a malignant character causing pigmented tumors, is called melanosis. Adaptation Melanism related to the process of adaptation is called adaptive. Most commonly, dark individuals become fitter to survive and reproduce in their environment as they are better camouflaged. This makes some species less conspicuous to predators, while others, such as black panthers, use it as a foraging advantage during night hunting. Typically, adaptive melanism is heritable: A dominant allele, which is entirely or nearly entirely expressed in the phenotype, is responsible for th

In [405]:
results = find_documents_normalized("(Felis catus) is a domestic species of small carnivorous mammal", 10)
print_results(results)

National Coalition Against Domestic Violence
National Coalition Against Domestic Violence (NCADV) is a 501(c)(3) nonprofit organization with the mission to be the voice of victims and survivors of domestic violence. National Coalition Against Domestic Violence's vision is to create a culture where domestic violence is not tolerated and where society empowers victims and survivors while holding abusers accountable. Current Work Legislative Policy Advocacy National Coalition Against Domestic Violence works with members of Congress to improve legislation dealing with domestic violence. Responding to the problem of domestic violence offenders who fight with victims for custody of their children, National Coalition Against Domestic Violence advocates for legislation that keeps the best interest of the children in mind.In 1994, National Coalition Against Domestic Violence was part of a team to pass the Violence Against Women Act to provide funding for investigation into domestic violence and

In [406]:
results = find_documents_normalized("singer and songwriter of Porcupine Tree", 10)
print_results(results)

Joshua Tree, California
Joshua Tree is a census-designated place (CDP) in San Bernardino County, California, United States. The population was 7,414 at the 2010 census. At approximately 2,700 feet (820 meters) above sea level, Joshua Tree and its surrounding communities are located in the High Desert of California. The center of the business district in Joshua Tree is on California State Route 62. Geography Joshua Tree is located in the Mojave Desert at 34°8′N 116°19′W.According to the United States Census Bureau, the CDP has a total all land area of 95.9 km² (37.0 mi²). Joshua Tree, California is home to Joshua Tree National Park. Joshua Tree shares the border to its east with Twentynine Palms, California, its western border with Yucca Valley, California, its northwestern border with Landers, California, and its southern border is Coachella Valley, California. Demographics 2010 The 2010 United States Census reported that Joshua Tree had a population of 7,414. The population density wa

In [407]:
results = find_documents_normalized("Steven Wilson", 10)
print_results(results)

Harold Wilson
James Harold Wilson, Baron Wilson of Rievaulx (11 March 1916 – 24 May 1995) was a British Labour politician who served as Prime Minister of the United Kingdom from 1964 to 1970 and 1974 to 1976. Entering Parliament in 1945, Wilson was appointed a parliamentary secretary in the Attlee ministry and rose quickly through the ministerial ranks; he became Secretary for Overseas Trade in 1947 and was elevated to Cabinet shortly thereafter as President of the Board of Trade. In opposition to the next Conservative government, he served as Shadow Chancellor (1955–1961) and Shadow Foreign Secretary (1961–1963). After Labour Party leader Hugh Gaitskell died suddenly in 1963, Wilson won the subsequent leadership election. After narrowly winning the 1964 general election, Wilson saw an increased majority in a snap election in 1966. Wilson's first period as Prime Minister coincided with a period of low unemployment and relative economic prosperity, though hindered by significant problem

In [408]:
results = find_documents_normalized("cat", 10)
print_results(results)

Cat People (1942 film)
Cat People is a 1942 American horror film directed by Jacques Tourneur, produced by Val Lewton, and starring Simone Simon, Kent Smith, Jane Randolph and Tom Conway. The plot focuses on a Serbian fashion illustrator in New York City who believes herself to be descended from a race of people who shape shift into panthers when sexually aroused or angered. DeWitt Bodeen wrote the original screenplay, which was based on Lewton's short story The Bagheeta, published in 1930. Shot in Los Angeles, Cat People premiered in New York City on December 5, 1942, and was given a wide theatrical release on Christmas Day. The film was a moderate critical and commercial success at the time of its release. It was followed by one sequel, The Curse of the Cat People (1944). In the intervening years, Cat People was subject of critical reappraisal, and noted for its visual influence, particularly the work of cinematographer Nicholas Musuraca. In 1993, the film was selected for preservati

Zastosowanie LSI wpłynęło znacząco na rodzaj znajdywanych artykułów. Tematycznie były bardziej powiązane z wyszukiwaną frazą. Jednak kluczowe artykuły pojawiły się dalej w wynikach, niż przy zastosowaniu jedynie macierzy znormalizowanej. Prawdopodobnie również negatywnie wpłynęła na wyszukanie konkretnych nazw własnych.

## Wpływ IDF na wyniki wyszukiwań

Sprawdzono także wpływ przekształcenia IDF na wyniki w przypadku macierzy znormalizowanej.

In [510]:
def normalize_no_idf():
    tbd_no_idf = sparse.load_npz(res_dir + os.sep + 'tbd_no_idf.npz')
    tbd_no_idf = normalize_tbd(tbd_no_idf)
    sparse.save_npz(res_dir + os.sep + 'tbd_no_idf_normalized.npz', tbd_no_idf)

    
normalize_no_idf()

In [511]:
def normalized_without_idf(query, k):
    terms_dict, documents_dict = load_dicts()
    
    query_bow = query_to_bow(query, terms_dict)
    
    if query_bow == []:
        return []
    
    query_bow = normalize(query_bow, axis = 1)
    query_bow_t = query_bow.transpose()
    
    tbd = sparse.load_npz(res_dir + os.sep + 'tbd_no_idf_normalized.npz')

    
    results = []
    
    for document in documents_dict.keys():
        i = documents_dict[document]
        col = tbd.getcol(i)
        res = query_bow_t.dot(col)[0,0]
        if i < k:
            results.append((res, document))
        else:
            results = sorted(results)
            if res > results[0][0]:
                results[0] = (res, document)
                
    return results
    

In [512]:
results = normalized_without_idf("it has a strong flexible body, quick reflexes, sharp teeth and retractable claws adapted to killing small prey", 10)
print_results(results)

Melanism
The term melanism refers to black pigment and is derived from the Greek: μελανός. Melanism is the increased development of the dark-colored pigment melanin in the skin or hair. Pseudomelanism, also called abundism, is another variant of pigmentation, identifiable by dark spots or enlarged stripes, which cover a large part of the body of the animal, making it appear melanistic.The morbid deposition of black matter, often of a malignant character causing pigmented tumors, is called melanosis. Adaptation Melanism related to the process of adaptation is called adaptive. Most commonly, dark individuals become fitter to survive and reproduce in their environment as they are better camouflaged. This makes some species less conspicuous to predators, while others, such as black panthers, use it as a foraging advantage during night hunting. Typically, adaptive melanism is heritable: A dominant allele, which is entirely or nearly entirely expressed in the phenotype, is responsible for th

In [513]:
results = normalized_without_idf("(Felis catus) is a domestic species of small carnivorous mammal", 10)
print_results(results)

National Coalition Against Domestic Violence
National Coalition Against Domestic Violence (NCADV) is a 501(c)(3) nonprofit organization with the mission to be the voice of victims and survivors of domestic violence. National Coalition Against Domestic Violence's vision is to create a culture where domestic violence is not tolerated and where society empowers victims and survivors while holding abusers accountable. Current Work Legislative Policy Advocacy National Coalition Against Domestic Violence works with members of Congress to improve legislation dealing with domestic violence. Responding to the problem of domestic violence offenders who fight with victims for custody of their children, National Coalition Against Domestic Violence advocates for legislation that keeps the best interest of the children in mind.In 1994, National Coalition Against Domestic Violence was part of a team to pass the Violence Against Women Act to provide funding for investigation into domestic violence and

In [514]:
results = normalized_without_idf("singer and songwriter of Porcupine Tree", 10)
print_results(results)

Joshua Tree, California
Joshua Tree is a census-designated place (CDP) in San Bernardino County, California, United States. The population was 7,414 at the 2010 census. At approximately 2,700 feet (820 meters) above sea level, Joshua Tree and its surrounding communities are located in the High Desert of California. The center of the business district in Joshua Tree is on California State Route 62. Geography Joshua Tree is located in the Mojave Desert at 34°8′N 116°19′W.According to the United States Census Bureau, the CDP has a total all land area of 95.9 km² (37.0 mi²). Joshua Tree, California is home to Joshua Tree National Park. Joshua Tree shares the border to its east with Twentynine Palms, California, its western border with Yucca Valley, California, its northwestern border with Landers, California, and its southern border is Coachella Valley, California. Demographics 2010 The 2010 United States Census reported that Joshua Tree had a population of 7,414. The population density wa

In [515]:
results = normalized_without_idf("Steven Wilson", 10)
print_results(results)

Harold Wilson
James Harold Wilson, Baron Wilson of Rievaulx (11 March 1916 – 24 May 1995) was a British Labour politician who served as Prime Minister of the United Kingdom from 1964 to 1970 and 1974 to 1976. Entering Parliament in 1945, Wilson was appointed a parliamentary secretary in the Attlee ministry and rose quickly through the ministerial ranks; he became Secretary for Overseas Trade in 1947 and was elevated to Cabinet shortly thereafter as President of the Board of Trade. In opposition to the next Conservative government, he served as Shadow Chancellor (1955–1961) and Shadow Foreign Secretary (1961–1963). After Labour Party leader Hugh Gaitskell died suddenly in 1963, Wilson won the subsequent leadership election. After narrowly winning the 1964 general election, Wilson saw an increased majority in a snap election in 1966. Wilson's first period as Prime Minister coincided with a period of low unemployment and relative economic prosperity, though hindered by significant problem

In [516]:
results = normalized_without_idf("cat", 10)
print_results(results)

Cat People (1942 film)
Cat People is a 1942 American horror film directed by Jacques Tourneur, produced by Val Lewton, and starring Simone Simon, Kent Smith, Jane Randolph and Tom Conway. The plot focuses on a Serbian fashion illustrator in New York City who believes herself to be descended from a race of people who shape shift into panthers when sexually aroused or angered. DeWitt Bodeen wrote the original screenplay, which was based on Lewton's short story The Bagheeta, published in 1930. Shot in Los Angeles, Cat People premiered in New York City on December 5, 1942, and was given a wide theatrical release on Christmas Day. The film was a moderate critical and commercial success at the time of its release. It was followed by one sequel, The Curse of the Cat People (1944). In the intervening years, Cat People was subject of critical reappraisal, and noted for its visual influence, particularly the work of cinematographer Nicholas Musuraca. In 1993, the film was selected for preservati

## Prosty interfejs webowy

Stworzono prosty interfejs webowy w frameworku Flask. Do jego utworzenia najważniejsze funkcje przeniesiono do odpowiednich (Search oraz TextPreprocessor). Instancja klasy Search jest tworzona przy uruchomieniu aplikacji i służy do wykonywania zapytań. Wcześniej obliczone macierze odczytuje z plików. Aplikacja webowa znajduje się w pliku search_webpage.py. Dostępne są opcje wyszukiwania: korzystając z LSI, korzystając z macierzy znormalizowanej, korzystając z macierzy nieznormalizowanej. Część wizualna zrealizowana została za pomocą Bootstrap.