In [4]:
import lxml.etree as et

from dictionary import *
from utils import *
from document import *
from graph import *


def parse_xml(filename):
    """Stream one XML dump and fold every ``document`` element into the collection state.

    Children of each ``document`` element are read by position: child 0 is the
    page content, child 1 is the URL and child 2 is the integer document id.
    Content and URL are passed through ``decode_base64_cp1251`` before a
    ``Document`` is built from them.

    Side effects on module-level state:
      * appends the per-document stats to ``docs_stat``;
      * registers the stats with ``graph`` via ``add_document``;
      * feeds the document's words to ``dictionary`` via ``add_doc_words``.

    A document whose processing raises is reported on stdout and skipped; the
    remaining documents in the file are still processed.

    :param filename: path of the XML file to parse.
    """
    context = et.iterparse(filename, tag='document')

    for (_, elem) in context:
        content = elem[0].text
        url = elem[1].text
        doc_id = int(elem[2].text)
        # The text fields are already copied out, so the element's subtree can
        # be released to keep memory bounded while streaming.
        elem.clear()

        try:
            doc = Document(decode_base64_cp1251(content), doc_id, decode_base64_cp1251(url))
            stats = doc.calc_doc_stats()
            docs_stat.append(stats)
            graph.add_document(stats)
            dictionary.add_doc_words(doc.words)
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop
            # the run; the cause is printed so failures can be diagnosed.
            print("Unable to parse " + str(doc_id) + ": " + repr(exc))
        # NOTE: the loop previously ended with an unconditional `break`, which
        # stopped after the first document of every file; it has been removed
        # so that the whole file is processed.


# Directory that is scanned (non-recursively) for ``*.xml`` dumps.
XML_FOLDER = "byweb_for_course"


# Notebook-level state. ``docs_stat``, ``dictionary`` and ``graph`` are filled
# in by parse_xml(); ``mystem`` is not referenced in this cell
# (presumably used by later cells or helpers — TODO confirm).
docs_stat = []
mystem = Mystem()
dictionary = Dictionary()
graph = LinkGraph()

# Feed every XML file found directly inside XML_FOLDER to the parser.
for filename in os.listdir(XML_FOLDER):
    if not filename.endswith(".xml"):
        continue
    parse_xml(os.path.join(XML_FOLDER, filename))

In [None]:
# ---- Per-document size statistics -------------------------------------------
doc_total = len(docs_stat)
print("Total documents count: " + str(doc_total))
mean_words = average_doc_size_in_words(docs_stat)
print("Average document length: " + float_to_str(mean_words) + " words")
mean_bytes = average_doc_size_in_bytes(docs_stat)
print("Average document length: " + float_to_str(mean_bytes) + " bytes")
text_to_html = average_doc_text_to_html_ratio(docs_stat)
print("Average text content to HTML content ratio: " + float_to_str(text_to_html))

words_per_doc = [entry.words_cnt for entry in docs_stat]
plot_histogram(
    data=words_per_doc,
    title="Length in words distribution",
    x_axis_title="words",
    y_axis_title="documents",
    step=250
)
bytes_per_doc = [entry.bytes_cnt for entry in docs_stat]
plot_histogram(
    data=bytes_per_doc,
    title="Length in bytes distribution",
    x_axis_title="bytes",
    y_axis_title="documents",
    step=5000
)

# ---- Dictionary / collection word statistics --------------------------------
stop_ratio = dictionary.stop_words_proportion(in_collection=True)
print("Collection stop words ratio: " + float_to_str(stop_ratio))
latin_in_collection = dictionary.latin_words_proportion(in_collection=True)
print("Collection latin words ratio: " + float_to_str(latin_in_collection))
latin_in_dictionary = dictionary.latin_words_proportion(in_collection=False)
print("Dictionary latin words ratio: " + float_to_str(latin_in_dictionary))
mean_dictionary_word_len = dictionary.average_dic_word_len()
print("Dictionary average word length: " + float_to_str(mean_dictionary_word_len))
mean_collection_word_len = dictionary.average_word_len()
print("Collection average word length: " + float_to_str(mean_collection_word_len))


def collection_frequency(word):
    """Return the ``cnt`` field stored for ``word`` in ``dictionary.dict``."""
    return dictionary.dict[word].cnt


def inverse_document_frequency(word):
    """Return log10(number of parsed documents / ``doc_cnt`` stored for ``word``)."""
    return math.log10(len(docs_stat) / dictionary.dict[word].doc_cnt)


most_frequent = dictionary.most_popular_word(collection_frequency)
print("Words with largest collection frequency: " +
      str(most_frequent))
# get_max=False presumably selects the lowest-scoring words — confirm in
# Dictionary.most_popular_word.
lowest_idf = dictionary.most_popular_word(inverse_document_frequency,
                                          get_max=False)
print("Words with smallest inverse document frequency: " +
      str(lowest_idf))

# ---- Rank-frequency curve ---------------------------------------------------
word_counts = {token: entry.cnt for token, entry in dictionary.dict.items()}
rf = rank_frequency(word_counts)
plot_line(rf, "Rank-frequency", "rank", "frequency")

In [None]:
# `graph` is the LinkGraph instance populated by parse_xml() via add_document().
# show() is defined in the project's graph module — presumably it renders the
# link graph; confirm there.
graph.show()