In [1]:
import math
import os
import xml.etree.ElementTree as et
from statistics import mean

from pymystem3 import Mystem

from dictionary import *
from document import *
from graph import *


def average_doc_size_in_words(docs):
    """Return the arithmetic mean of the ``words_cnt`` attribute over ``docs``.

    :param docs: iterable of objects exposing a numeric ``words_cnt`` attribute.
    :return: the mean word count, as computed by ``statistics.mean``.
    :raises statistics.StatisticsError: if ``docs`` is empty.
    """
    word_counts = [stats.words_cnt for stats in docs]
    return mean(word_counts)


def average_doc_size_in_bytes(docs):
    """Return the arithmetic mean of the ``bytes_cnt`` attribute over ``docs``.

    :param docs: iterable of objects exposing a numeric ``bytes_cnt`` attribute.
    :return: the mean byte count, as computed by ``statistics.mean``.
    :raises statistics.StatisticsError: if ``docs`` is empty.
    """
    byte_counts = [stats.bytes_cnt for stats in docs]
    return mean(byte_counts)


def average_doc_text_to_html_ratio(docs):
    """Return the arithmetic mean of ``text_to_html_ratio`` over ``docs``.

    :param docs: iterable of objects exposing a numeric
        ``text_to_html_ratio`` attribute.
    :return: the mean ratio, as computed by ``statistics.mean``.
    :raises statistics.StatisticsError: if ``docs`` is empty.
    """
    ratios = [stats.text_to_html_ratio for stats in docs]
    return mean(ratios)


def inverse_document_frequency(dictionary, docs):
    """Compute ``log10(N / doc_cnt)`` for every word in ``dictionary``.

    ``N`` is ``len(docs)``.

    :param dictionary: container that yields words when iterated and returns,
        for ``dictionary[word]``, an entry with a ``doc_cnt`` attribute.
    :param docs: sized collection of documents; only its length is used.
    :return: dict mapping each word to its inverse document frequency.
    """
    idf_by_word = {}
    for term in dictionary:
        idf_by_word[term] = math.log10(len(docs) / dictionary[term].doc_cnt)
    return idf_by_word


def stem_words(words, stem):
    """Lemmatize ``words`` in a single call to ``stem.lemmatize``.

    The words are joined with single spaces into one string, so the stemmer
    is invoked once for the whole sequence rather than once per word.

    :param words: iterable of strings.
    :param stem: object exposing ``lemmatize(text)`` (a ``Mystem`` instance
        in this notebook).
    :return: whatever ``stem.lemmatize`` returns for the joined text.
    """
    joined_text = " ".join(words)
    return stem.lemmatize(joined_text)


def parse_xml(filename, max_children=51):
    """Parse one collection XML file into a list of ``Document`` objects.

    For every root child tagged ``document`` the first three sub-elements are
    read positionally as content, URL and numeric document id. Content and
    URL are passed through ``decode_base64_cp1251`` before building the
    ``Document``. A record that cannot be read or decoded is reported on
    stdout and skipped; it does not abort the rest of the file.

    :param filename: path of the XML file to read.
    :param max_children: maximum number of root children to visit (counting
        children of any tag). The default of 51 reproduces the previous
        hard-coded debugging cut-off; pass ``None`` to read the whole file.
    :return: list of successfully constructed ``Document`` objects.
    :raises xml.etree.ElementTree.ParseError: if the file is not well-formed.
    """
    docs = []
    root = et.parse(filename).getroot()

    for index, child in enumerate(root):
        # TODO remove this limit for the final result (max_children=None)
        if max_children is not None and index >= max_children:
            break
        if child.tag != "document":
            continue

        url = None  # keeps the error message printable if extraction fails early
        try:
            content = child[0].text
            url = child[1].text
            doc_id = int(child[2].text)
            docs.append(Document(decode_base64_cp1251(content), doc_id, decode_base64_cp1251(url)))
        except Exception:
            # Deliberately best-effort, but Exception (not a bare except) so
            # KeyboardInterrupt / SystemExit still stop the notebook.
            # str() guards against url being None (empty <url/> element).
            print("Unable to parse " + str(url))

    return docs


XML_FOLDER = "byweb_for_course"


# Accumulators filled by the loop below.
docs_stat = []
mystem = Mystem()
dictionary = Dictionary()
graph = LinkGraph()

# For each parsed document: collect its statistics, register it in the link
# graph and add its lemmatized, lower-cased alphanumeric tokens to the dictionary.
for filename in os.listdir(XML_FOLDER):
    if not filename.endswith(".xml"):
        # os.listdir yields entries in arbitrary order, so a non-XML entry may
        # come first; skip it instead of letting the debug break end the loop.
        continue
    docs = parse_xml(os.path.join(XML_FOLDER, filename))
    for doc in docs:
        doc_stats = doc.calc_doc_stats()
        docs_stat.append(doc_stats)
        graph.add_document(doc_stats)
        stemmed_words = [word.lower() for word in stem_words(doc.words, mystem) if word.isalnum()]
        dictionary.add_doc_words(stemmed_words)
    break  # remove it (debug shortcut: stop after the first XML file)


In [2]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)


def plot_histogram(data, title, x_axis_title, y_axis_title, step):
    """Draw a plotly histogram of ``data`` inline in the notebook.

    Bins start at 0, end at ``max(data)`` and are ``step`` wide.

    :param data: non-empty sequence of numbers (``max(data)`` is taken).
    :param title: figure title.
    :param x_axis_title: label for the x axis.
    :param y_axis_title: label for the y axis.
    :param step: bin width, in the same units as ``data``.
    :return: None; the figure is rendered through ``iplot``.
    """
    bins = {"start": 0, "end": max(data), "size": step}
    trace = go.Histogram(x=data, xbins=bins)

    x_axis = go.layout.XAxis(title=go.layout.xaxis.Title(text=x_axis_title))
    y_axis = go.layout.YAxis(title=go.layout.yaxis.Title(text=y_axis_title))
    figure_layout = go.Layout(title=title, xaxis=x_axis, yaxis=y_axis)

    iplot(go.Figure(data=[trace], layout=figure_layout))


def float_to_str(n):
    """Format ``n`` as fixed-point text with exactly two decimal places.

    :param n: number to format.
    :return: string such as ``"1517.02"``.
    """
    return format(n, ".2f")


# Collection folder; same value as the assignment in the first cell.
XML_FOLDER = "byweb_for_course"

# Start from fresh accumulators: this rebinds docs_stat, mystem and dictionary
# that the first cell also defines. `graph` is not touched in this cell.
docs_stat = []
mystem = Mystem()
dictionary = Dictionary()

# Unlike the loop in the first cell, this one has no trailing `break`, so
# every *.xml entry of XML_FOLDER is parsed.
for filename in os.listdir(XML_FOLDER):
    if filename.endswith(".xml"):
        docs = parse_xml(XML_FOLDER + os.sep + filename)
        for doc in docs:
            docs_stat.append(doc.calc_doc_stats())
            # Lemmatize, keep only alphanumeric tokens, lower-case them.
            stemmed_words = [word.lower() for word in stem_words(doc.words, mystem) if word.isalnum()]
            dictionary.add_doc_words(stemmed_words)

# --- Per-document size statistics ---
print("Total documents count: " + str(len(docs_stat)))
print("Average document length: " + float_to_str(average_doc_size_in_words(docs_stat)) + " words")
print("Average document length: " + float_to_str(average_doc_size_in_bytes(docs_stat)) + " bytes")
print("Average text content to HTML content ratio: " + float_to_str(average_doc_text_to_html_ratio(docs_stat)))
# Histogram of document lengths in words, 250-word bins.
plot_histogram(
    data=[stat.words_cnt for stat in docs_stat],
    title="Length in words distribution",
    x_axis_title="words",
    y_axis_title="documents",
    step=250
)
# Histogram of document lengths in bytes, 5000-byte bins.
plot_histogram(
    data=[stat.bytes_cnt for stat in docs_stat],
    title="Length in bytes distribution",
    x_axis_title="bytes",
    y_axis_title="documents",
    step=5000
)

# --- Dictionary / collection vocabulary statistics ---
# NOTE(review): the in_collection flag presumably switches between counting
# every occurrence (True) and counting distinct dictionary entries (False);
# confirm against the Dictionary class.
print("Collection stop words ratio: " + float_to_str(dictionary.stop_words_proportion(in_collection=True)))
print("Collection latin words ratio: " + float_to_str(dictionary.latin_words_proportion(in_collection=True)))
print("Dictionary latin words ratio: " + float_to_str(dictionary.latin_words_proportion(in_collection=False)))
print("Dictionary average word length: " + float_to_str(dictionary.average_dic_word_len()))
print("Collection average word length: " + float_to_str(dictionary.average_word_len()))

# Rank words by the key function passed to most_popular_word: first by the
# entry's `cnt`, then by log10(N / doc_cnt), the same formula that
# inverse_document_frequency() computes for the whole dictionary.
# NOTE(review): get_max=False presumably selects the smallest key values;
# confirm against Dictionary.most_popular_word.
print("Words with largest collection frequency: " +
      str(dictionary.most_popular_word(lambda v: dictionary.dict[v].cnt)))
print("Words with smallest inverse document frequency: " +
      str(dictionary.most_popular_word(lambda v: math.log10(len(docs_stat) / dictionary.dict[v].doc_cnt), get_max=False)))

Total documents count: 102
Average document length: 1517.02 words
Average document length: 51890.25 bytes
Average text content to HTML content ratio: 0.19


Collection stop words ratio: 0.30
Collection latin words ratio: 0.13
Dictionary latin words ratio: 0.19
Dictionary average word length: 7.77
Collection average word length: 5.51
Words with largest collection frequency: [(2395, ['сон']), (2667, ['что']), (3748, ['в']), (4303, ['и']), (4309, ['вы'])]
Words with smallest inverse document frequency: [(0.022, ['в']), (0.035, ['на']), (0.05, ['и']), (0.064, ['форум']), (0.074, ['с'])]


In [3]:
# Display the link graph that the first cell populated via graph.add_document();
# this cell therefore requires the first cell to have been run.
graph.show()