In [1]:
import xml.etree.ElementTree as et
import math
import operator
from statistics import mean

from pymystem3 import Mystem

from dictionary import *
from document import *
from graph import *


def average_doc_size_in_words(docs):
    return mean([doc.words_cnt for doc in docs])


def average_doc_size_in_bytes(docs):
    return mean([doc.bytes_cnt for doc in docs])


def average_doc_text_to_html_ratio(docs):
    return mean([doc.text_to_html_ratio for doc in docs])


def average_word_length(words):
    return mean([len(word) for word in words])


def inverse_document_frequency(dictionary, docs):
    return {word: math.log10(len(docs) / dictionary[word].doc_cnt) for word in dictionary}


def most_popular_word(dictionary, limit=5, get_max=True):
    sorted_dictionary = sorted(dictionary.items(), key=operator.itemgetter(1))
    dictionary_top = sorted_dictionary[-limit:] if get_max else sorted_dictionary[:limit]
    max_values = [entry[1] for entry in dictionary_top]
    return [word for (word, _) in sorted_dictionary if dictionary[word] in max_values]


def stem_words(words, stem):
    return stem.lemmatize(" ".join(words))


def parse_xml(filename):
    docs = []
    tree = et.parse(filename)
    root = tree.getroot()

    # TODO remove this for final result
    cnt = 0

    for child in root:
        if child.tag == "document":
            content = child[0].text
            url = child[1].text
            doc_id = int(child[2].text)
            doc = Document(decode_base64_cp1251(content), doc_id, decode_base64_cp1251(url))
            docs.append(doc)
        # TODO remove this for final result
        if cnt == 50:
            break;
        cnt += 1

    return docs


XML_FOLDER = "byweb_for_course"


docs_stat = []
mystem = Mystem()
dictionary = Dictionary()
graph = LinkGraph()

for filename in os.listdir(XML_FOLDER):
    if filename.endswith(".xml"):
        docs = parse_xml(XML_FOLDER + os.sep + filename)
        for doc in docs:
            doc_stats = doc.calc_doc_stats()
            docs_stat.append(doc_stats)
            graph.add_document(doc_stats)
            stemmed_words = [word.lower() for word in stem_words(doc.words, mystem) if word.isalnum()]
            dictionary.add_doc_words(stemmed_words)
    break  # remove it


In [2]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

def plot_histogram(data, title, x_axis_title, y_axis_title, step):
    histogram = go.Histogram(
        x=data,
        xbins=dict(
            start=0,
            end=max(data),
            size=step
        ),
    )
    layout = go.Layout(
        title=title,
        xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(text=x_axis_title)),
        yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(text=y_axis_title))
    )
    fig = go.Figure(data=[histogram], layout=layout)
    fig.show()

print("Total documents count: " + str(len(docs_stat)))
print("Average document length: " + str(average_doc_size_in_words(docs_stat)) + " words")
print("Average document length: " + str(average_doc_size_in_bytes(docs_stat)) + " bytes")
print("Average text content to HTML content ratio: " + str(average_doc_text_to_html_ratio(docs_stat)))


plot_histogram(
    data=[stat.words_cnt for stat in docs_stat],
    title="Length in words distribution",
    x_axis_title="words",
    y_axis_title="documents",
    step=250
)
plot_histogram(
    data=[stat.bytes_cnt for stat in docs_stat],
    title="Length in bytes distribution",
    x_axis_title="bytes",
    y_axis_title="documents",
    step=5000
)

print("Collection stop words ratio: " + str(dictionary.stop_words_proportion(in_collection=True)))
print("Collection latin words ratio: " + str(dictionary.latin_words_proportion(in_collection=True)))
print("Dictionary latin words ratio: " + str(dictionary.latin_words_proportion(in_collection=False)))
print("Dictionary average word length: " + str(average_word_length(dictionary.dict.keys())))

print("Words with largest collection frequency: " + 
      str(most_popular_word({word: dictionary.dict[word].cnt for word in dictionary.dict})))
print("Words with smallest inverse document frequency: " +
      str(most_popular_word(inverse_document_frequency(dictionary.dict, docs_stat), get_max=False)))

Total documents count: 51
Average document length: 621.3921568627451 words
Average document length: 47313.74509803922 bytes
Average text content to HTML content ratio: 0.19455271362406043


AttributeError: 'Figure' object has no attribute 'show'

In [None]:
graph.show()