In [3]:
import math
import operator
import os
import xml.etree.ElementTree as et
from statistics import mean

from pymystem3 import Mystem

from dictionary import *
from document import *
from graph import *


def average_doc_size_in_words(docs):
    """Return the arithmetic mean of the ``words_cnt`` attribute over ``docs``.

    :param docs: iterable of objects exposing a ``words_cnt`` attribute.
    :return: whatever ``statistics.mean`` yields for those values.
    :raises statistics.StatisticsError: propagated from ``mean`` when ``docs``
        is empty.
    """
    word_counts = map(operator.attrgetter("words_cnt"), docs)
    return mean(word_counts)


def average_doc_size_in_bytes(docs):
    """Return the arithmetic mean of the ``bytes_cnt`` attribute over ``docs``.

    :param docs: iterable of objects exposing a ``bytes_cnt`` attribute.
    :return: whatever ``statistics.mean`` yields for those values.
    :raises statistics.StatisticsError: propagated from ``mean`` when ``docs``
        is empty.
    """
    byte_counts = map(operator.attrgetter("bytes_cnt"), docs)
    return mean(byte_counts)


def average_doc_text_to_html_ratio(docs):
    """Return the arithmetic mean of ``text_to_html_ratio`` over ``docs``.

    :param docs: iterable of objects exposing a ``text_to_html_ratio``
        attribute.
    :return: whatever ``statistics.mean`` yields for those values.
    :raises statistics.StatisticsError: propagated from ``mean`` when ``docs``
        is empty.
    """
    ratios = map(operator.attrgetter("text_to_html_ratio"), docs)
    return mean(ratios)


def rank_frequency(dictionary):
    """Build the rank-frequency series for a ``{word: frequency}`` mapping.

    Frequencies are ordered from the most to the least frequent and paired
    with their 1-based rank, so rank 1 is the most frequent word. (The
    earlier version sorted ascending and started at 0, which made rank 0
    the rarest word and mirrored the curve.)

    :param dictionary: mapping from word to its frequency.
    :return: list of ``(rank, frequency)`` tuples with non-increasing
        frequency; an empty list for an empty mapping.
    """
    frequencies = sorted(dictionary.values(), reverse=True)
    return list(enumerate(frequencies, start=1))


def stem_words(words, stem):
    """Lemmatize ``words`` in a single call to ``stem.lemmatize``.

    The words are joined with single spaces into one string, which is then
    handed to the stemmer; its result is returned unchanged.

    :param words: iterable of strings.
    :param stem: object exposing ``lemmatize(text)`` (the script passes a
        ``Mystem`` instance).
    :return: the value returned by ``stem.lemmatize``.
    """
    text = " ".join(words)
    lemmas = stem.lemmatize(text)
    return lemmas


def parse_xml(filename):
    """Parse one collection XML file into a list of ``Document`` objects.

    For every ``document`` child of the root element, the first three
    sub-elements are read positionally as content, URL and numeric id.
    Content and URL are passed through ``decode_base64_cp1251`` (defined
    elsewhere in the project) before the ``Document`` is built.

    A document that cannot be decoded or constructed is reported on stdout
    and skipped; the remaining documents are still processed.

    :param filename: path of the XML file to read.
    :return: list of successfully built ``Document`` objects, in file order.
    :raises ValueError: if a document id is not an integer (the ``int()``
        conversion is deliberately outside the best-effort ``try``).
    """
    docs = []
    root = et.parse(filename).getroot()

    for cnt, child in enumerate(root):
        if child.tag == "document":
            content = child[0].text
            url = child[1].text
            doc_id = int(child[2].text)
            try:
                doc = Document(decode_base64_cp1251(content), doc_id, decode_base64_cp1251(url))
            except Exception as err:
                # Catch Exception rather than using a bare ``except`` so that
                # Ctrl-C / SystemExit still stop the run. ``url`` is None for
                # an empty element, so it is formatted instead of
                # concatenated to keep the handler itself from raising.
                print("Unable to parse {}: {!r}".format(url, err))
            else:
                docs.append(doc)
        # TODO remove this for final result
        # Debug cap: stop once the child at index 50 has been handled.
        if cnt == 50:
            break

    return docs


# Folder that holds the collection's XML files.
XML_FOLDER = "byweb_for_course"


# Accumulators filled while walking the collection:
#   docs_stat  - one calc_doc_stats() result per parsed document
#   dictionary - receives the lemmatized, lower-cased words of each document
#   graph      - receives the per-document stats via add_document()
docs_stat = []
mystem = Mystem()
dictionary = Dictionary()
graph = LinkGraph()

for filename in os.listdir(XML_FOLDER):
    if filename.endswith(".xml"):
        docs = parse_xml(os.path.join(XML_FOLDER, filename))
        for doc in docs:
            doc_stats = doc.calc_doc_stats()
            docs_stat.append(doc_stats)
            graph.add_document(doc_stats)
            # Keep only alphanumeric lemmas, normalised to lower case.
            stemmed_words = [word.lower() for word in stem_words(doc.words, mystem) if word.isalnum()]
            dictionary.add_doc_words(stemmed_words)
        # Debug shortcut: stop after the first XML file. It sits inside the
        # ``if`` so that a non-XML first directory entry does not end the
        # loop before anything has been parsed.
        break  # remove it


In [None]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode before any iplot() call below.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook - confirm against the plotly docs.
init_notebook_mode(connected=True)

    
def plot_histogram(data, title, x_axis_title, y_axis_title, step):
    """Plot ``data`` as a histogram with fixed-width bins.

    Bins start at 0, end at ``max(data)`` and are ``step`` wide. The trace is
    handed to ``plot_figure`` together with the titles.

    :param data: values to bin; ``max(data)`` is taken, so it must be
        non-empty.
    :param title: figure title.
    :param x_axis_title: label for the x axis.
    :param y_axis_title: label for the y axis.
    :param step: bin width.
    """
    bins = {"start": 0, "end": max(data), "size": step}
    trace = go.Histogram(x=data, xbins=bins)
    plot_figure(trace, title, x_axis_title, y_axis_title)


def plot_line(data, title, x_axis_title, y_axis_title):
    """Plot a sequence of ``(x, y)`` pairs as a line chart.

    Uses ``go.Scatter`` in ``"lines"`` mode; ``go.Line`` is a deprecated
    graph object in the plotly API generation this notebook relies on.

    :param data: iterable of two-element entries ``(x, y)``.
    :param title: figure title.
    :param x_axis_title: label for the x axis.
    :param y_axis_title: label for the y axis.
    """
    # Materialise once so a one-shot iterable is not exhausted by the x pass.
    points = list(data)
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    line_graph = go.Scatter(x=xs, y=ys, mode="lines")
    plot_figure(line_graph, title, x_axis_title, y_axis_title)


def plot_figure(data, title, x_axis_title, y_axis_title):
    """Wrap a single trace in a titled figure and show it with ``iplot``.

    :param data: one plotly trace; it is placed in a one-element list.
    :param title: figure title.
    :param x_axis_title: label for the x axis.
    :param y_axis_title: label for the y axis.
    """
    x_axis = go.layout.XAxis(title=go.layout.xaxis.Title(text=x_axis_title))
    y_axis = go.layout.YAxis(title=go.layout.yaxis.Title(text=y_axis_title))
    figure_layout = go.Layout(title=title, xaxis=x_axis, yaxis=y_axis)
    iplot(go.Figure(data=[data], layout=figure_layout))


def float_to_str(n):
    """Format ``n`` as fixed-point text with exactly two decimal places.

    :param n: number to format.
    :return: string such as ``"3.14"`` or ``"2.00"``.
    """
    template = "{0:.2f}"
    return template.format(n)


XML_FOLDER = "byweb_for_course"

# Start from fresh accumulators and walk every .xml file in the folder.
docs_stat = []
mystem = Mystem()
dictionary = Dictionary()

for filename in os.listdir(XML_FOLDER):
    if not filename.endswith(".xml"):
        continue
    docs = parse_xml(os.path.join(XML_FOLDER, filename))
    for doc in docs:
        docs_stat.append(doc.calc_doc_stats())
        # Keep only alphanumeric lemmas, normalised to lower case.
        stemmed_words = [
            token.lower()
            for token in stem_words(doc.words, mystem)
            if token.isalnum()
        ]
        dictionary.add_doc_words(stemmed_words)

# --- Per-document size statistics ---
print("Total documents count: ", len(docs_stat), sep="")
print("Average document length: ", float_to_str(average_doc_size_in_words(docs_stat)), " words", sep="")
print("Average document length: ", float_to_str(average_doc_size_in_bytes(docs_stat)), " bytes", sep="")
print("Average text content to HTML content ratio: ", float_to_str(average_doc_text_to_html_ratio(docs_stat)), sep="")
plot_histogram(
    data=list(map(operator.attrgetter("words_cnt"), docs_stat)),
    title="Length in words distribution",
    x_axis_title="words",
    y_axis_title="documents",
    step=250,
)
plot_histogram(
    data=list(map(operator.attrgetter("bytes_cnt"), docs_stat)),
    title="Length in bytes distribution",
    x_axis_title="bytes",
    y_axis_title="documents",
    step=5000,
)

# --- Dictionary / collection vocabulary statistics ---
print("Collection stop words ratio: ", float_to_str(dictionary.stop_words_proportion(in_collection=True)), sep="")
print("Collection latin words ratio: ", float_to_str(dictionary.latin_words_proportion(in_collection=True)), sep="")
print("Dictionary latin words ratio: ", float_to_str(dictionary.latin_words_proportion(in_collection=False)), sep="")
print("Dictionary average word length: ", float_to_str(dictionary.average_dic_word_len()), sep="")
print("Collection average word length: ", float_to_str(dictionary.average_word_len()), sep="")

# Key functions: collection frequency, and log10(N / document frequency).
print(
    "Words with largest collection frequency: ",
    dictionary.most_popular_word(lambda term: dictionary.dict[term].cnt),
    sep="",
)
print(
    "Words with smallest inverse document frequency: ",
    dictionary.most_popular_word(
        lambda term: math.log10(len(docs_stat) / dictionary.dict[term].doc_cnt),
        get_max=False,
    ),
    sep="",
)

# --- Rank-frequency plot over collection frequencies ---
rf = rank_frequency({word: dictionary.dict[word].cnt for word in dictionary.dict})
plot_line(rf, "Rank-frequency", "rank", "frequency")

In [None]:
# `graph` is the LinkGraph created and filled via add_document() in the first
# cell; the later cell that rebuilds docs_stat/dictionary does not touch it.
# NOTE(review): show() presumably renders the link graph - confirm in graph.py.
graph.show()