In [35]:
import os
import re
import sys
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path 
from bs4 import BeautifulSoup

warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use('ggplot')

from pylab import rcParams
rcParams['figure.figsize'] = 12, 6

In [2]:
def clear_outlier(arr, IQR_clear=True):
    arr = arr[~np.isnan(arr)]
    if IQR_clear:
        q1, q3 = np.percentile(arr, [25, 75])
        iqr = q3 - q1

        lower_bound = q1 - (iqr * 1.5)
        upper_bound = q3 + (iqr * 1.5)
        mask = (arr < upper_bound) & (arr > lower_bound)
        arr = arr[mask]
    return arr


def get_arrays(path, column):
    # filenames
    root_path = Path(path)
    p = root_path.glob('**/*')
    files = [x for x in p if x.is_file()]

    # create empty np.array
    num_of_pd_dfs = len(files)
    num_of_blocks = 72
    total_time_array = np.empty((num_of_blocks,
                                 num_of_pd_dfs))
    total_time_array[:] = np.NaN

    # fill np.array
    for num, file in enumerate(files):
        df = pd.read_csv(file, index_col=0)
        total_time_array[df.s_id.values, num] = df[column].values
    return total_time_array


"""
def get_median(arr, IQR_clear=True):
    values_arr = []
    indexes = [i for i in range(0, len(arr))]
    for num, i in enumerate(arr):
        values_arr.append(np.median(clear_outlier(i, IQR_clear)))

    val_dict = dict(zip(indexes, values_arr))
    return val_dict
    
    """


def get_mean(arr, IQR_clear=True, mean_condition=False):
    values_arr = []
    indexes = [i for i in range(0, len(arr))]
    
    if mean_condition:
        for num, i in enumerate(arr):
            values_arr.append(
                np.mean(clear_outlier(i, IQR_clear)))    #среднее
    else:
        for num, i in enumerate(arr):
            values_arr.append(
                np.median(clear_outlier(i, IQR_clear)))  #медиана

    val_dict = dict(zip(indexes, values_arr))
    return val_dict

## Total times 

In [3]:
reports_path = 'data/DTEK_text/reports'
total_times = get_arrays(reports_path, "total")
total_times

array([[125.95199943,  23.66100049,  56.36100006, ...,  15.10199976,
         16.34800005,   0.        ],
       [ 28.21700072,  19.18499899,  33.41300035, ...,   5.54099989,
          6.62500024,   0.        ],
       [ 18.07299972,   8.23799992,   8.27499986, ...,  20.65699983,
          1.43199992,          nan],
       ...,
       [ 12.5389998 ,  45.63899946,  11.96499991, ...,   0.        ,
         18.85499978,          nan],
       [ 13.61199999,  32.87000012,  12.46199989, ...,   0.2579999 ,
          8.69099998,          nan],
       [ 24.07500029,  38.23599958,  51.50900006, ...,   1.46099973,
         17.97199965,          nan]])

In [4]:
median_times_dict = get_mean(total_times, True, False) #IQR=True, median
top_5_min_ind = sorted(
    median_times_dict, key=median_times_dict.get, reverse=False)[:5]
top_5_max_ind = sorted(
    median_times_dict, key=median_times_dict.get, reverse=True)[:5]

print(f"min gazetime blocks: {top_5_min_ind}")
print(f"max gazetime blocks: {top_5_max_ind}")

min gazetime blocks: [49, 48, 43, 60, 16]
max gazetime blocks: [4, 15, 5, 11, 0]


In [5]:
print(median_times_dict)

{0: 31.373999834060672, 1: 18.692999839782715, 2: 4.598999738693237, 3: 17.724000334739685, 4: 39.256500005722046, 5: 35.639000654220574, 6: 5.371999740600586, 7: 9.385000228881836, 8: 11.242000102996826, 9: 12.09800100326538, 10: 4.40149998664856, 11: 34.46800017356873, 12: 19.621000170707703, 13: 29.305499792099, 14: 16.502999544143677, 15: 37.112999677658074, 16: 2.6454999446868896, 17: 10.40150010585785, 18: 28.39800024032593, 19: 7.467000126838684, 20: 8.941999435424805, 21: 4.732500076293945, 22: 23.61899983882904, 23: 21.00000011920929, 24: 21.28100037574768, 25: 19.64049994945526, 26: 9.722000122070312, 27: 23.6949999332428, 28: 11.240999937057495, 29: 12.875, 30: 7.187000036239624, 31: 15.143500328063965, 32: 9.996999979019165, 33: 19.05899977684021, 34: 24.361999988555908, 35: 18.67599964141846, 36: 24.76449954509735, 37: 10.227500081062317, 38: 9.714000105857851, 39: 13.55299997329712, 40: 9.994999408721924, 41: 24.414000511169434, 42: 6.646000146865845, 43: 1.05099999904632

In [6]:
def open_html(path):
    with open(path, 'rb') as f:
        return f.read()


def clean_text(block: str):
    """
    Очистить блоки от пунктуации и лишнего шума такого как:
     - переносы строки (\n), 
     - сноски([1]), 
     - объединить цифры('200 000' -> '200000')
     ...

    Args:
        block (str): [блок из html]

    Returns:
        [type]: ["чистый" блок]
    """
    a = re.sub(
        r'(\d{3}\s\d{3})|(\d{1}\s\d{3}\s\d{3})|(\d{2}\s\d{3}\s\d{3})', 'flag', block)     
    b = re.findall(
        r'(\d{3}\s\d{3})|(\d{1}\s\d{3}\s\d{3})|(\d{2}\s\d{3}\s\d{3})', block) #поиск чисел вида (100 000 / 1 000 000 / 11 000 000)

    num_list = []
    for i in b:
        [num_list.append(j) for j in i if j]

    for i in range(0, len(num_list)):
        num_list[i] = num_list[i].replace(' ', '')
        

    for i in range(0, len(num_list)):
        a = a.replace('flag', num_list[i], 1)
    
    
    
    block = re.sub(r'\n', ' ', block)     # очистка от переносов строки (\n)
    
    block = re.sub(r'\[\d]', '', block)     # очистка от сносок ([1])
    block = re.sub(r'\s\у\.\е\.', 'ye', block)     #слияние 00_у.е. в 00уе
    block = re.findall(r'\w+', block)  #разбиение на слова (удаление символов)
    block = ' '.join(block)
    
    #block = re.sub(r'\s\у\s\е', 'ye', block)

    return block


def calc_words(block):

    sep_bloc_list = block.split(' ')
    words_quantity = len(sep_bloc_list)

    return words_quantity


def calc_speed(time, quantity):

    try:
        speed_arr = [quantity[i] / time[i] for i in range(0, len(time))]
    except ZeroDivisionError:
        print('zero devision')

    return speed_arr

# parse blocks 
wp = open_html("data/material.html")
soup = BeautifulSoup(wp, 'html.parser')
all_paragraphs = soup.select('div>p, ul, table')
waste_paragraphs = soup.select('li > ul')
material = [clean_text(item.get_text(" ", strip=True))
            for item in all_paragraphs if item not in waste_paragraphs]

In [7]:
material

['Вводная о компании Акселератор SmartTech специализируется на EdTech онлайн образование и HR Tech стартапах Инициатором и основателем этого проекта был один из передовых бизнесменов страны заинтересованный в развитии подобных проектов который выступал главным спонсором акселератора первые 3 года Этот проект с самого начала планировался как одно из направлений действующего бизнеса и с этого года учредитель ожидает что акселератор выйдет на самоокупаемость и расширит сферы деятельности',
 'Учредитель не принимает участия в управлении акселератором однако является активным участником комиссии по выбору страртапов для акселерации и питчинговых сессий Так же проводит мастер класс по бизнес стратегии Он считает своей личной миссией развитие сферы образования как одной из ключевых с точки зрения стратегии развития страны',
 'Программа акселерации длится 10 месяцев и стартует в феврале Годовой план акселерации выглядит таким образом',
 '1 Этап Февраль Март Набор участников стартапов Акселерат

##### Reading speed:

In [8]:
# calc reading speed
words_count = [calc_words(i) for i in material]
reading_speed = calc_speed(median_times_dict, words_count)
reading_speed[:5]

[2.0717791911707035,
 2.46081422961884,
 3.2615787893612946,
 1.9183028299406406,
 1.5538827962530692]

In [9]:
# option 1
times_in_min = np.fromiter(median_times_dict.values(), dtype=float) / 60
words_count = np.array(words_count)
reading_velocity = words_count / times_in_min
reading_velocity

array([ 124.30675147,  147.64885378,  195.69472736,  115.0981698 ,
         93.23296778,  127.94971566,  234.54952733,  166.22269174,
        160.11385728,  158.70390484,  354.42462904,  104.44470181,
        137.60766406,   83.94328769,  101.7996756 ,  109.93452525,
        272.16027785,  265.34634158,  141.55926354,  305.34350626,
        228.13689653,  228.20918808,  185.44392353,  185.71428466,
        188.90089418,  189.40454721,  283.89219969,  245.62143981,
        314.91860331,  274.95145631,  325.58786534,  245.6499435 ,
        288.08642653,  163.70218986,  187.17675077,  176.697369  ,
        273.77900319,  357.85871141,  352.06917467,  221.35320637,
        252.12607795,  371.09854224,  478.48328765, 1484.30066738,
        338.40117135,  476.13144281,  796.4601545 ,  842.69668509,
       1186.26435039, 1859.15482969,  862.40137777,  409.43248626,
        955.90959581,  391.06145759,  257.52357907,  407.67384568,
        482.63592952,  403.76849873,  748.41142014,  502.23574

In [10]:
# option 2
reading_speed_all = words_count.reshape(-1,1) / (total_times/60)
reading_speed_all[reading_speed_all == np.inf] = np.NaN
reading_velocity_median_dict = get_mean(reading_speed_all, True, False) #IQR=True, median
reading_velocity_median_dict

{0: 63.76195540639786,
 1: 97.30987501202578,
 2: 104.54901286986097,
 3: 86.10155761207349,
 4: 69.79307749202056,
 5: 100.52383140419776,
 6: 129.1057212743272,
 7: 101.70417339126374,
 8: 88.75301842370983,
 9: 96.8034682510335,
 10: 169.97167188262512,
 11: 75.85494868610132,
 12: 79.80587314247197,
 13: 51.94805221254269,
 14: 55.1497078278594,
 15: 75.12220264354514,
 16: 104.06128188012919,
 17: 112.00183731969054,
 18: 91.79664565040007,
 19: 100.8943211553343,
 20: 99.77501663426105,
 21: 104.78234714043892,
 22: 129.52676865565613,
 23: 122.66635263878626,
 24: 132.31303550635863,
 25: 121.40595920705655,
 26: 154.17271650586244,
 27: 165.62909929938377,
 28: 153.77264138899488,
 29: 161.3270741185806,
 30: 137.4530080885996,
 31: 142.14750319480456,
 32: 133.9472557479823,
 33: 113.44298835437627,
 34: 123.90372858634453,
 35: 116.05007789603536,
 36: 181.08490699402088,
 37: 203.20833267982175,
 38: 193.37658250259716,
 39: 123.04158941392554,
 40: 147.32534330254268,
 41: 

In [11]:
top_5_min_ind_speed = sorted(reading_velocity_median_dict,
                             key=reading_velocity_median_dict.get, reverse=False)[:5]
top_5_max_ind_speed = sorted(reading_velocity_median_dict,
                             key=reading_velocity_median_dict.get, reverse=True)[:5]


print(f"min reading speed blocks: {top_5_min_ind_speed}")
print(f"max reading speed blocks: {top_5_max_ind_speed}")

min reading speed blocks: [71, 13, 14, 0, 70]
max reading speed blocks: [49, 50, 43, 47, 52]


##### Comebacks:

In [12]:
res_comeback_array = get_arrays(reports_path, "comebacks")
comeback_medians_dict = get_mean(res_comeback_array, False, True)   #IQR = False, mean
comeback_medians_dict

{0: 2.826923076923077,
 1: 1.4711538461538463,
 2: 0.504950495049505,
 3: 1.4059405940594059,
 4: 1.5346534653465347,
 5: 1.46,
 6: 0.46,
 7: 0.58,
 8: 0.78,
 9: 0.64,
 10: 0.5,
 11: 1.77,
 12: 1.25,
 13: 1.7,
 14: 1.28,
 15: 1.63,
 16: 0.33,
 17: 0.98,
 18: 1.4646464646464648,
 19: 0.9090909090909091,
 20: 0.8787878787878788,
 21: 0.36363636363636365,
 22: 0.8585858585858586,
 23: 0.696969696969697,
 24: 0.6565656565656566,
 25: 0.797979797979798,
 26: 0.5454545454545454,
 27: 0.7575757575757576,
 28: 0.494949494949495,
 29: 0.6060606060606061,
 30: 0.45454545454545453,
 31: 0.696969696969697,
 32: 0.43434343434343436,
 33: 0.7676767676767676,
 34: 0.7272727272727273,
 35: 0.5656565656565656,
 36: 0.6632653061224489,
 37: 0.35353535353535354,
 38: 0.30303030303030304,
 39: 0.45918367346938777,
 40: 0.37755102040816324,
 41: 0.41836734693877553,
 42: 0.21649484536082475,
 43: 0.08247422680412371,
 44: 0.21649484536082475,
 45: 0.12371134020618557,
 46: 0.12244897959183673,
 47: 0.09183

In [13]:
top_10_min_ind = sorted(comeback_medians_dict,
                        key=comeback_medians_dict.get, reverse=False)[:10]
top_10_max_ind = sorted(comeback_medians_dict,
                        key=comeback_medians_dict.get, reverse=True)[:10]


print(f"min comebacks blocks: {top_10_min_ind}")
print(f"max comebacks blocks: {top_10_max_ind}")

min comebacks blocks: [49, 48, 43, 47, 50, 52, 46, 45, 57, 53]
max comebacks blocks: [0, 71, 11, 70, 13, 15, 4, 1, 18, 5]


- graphics(x=median_dict.keys(), 
           y=median_dict.values())
- regular exp comments
- fix 200000 у.е.->20000уе
- replace median to mean
- concat all data to one dataframe 
 e.g. median_df with cols=["block_id", "toal_time", "reading_speed", "comeback"]

In [14]:
import plotly 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)


In [15]:
def plot_data(dictionary, name):
    
    median_keys = np.fromiter(dictionary.keys(), dtype=int)
    median_values = np.fromiter(dictionary.values(), dtype=float)
    
    
    trace0 = go.Scatter(
            x = median_keys, y = median_values, name='data'
            )
    
    
    data = [trace0]
    layout = {'title': 'Statistics for {}'.format(name)}

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [16]:
def bar_plot(dictionary):
    median_keys = np.fromiter(dictionary.keys(), dtype=int).astype('str')
    median_values = np.fromiter(dictionary.values(), dtype=float)
    
    median_keys_str = []
    [median_keys_str.append(i + '_') for i in median_keys]
    
    
    layout = {'title': 'Statistics for comebacks', 'xaxis': {'title': 'blocks'}, 'yaxis': {'title': "name"}}
    
    fig = go.Figure([go.Bar(x=median_keys_str, y=median_values)], layout=layout)
    fig.update_layout(font_size=10, xaxis_tickangle=30)
    fig.update_layout(
        width = 1170,
            height = 500) 

    
    fig.show()

In [17]:
bar_plot(comeback_medians_dict)

In [18]:
plot_data(median_times_dict, 'reading times') 

In [19]:
plot_data(reading_velocity_median_dict, 'reading velocity')

In [20]:
plot_data(comeback_medians_dict, 'reading comebacks')

In [21]:
median_df = pd.DataFrame(data = [median_times_dict, reading_velocity_median_dict, comeback_medians_dict]).T
median_df.columns = ["total_time", "reading_speed", "comeback"]
median_df['block_id'] = median_df.index
#median_df.set_index('block_id', inplace=True)

In [22]:
median_df

Unnamed: 0,total_time,reading_speed,comeback,block_id
0,31.3740,63.761955,2.826923,0
1,18.6930,97.309875,1.471154,1
2,4.5990,104.549013,0.504950,2
3,17.7240,86.101558,1.405941,3
4,39.2565,69.793077,1.534653,4
...,...,...,...,...
67,20.5580,130.964884,1.333333,67
68,18.1050,139.103557,1.115789,68
69,16.8690,87.384877,1.297872,69
70,14.2235,65.774585,1.723404,70


In [23]:
def plot_data1(df, name):
    
    
    
    trace0 = go.Scatter(
            x = df.index, y = df[name], name='data'
            )
    
    
    data = [trace0]
    layout = {'title': 'Statistics for {}'.format(name), 'xaxis': {'title': 'blocks'}, 'yaxis': {'title': name}}
    
    fig = go.Figure(data=data, layout=layout)
    
    iplot(fig)

In [24]:
def bar_plot1(df, name):
    
    ind = np.array(df.index).astype(str)
    layout = {'title': 'Statistics for {}'.format(name), 'xaxis': {'title': 'blocks'}, 'yaxis': {'title': name}}
    
    fig = go.Figure([go.Bar(x=ind, y=df[name])], layout=layout)

    # fig.update_traces(texttemplate='%{text.2s}', textposition='outside')
    fig.show()

In [25]:
a = np.array(median_df.index).astype(str)
a
#[",".join(item) for item in a.astype(str)]

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
       '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45',
       '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56',
       '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67',
       '68', '69', '70', '71'], dtype='<U21')

In [26]:
plot_data1(median_df[['reading_speed']], 'reading_speed')

In [27]:
plot_data1(median_df[['total_time']], 'total_time')

In [28]:
bar_plot1(median_df[['comeback']], 'comeback')

In [29]:
import plotly.express as px


fig = px.bar(median_df, y='comeback', x='block_id', text='comeback')
fig.update_traces(texttemplate='%{text:.3}', textposition='outside')

fig.update_layout(uniformtext_minsize=4, uniformtext_mode='show')

fig.show()

In [33]:
a = np.array(median_df.index).astype(str)

p = []
for i in a:
    p.append(i+'_')
    
fig = go.Figure()
fig.add_trace(go.Bar(
    x=p,
    y=median_df['comeback'],
    # name='kjlt',
    #marker_color='indianred'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(font_size=10, xaxis_tickangle=0)
fig.update_layout(
    width = 1170,
height = 500)
fig.show()

In [None]:
a