# Analysis

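Run analysis across a set of files: per-file statistics, a histogram of control token counts, and the most frequently occurring control names.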

In [None]:
# silence spaCy warning W008 (similarity evaluated on empty vectors, e.g. whitespace-only text)
%env SPACY_WARNING_IGNORE=W008

import ipywidgets as widgets
import itertools
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
from dotenv import load_dotenv
import os

# offline mode: connected=False loads plotly.js from the local plotly package
# rather than a CDN, so figures render in the notebook without network access
py.init_notebook_mode(connected=False)

Re-run the cell below to reload the `fismatic` modules whenever Python code in the repository changes; this avoids restarting the kernel.

In [None]:
import importlib

import fismatic.core as fismatic
import fismatic.helpers as helpers

# Reload both project modules, core first and then helpers, so edits made on
# disk are picked up by the running kernel. A `for` statement produces no cell
# output, so nothing is echoed below the cell.
for module in (fismatic, helpers):
    importlib.reload(module)

## Load files

In [None]:
# Read settings from a local .env file (if present) into the environment.
load_dotenv()
# Default to an empty string when SSP_DOCS_FOLDER is unset: the Text widget's
# `value` must be a string, and passing None raises a traitlets TraitError.
SSP_DOCS_FOLDER = os.getenv('SSP_DOCS_FOLDER', '')

# Editable text field pre-filled from the environment; the file-loading cell
# reads the folder from path_widget.value.
path_widget = widgets.Text(description="Path:", value=SSP_DOCS_FOLDER)
display(path_widget)

In [None]:
# Collect the files found under the folder currently entered in the path widget.
files = fismatic.get_files(path_widget.value)
# Build one control set per file, in the same order as `files`.
control_sets = [fismatic.control_set_for(ssp_file) for ssp_file in files]

## Compare files

In [None]:
# One stats record per control set, tabulated with the "Filename" column
# promoted to the row index.
stats = [fismatic.stats_for(control_set) for control_set in control_sets]
df = pd.DataFrame(stats).set_index("Filename")
df

In [None]:
# Gather the implementation token counts of every control set and flatten the
# per-set results into a single sequence.
token_counts_per_set = [cs.implementation_token_counts() for cs in control_sets]
control_token_counts = helpers.flatten(token_counts_per_set)

# Histogram of token counts: x = number of tokens, y = number of controls.
data = [go.Histogram(x=control_token_counts)]
layout = go.Layout(
    title="Control token counts",
    xaxis=dict(title="Number of tokens"),
    yaxis=dict(title="Number of controls"),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic histogram')

In [None]:
from collections import Counter

# Flatten the control names of all control sets into one sequence, then count
# how often each name occurs.
names_per_set = [cs.control_names() for cs in control_sets]
control_names = helpers.flatten(names_per_set)
counter = Counter(control_names)

# Show the 20 most frequent control names as a two-column table.
top_controls = counter.most_common(20)
pd.DataFrame(top_controls, columns=["Control", "# occurrences"])