# Random analysis

In [None]:
import numpy as np
from pathlib import Path
import plotly.graph_objects as go

In [None]:
# Directory that is searched for the '*.npy' files loaded below.
DATA_DIR = 'similarity-graph/random/'

## Load data

In [None]:
# Sorted so the datasets are processed (and listed) in a reproducible order;
# Path.glob yields entries in arbitrary, filesystem-dependent order.
files = sorted(Path(DATA_DIR).glob('*.npy'))

# Maps each file stem to a flat array of that matrix's off-diagonal values.
data_dict = dict()
for file in files:
    # Load the adjacency matrix and take its complement (1 - value).
    # NOTE(review): presumably this turns similarities into distances -- confirm.
    matrix = 1 - np.load(file)

    # Drop the main diagonal with a positional mask instead of writing a -1
    # sentinel and filtering on it: the sentinel approach would also discard
    # any genuine off-diagonal entry that happens to equal -1.
    # Assumes the stored array is 2-D.
    off_diagonal = ~np.eye(*matrix.shape, dtype=bool)

    # Boolean indexing returns the kept values flattened in row-major order.
    data_dict[file.stem] = matrix[off_diagonal]

# Notebook cell output: the names of the datasets that were loaded.
data_dict.keys()

## Plot data

In [None]:
# Name of the loaded dataset (a key of data_dict) to visualise.
dataset = 'random100-full-en'

# Cumulative histogram of the selected values, normalised to probabilities.
histogram_trace = go.Histogram(
    x=data_dict[dataset],
    name=dataset,
    histnorm='probability',
    cumulative_enabled=True,
)

fig = go.Figure()
fig.add_trace(histogram_trace)

# Overlay mode plus half-transparent bars.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.50)
fig.show()

## Statistical threshold

In [None]:
# Tail probability: the threshold is the smallest observed value v such that
# the fraction of samples strictly greater than v is at most p.
p = 0.05

for key in data_dict:
    vals, counts = np.unique(data_dict[key], return_counts=True)

    # An empty dataset has no quantile; argmax on an empty array would raise.
    if vals.size == 0:
        print(key, 'threshold:', None)
        continue

    # Empirical survival function evaluated at each unique value:
    # 1 - CDF, where the CDF is a *single* cumulative sum of the counts
    # normalised by the total number of samples. (Accumulating twice, and
    # normalising by the sum of the cumulative counts, does not yield a CDF.)
    survival = 1 - np.cumsum(counts) / np.sum(counts)

    # np.argmax returns the first index where the condition holds; the last
    # entry of `survival` is always 0, so a match exists for any p >= 0.
    threshold_idx = np.argmax(survival <= p)
    threshold = vals[threshold_idx]
    print(key, 'threshold:', threshold)

## License
<small>Copyright (C) 2020 MaLGa ML4DS 

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see &lt;https://www.gnu.org/licenses/&gt;.</small>