In [1]:
import sys
import itertools

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import mutual_info_score
import networkx as nx
# https://networkx.github.io/documentation/stable/tutorial.html
import visJS2jupyter
import visJS2jupyter.visJS_module as visJS_module
# http://compbio.ucsd.edu/bringing-interactivity-network-visualization-jupyter-notebooks-visjs2jupyter/

sys.path.append("..") # Adds higher directory to python modules path for importing from src dir
from src.datasets import NyseStocksDataset, NyseSecuritiesDataset
from src.nlp_utils import *

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Build the stock-price dataset (split-adjusted prices, with the listed price and
# VIX feature columns) and the securities metadata, then load both.
ds = NyseStocksDataset(
    'OCMvOC-3C',
    file_path='../data/nyse/prices-split-adjusted.csv',
    features=['open', 'close', 'movement', 'vix_open', 'vix_close'],
)
securities = NyseSecuritiesDataset(file_path='../data/nyse/securities.csv')
ds.load()
securities.load()

HBox(children=(IntProgress(value=0, max=470), HTML(value='')))


time: 20.4 s


In [4]:
# Build the pairwise feature table. Each CSV is read as a labelled matrix and
# stacked into a Series keyed by (row label, column label); the two Series are
# outer-joined on that pair so a pair present in only one file is kept, with the
# missing value filled with 0.
# features = pd.read_csv('cointegration.csv', index_col=0)
coints = pd.read_csv('../reports/cointegration-10-to-12.csv', index_col=0).stack()
coocs = (
    pd.read_csv('../data/preprocessed/occurrences/cooccurrences.csv', index_col=0)
    .stack()
    .astype(float)
)
features = (
    pd.merge(
        coocs.reset_index(),
        coints.reset_index(),
        on=['level_0', 'level_1'],
        how='outer',
    )
    .set_index(['level_0', 'level_1'])
    .fillna(0)
)
features.columns = ['cooccurrence', 'cointegration']

FileNotFoundError: File b'../reports/cointegration-10-to-12.csv' does not exist

time: 285 ms


In [None]:
def generate_threshold_counts(features):
    """Count, for every distinct co-occurrence value, how many pairs reach at least that value.

    Args:
        features: DataFrame indexed by a two-level (first, second) pair index with
            a `cooccurrence` column and exactly one further column (the single
            remaining column is renamed to `count`).

    Returns:
        DataFrame indexed by the distinct `cooccurrence` values in ascending
        order, with one column `count` holding the number of kept rows whose
        co-occurrence is greater than or equal to the index value.
    """
    # Keep only rows where the first label sorts strictly before the second, so a
    # pair stored in both orders is counted once and self-pairs are dropped.
    keep_row = [first < second for first, second in features.index]
    unique_pairs = features[keep_row]

    # Rows per distinct co-occurrence value (groupby sorts the values ascending).
    per_value = unique_pairs.groupby('cooccurrence').count()
    per_value.columns = ['count']

    # Cumulative sum taken from the largest value downwards, restored to ascending order.
    descending_totals = per_value.iloc[::-1].cumsum()
    return descending_totals.iloc[::-1]

# Computed once from the global `features`. `top_edges` below reads this
# module-level table instead of recomputing it from its own `features` argument,
# so this line must be re-run whenever `features` changes.
threshold_counts = generate_threshold_counts(features)

def top_edges(features, n=100):
    """Return the rows of `features` above the cut-off that leaves roughly the `n` strongest edges.

    The cut-off is looked up in the module-level `threshold_counts` table (it is
    not recomputed from the `features` argument): the first co-occurrence value
    whose count is greater than `n` while the following value's count is at most
    `n`. Rows whose `cooccurrence` is strictly greater than it are returned.

    Args:
        features: DataFrame with a `cooccurrence` column.
        n: target upper bound on the number of counted edges.

    Raises:
        IndexError: if no entry of `threshold_counts` satisfies the condition.
    """
    counts = threshold_counts['count']
    too_many_here = counts > n
    few_enough_next = counts.shift(-1) <= n
    candidates = threshold_counts[too_many_here & few_enough_next]
    threshold = candidates.index[0]
    is_strong = features['cooccurrence'] > threshold
    return features[is_strong]

In [1]:
# https://github.com/ucsd-ccbb/visJS2jupyter/blob/master/visJS2jupyter/visJS_module.py
# http://compbio.ucsd.edu/bringing-interactivity-network-visualization-jupyter-notebooks-visjs2jupyter/
def display_interactive_graph(G, output_file=None):
    """Render graph `G` as an interactive visJS network.

    Each node gets a "<company name> (<industry>)" title, a colour chosen per
    industry and a position taken from a spring layout scaled by 1000. Each edge
    gets a width proportional to the pair's co-occurrence count.

    Reads the notebook globals `securities`, `plot`, `coocs` and `features`.

    Args:
        G: networkx graph whose nodes are accepted by
            `securities.get_company_name` / `securities.get_industry`
            (presumably ticker symbols -- TODO confirm).
        output_file: when truthy, the network is also exported to this file.

    Returns:
        The value returned by `visJS_module.visjs_network`.
    """
    # Prepare graph data
    node_ids = list(G.nodes())
    edge_list = list(G.edges())
    layout = nx.spring_layout(G)

    # (node, company name, industry) for every node
    described = []
    for node in node_ids:
        described.append((node, securities.get_company_name(node), securities.get_industry(node)))
    colors = plot.get_colors(np.unique([entry[2] for entry in described]))

    nodes_dict = []
    for n, comp, industry in described:
        nodes_dict.append({
            "id": n,
            "title": f'{comp} ({industry})',
            "color": colors[industry],
            "border_width": 0.3,
            "x": layout[n][0]*1000,
            "y": layout[n][1]*1000,
        })

    # Edges refer to nodes by their position in `node_ids`.
    position_of = {node: position for position, node in enumerate(node_ids)}
    edges_dict = []
    for edge in edge_list:
        n_articles = coocs[edge]
        edges_dict.append({
            "id": f'{n_articles:n} articles',
            "source": position_of[edge[0]],
            "target": position_of[edge[1]],
            "width": 5 * n_articles / features.cooccurrence.max(),
        })

    return visJS_module.visjs_network(
        nodes_dict,
        edges_dict,
        time_stamp=1000000,
        node_size_multiplier=7,
        edge_width_field='width',
        edge_label_field='none',
        graph_height=500,
        graph_width=900,
        export_network=bool(output_file),
        export_file=output_file,
    )

In [2]:
def generate_graph(edges):
    """Build an undirected graph with one weighted edge per row of `edges`.

    Args:
        edges: DataFrame indexed by (node, node) pairs with a `cooccurrence`
            column, e.g. the result of `top_edges`.

    Returns:
        An `nx.Graph` (graph attribute `title='number_of_shared_articles'`)
        whose edge weights are the row's co-occurrence divided by the maximum
        co-occurrence in the global `features` frame, with a lower bound of 0.2.
    """
    # The normalisation constant does not depend on the row, so compute it once
    # instead of re-scanning the whole `features` column for every edge.
    max_cooccurrence = features.cooccurrence.max()
    weighted_edges = [
        (pair[0], pair[1], max(cooccurrence / max_cooccurrence, 0.2))
        for pair, cooccurrence in edges.cooccurrence.items()
    ]
    G = nx.Graph(title='number_of_shared_articles')
    G.add_weighted_edges_from(weighted_edges)
    return G

In [None]:
# Minimum co-occurrence count -> number of edges left (presumably read off
# `threshold_counts` -- TODO confirm):
# 1occ -> 17147, 2cooc -> 9155, 5cooc -> 3969, 10cooc -> 2131, 25cooc -> 975, 185cooc -> 97, 272cooc -> 50
edges = top_edges(features, 50)
G = generate_graph(edges)
# Variant that additionally exports the network to a JSON file:
# display_interactive_graph(G, output_file=f'article_amounts_top{len(edges)}.json')
display_interactive_graph(G)

In [189]:
def ApEn(U, m, r):
    """Approximate entropy of the sequence `U`.

    For a window length `k`, phi(k) is the mean over all length-`k` windows of
    the log of the fraction of windows lying within distance `r` of it (each
    window counts itself), where distance is the largest element-wise absolute
    difference. The result is |phi(m + 1) - phi(m)|.

    Args:
        U: indexable sequence of numbers (e.g. a 1-D numpy array).
        m: window (embedding) length.
        r: tolerance; two windows match when their distance is <= r.

    Returns:
        The non-negative float |phi(m + 1) - phi(m)|.
    """
    N = len(U)

    def _maxdist(x_i, x_j):
        # Largest element-wise absolute difference between two windows.
        return max([abs(ua - va) for ua, va in zip(x_i, x_j)])

    def _phi(m):
        n_windows = N - m + 1
        windows = [[U[k] for k in range(start, start + m)] for start in range(n_windows)]

        # For each window, the fraction of all windows within tolerance r of it.
        C = []
        for w_i in windows:
            matches = 0
            for w_j in windows:
                if _maxdist(w_i, w_j) <= r:
                    matches += 1
            C.append(matches / (N - m + 1.0))
        return (N - m + 1.0)**(-1) * sum(np.log(C))

    return abs(_phi(m + 1) - _phi(m))

# Usage example: a strictly periodic series yields an ApEn very close to 0.
U = np.array([85, 80, 89] * 17)
print(ApEn(U, 2, 3))
# 1.0996541105257052e-05

# The same three values drawn at random. No seed is set in this cell, so the
# printed value changes from run to run.
randU = np.random.choice([85, 80, 89], size=17*3)
print(ApEn(randU, 2, 3))

1.0996541105257052e-05
0.7282717838314103
time: 210 ms


In [175]:
# Sine values lie in [-1, 1], so two windows never differ by more than 2. With
# r=3 every window matches every other one and ApEn is exactly 0.
x = np.sin(np.arange(100)).round(1)
ApEn(x, 2, 3)

0.0

time: 200 ms


In [202]:
# Pearson correlation of two small hand-made series.
# `import scipy` alone does not reliably load the `scipy.stats` submodule (it
# only works if another package imported it first), so import it explicitly.
import scipy.stats
x = np.array([1, 0, -1, 0, -1, -1, -1])
y = np.array([0, 1, 1, 0, 0, 1, 1])
scipy.stats.pearsonr(x, y)

(-0.5095246653650682, 0.2427680009919806)

time: 160 ms


In [198]:
# Two small scratch lists; no other visible cell reads them.
a = [1, 4, 6]
b = [1, 2, 3]

time: 183 ms


In [None]:

# Re-evaluates ApEn on `randU` from the usage-example cell; requires that cell
# to have been run in the current kernel.
ApEn(randU, 2, 3)