In [350]:
from urllib.request import urlopen
import json
from sklearn.preprocessing import normalize
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
import time
import random

from yahoofinancials import YahooFinancials
import math
from sklearn.manifold import MDS
from sklearn.neighbors import LocalOutlierFactor as LOF
from sklearn.svm import OneClassSVM
from scipy.stats import entropy
from matplotlib import pyplot as plt
from scipy.stats import energy_distance
import seaborn as sns
import datetime
import warnings
from scipy.special import kl_div
import csv
from datetime import date, timedelta
warnings.filterwarnings('ignore')

## Statistical metrics processing

1. Make a function to parse JSON files from https://financialmodelingprep.com/
2. Base on Apple ticket, retrieve the list of key statistics. The only one excluded - date.

In [286]:
def get_jsonparsed_data(url):
    response = urlopen(url)
    data = response.read().decode("utf-8")
    return json.loads(data)

url = ("https://financialmodelingprep.com/api/v3/company-key-metrics/AAPL?period=quarter")
apple = get_jsonparsed_data(url)

keys = []
for key in apple['metrics'][0]:
    if not key == 'date':
        keys.append(key)
keys

['Revenue per Share',
 'Net Income per Share',
 'Operating Cash Flow per Share',
 'Free Cash Flow per Share',
 'Cash per Share',
 'Book Value per Share',
 'Tangible Book Value per Share',
 'Shareholders Equity per Share',
 'Interest Debt per Share',
 'Market Cap',
 'Enterprise Value',
 'PE ratio',
 'Price to Sales Ratio',
 'POCF ratio',
 'PFCF ratio',
 'PB ratio',
 'PTB ratio',
 'EV to Sales',
 'Enterprise Value over EBITDA',
 'EV to Operating cash flow',
 'EV to Free cash flow',
 'Earnings Yield',
 'Free Cash Flow Yield',
 'Debt to Equity',
 'Debt to Assets',
 'Net Debt to EBITDA',
 'Current ratio',
 'Interest Coverage',
 'Income Quality',
 'Dividend Yield',
 'Payout Ratio',
 'SG&A to Revenue',
 'R&D to Revenue',
 'Intangibles to Total Assets',
 'Capex to Operating Cash Flow',
 'Capex to Revenue',
 'Capex to Depreciation',
 'Stock-based compensation to Revenue',
 'Graham Number',
 'Graham Net-Net',
 'Working Capital',
 'Tangible Asset Value',
 'Net Current Asset Value',
 'Invested Cap

In [287]:
SP500_symbols = ['AAPL', 'ABT', 'ABBV', 'ACN', 'ACE', 'ADBE', 'ADT', 'AAP', 'AES', 'AET', 'AFL', 'AMG', 'A', 'GAS', 'ARE', 'APD', 'AKAM', 'AA', 'AGN', 'ALXN', 'ALLE', 'ADS', 'ALL', 'ALTR', 'MO', 'AMZN', 'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AMP', 'ABC', 'AME', 'AMGN', 'APH', 'APC', 'ADI', 'AON', 'APA', 'AIV', 'AMAT', 'ADM', 'AIZ', 'T', 'ADSK', 'ADP', 'AN', 'AZO', 'AVGO', 'AVB', 'AVY', 'BHI', 'BLL', 'BAC', 'BK', 'BCR', 'BXLT', 'BAX', 'BBT', 'BDX', 'BBBY', 'BRK.B', 'BBY', 'BLX', 'HRB', 'BA', 'BWA', 'BXP', 'BSX', 'BMY', 'BRCM', 'BF.B', 'CHRW', 'CA', 'CVC', 'COG', 'CAM', 'CPB', 'COF', 'CAH', 'HSIC', 'KMX', 'CCL', 'CAT', 'CBG', 'CBS', 'CELG', 'CNP', 'CTL', 'CERN', 'CF', 'SCHW', 'CHK', 'CVX', 'CMG', 'CB', 'CI', 'XEC', 'CINF', 'CTAS', 'CSCO', 'C', 'CTXS', 'CLX', 'CME', 'CMS', 'COH', 'KO', 'CCE', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CSC', 'CAG', 'COP', 'CNX', 'ED', 'STZ', 'GLW', 'COST', 'CCI', 'CSX', 'CMI', 'CVS', 'DHI', 'DHR', 'DRI', 'DVA', 'DE', 'DLPH', 'DAL', 'XRAY', 'DVN', 'DO', 'DTV', 'DFS', 'DISCA', 'DISCK', 'DG', 'DLTR', 'D', 'DOV', 'DOW', 'DPS', 'DTE', 'DD', 'DUK', 'DNB', 'ETFC', 'EMN', 'ETN', 'EBAY', 'ECL', 'EIX', 'EW', 'EA', 'EMC', 'EMR', 'ENDP', 'ESV', 'ETR', 'EOG', 'EQT', 'EFX', 'EQIX', 'EQR', 'ESS', 'EL', 'ES', 'EXC', 'EXPE', 'EXPD', 'ESRX', 'XOM', 'FFIV', 'FB', 'FAST', 'FDX', 'FIS', 'FITB', 'FSLR', 'FE', 'FISV', 'FLIR', 'FLS', 'FLR', 'FMC', 'FTI', 'F', 'FOSL', 'BEN', 'FCX', 'FTR', 'GME', 'GPS', 'GRMN', 'GD', 'GE', 'GGP', 'GIS', 'GM', 'GPC', 'GNW', 'GILD', 'GS', 'GT', 'GOOGL', 'GOOG', 'GWW', 'HAL', 'HBI', 'HOG', 'HAR', 'HRS', 'HIG', 'HAS', 'HCA', 'HCP', 'HCN', 'HP', 'HES', 'HPQ', 'HD', 'HON', 'HRL', 'HSP', 'HST', 'HCBK', 'HUM', 'HBAN', 'ITW', 'IR', 'INTC', 'ICE', 'IBM', 'IP', 'IPG', 'IFF', 'INTU', 'ISRG', 'IVZ', 'IRM', 'JEC', 'JBHT', 'JNJ', 'JCI', 'JOY', 'JPM', 'JNPR', 'KSU', 'K', 'KEY', 'GMCR', 'KMB', 'KIM', 'KMI', 'KLAC', 'KSS', 'KRFT', 'KR', 'LB', 'LLL', 'LH', 'LRCX', 'LM', 'LEG', 'LEN', 'LVLT', 'LUK', 'LLY', 'LNC', 'LLTC', 'LMT', 'L', 'LOW', 'LYB', 'MTB', 'MAC', 'M', 'MNK', 'MRO', 'MPC', 'MAR', 'MMC', 'MLM', 'MAS', 'MA', 'MAT', 'MKC', 'MCD', 'MCK', 'MJN', 'MMV', 'MDT', 'MRK', 'MET', 'KORS', 'MCHP', 'MU', 'MSFT', 'MHK', 'TAP', 'MDLZ', 'MON', 'MNST', 'MCO', 'MS', 'MOS', 'MSI', 'MUR', 'MYL', 'NDAQ', 'NOV', 'NAVI', 'NTAP', 'NFLX', 'NWL', 'NFX', 'NEM', 'NWSA', 'NEE', 'NLSN', 'NKE', 'NI', 'NE', 'NBL', 'JWN', 'NSC', 'NTRS', 'NOC', 'NRG', 'NUE', 'NVDA', 'ORLY', 'OXY', 'OMC', 'OKE', 'ORCL', 'OI', 'PCAR', 'PLL', 'PH', 'PDCO', 'PAYX', 'PNR', 'PBCT', 'POM', 'PEP', 'PKI', 'PRGO', 'PFE', 'PCG', 'PM', 'PSX', 'PNW', 'PXD', 'PBI', 'PCL', 'PNC', 'RL', 'PPG', 'PPL', 'PX', 'PCP', 'PCLN', 'PFG', 'PG', 'PGR', 'PLD', 'PRU', 'PEG', 'PSA', 'PHM', 'PVH', 'QRVO', 'PWR', 'QCOM', 'DGX', 'RRC', 'RTN', 'O', 'RHT', 'REGN', 'RF', 'RSG', 'RAI', 'RHI', 'ROK', 'COL', 'ROP', 'ROST', 'RLD', 'R', 'CRM', 'SNDK', 'SCG', 'SLB', 'SNI', 'STX', 'SEE', 'SRE', 'SHW', 'SPG', 'SWKS', 'SLG', 'SJM', 'SNA', 'SO', 'LUV', 'SWN', 'SE', 'STJ', 'SWK', 'SPLS', 'SBUX', 'HOT', 'STT', 'SRCL', 'SYK', 'STI', 'SYMC', 'SYY', 'TROW', 'TGT', 'TEL', 'TE', 'TGNA', 'THC', 'TDC', 'TSO', 'TXN', 'TXT', 'HSY', 'TRV', 'TMO', 'TIF', 'TWX', 'TWC', 'TJX', 'TMK', 'TSS', 'TSCO', 'RIG', 'TRIP', 'FOXA', 'TSN', 'TYC', 'UA', 'UNP', 'UNH', 'UPS', 'URI', 'UTX', 'UHS', 'UNM', 'URBN', 'VFC', 'VLO', 'VAR', 'VTR', 'VRSN', 'VZ', 'VRTX', 'VIAB', 'V', 'VNO', 'VMC', 'WMT', 'WBA', 'DIS', 'WM', 'WAT', 'ANTM', 'WFC', 'WDC', 'WU', 'WY', 'WHR', 'WFM', 'WMB', 'WEC', 'WYN', 'WYNN', 'XEL', 'XRX', 'XLNX', 'XL', 'XYL', 'YHOO', 'YUM', 'ZBH', 'ZION', 'ZTS']

In [288]:
# retrieve the statistical data for all SP500 companies 
data = []
for symbol in SP500_symbols:
    url = "https://financialmodelingprep.com/api/v3/company-key-metrics/{}?period=quarter".format(symbol)
    data.append(get_jsonparsed_data(url))

In [289]:
# save only those companies which has metrics
cleaned_data = []
for symbol in data:
    try:
        symbol['metrics']
        cleaned_data.append(symbol)
    except:
        pass

In [290]:
# save only those companies which has all 41 metric
full_info_data = []
for symbol in cleaned_data:
    try:
        if len(symbol['metrics']) == 41:
            full_info_data.append(symbol)
    except:
        pass

convert date from dictionary key-value representation, to the matrix of features
resulted `database` variable is 3-D. 
1. First dimention - the list of periods (quarterly)
2. Second dimention - the list of companies
3. Third - list of features described given company in a given period (taken from key metrics)

So, for example [0][2] - is an array of features for the company with index 2 in the first quartal

In [291]:
databases = []
for i in range(41):
    companies = []
    for symbol in full_info_data:
        company = []
        for key in keys:
            try:
                company.append(float(symbol['metrics'][i][key]))
            except:
                company.append(0)
        companies.append(company)
    databases.append(companies)

In [292]:
# in each quartal we need to define the kernal = distance matrix between companies. 
# For that firstly normalize the data, and then calculate rbf kernel
def kernalise_database(database, norm="max", gamma=1)
    kernalizes_databases = []
    for base in databases:
        normal = normalize(np.array(base), norm="max")
        kernalizes_databases.append(rbf_kernel(normal, normal, gamma=1))
    return kernalizes_databases

In [363]:
# the list of all symbols which are in the database (~has full set of statistics in a given period of time)
keys_list = []
for symbol in full_info_data:
    keys_list.append(symbol['symbol'])
print(len(keys_list))

390


# Prices processing

In [294]:
def convert_data(point):
    return point['adjclose']

def create_distance_matrix(historical_prices, approach='corr'):
    prices_matrix = []
#   need to normalize data by the percantage of change -> escape effect of different scale
    for stock_prices in historical_prices:
        new_array = stock_prices.copy() # single stock time serie
        for i, element in enumerate(stock_prices):
            if i == 0 or stock_prices[i-1] == 0:
                new_array[i] = 10E-20 #initial change is zero
            else:
                new_array[i] = (stock_prices[i] - stock_prices[i-1])/stock_prices[i-1]
            
            if math.isnan(new_array[i]) or new_array[i] == 0:
                new_array[i] = 10E-20
#         build matrix back
        if len(prices_matrix) > 0:
            prices_matrix = np.vstack((prices_matrix, new_array))
        else:
            prices_matrix = np.array(new_array)
            
#     now prices_matrix - contains list of price changes for each company in a given period
# the next step - create a kernel (distance matrix) For that there are three possible approaches

# defaul one - correlation
    if approach=='corr':
        return np.absolute(np.corrcoef(prices_matrix))
    
#     entropy between two series
    if approach=='KL':
        distances_matrix = np.zeros((len(prices_matrix), len(prices_matrix)))
        for i, stock in enumerate(prices_matrix):
            for j, stock_2 in enumerate(prices_matrix):
                distances_matrix[i][j] = entropy(stock, qk=stock_2)
        return distances_matrix
    
#     custom - two companies closer when they have more corelated changed (the same direction of change in a day)
    if approach=='custom':
        distances_matrix = np.zeros((len(prices_matrix), len(prices_matrix)))
        np.shape(distance_matrix)
        for i, stock in enumerate(prices_matrix):
            for j, stock_2 in enumerate(prices_matrix):
                k = 0
                for q, price in enumerate(stock):
                    if (price > 0 and stock_2[q] > 0) or (price <= 0 and stock_2[q] <= 0):
                        k += 1
                if abs(k/len(stock) - 0.5) * 2 > 1:
                    print(k, abs(k/len(stock) - 0.5) * 2)
                distances_matrix[i][j] = abs(k/len(stock) - 0.5) * 2
        return distances_matrix

# from YahooFinancial dataset to price matrix
def create_prices_matrix(historical_stock_prices):
    prices_matrix = []
    company_names = []
    for stock_name in historical_stock_prices:
        try:
            new_array = list(map(convert_data, historical_stock_prices[stock_name]['prices']))
            if len(prices_matrix) > 0:
                prices_matrix = np.vstack((prices_matrix, new_array))
            else:
                prices_matrix = np.array(new_array)
            company_names.append(stock_name)
        except:
            pass
    return prices_matrix, company_names   

In [296]:
# find indexes of stat-dataset which doesn't have corresponding proper prices-set - this indexes will be removed
# in order to make stat-matrixes the same dimensionality with price-matrixes
def find_extra_keys(names):
    extra_symbols = []
    for i, key in enumerate(keys_list):
        if not key in names:
            extra_symbols.append(i)
    return extra_symbols

In [315]:
# returns two 3d matrix. The first one - prices, second - statistical
def load_prices_information():
    # original price matrixes (without normalisation and calculating the distance)
    price_original_matrixes = []
    # list of the companie's name which has full price history in a given quartal
    price_matrix_names = []
    # statistical matrixes without companies which doesn't have complete prices 
    adjusted_statistical_matrix = []
    # number of calendar days in each quartal
    intervals = [91, 91, 89, 90]
    # start date, and then with each step move it on corresponding number of days
    current_date = date(2009, 7, 1)
    # number of quartals
    number_of_intervals = len(kernalizes_databases)
    for i in range(number_of_intervals):
    #   calculate start and end date for given № quartal
        start_date = ""
        end_date = ""
        increased_days = 0
    #   for leap year, there is one additional day in the first quartal
        if i == 10 or i == 26:
            increased_days = 1
        start_date = current_date.strftime("%Y-%m-%d")
        end_date = (current_date + timedelta(days=intervals[i%4] + increased_days)).strftime("%Y-%m-%d")
        current_date = current_date + timedelta(days=intervals[i%4] + 1 + increased_days)

    #   get daily historical prices for companies which has statistical data
        historical_price_data = YahooFinancials(keys_list).get_historical_price_data(start_date, end_date, 'daily')
        matrix, names = create_prices_matrix(historical_price_data)

        price_original_matrixes.append(matrix)
        price_matrix_names.append(names)

    #   execution of this block is very long (more than an hour) - this is the track of progress
        print(i, start_date, end_date)
        
    for i in range(1, number_of_intervals):
        # some companies doesn't have proper prices list - remove them from statistical databes in corresponding quartal
        extra_keys = find_extra_keys(price_matrix_names[i])
        # minus one interval because stats and prices never comes together in one quartal. All statistics comes with delay
        # from 1 month to a quartal. Therefore all future combination of stats and prices will be with 1-quartal delay
        # for example 1th quartal 2010 of statistical data with 2nd quartal 2010 of prices
        stat_mat = np.delete(np.delete(kernalizes_databases[i-1], extra_keys, 0), extra_keys, 1)
        adjusted_statistical_matrix.append(stat_mat)
#     append the last stat matrix without modification - it will not be used, just to keep dimensionality the same
    adjusted_statistical_matrix.append(kernalizes_databases[-1])
    return price_original_matrixes, adjusted_statistical_matrix
    
    
# form distance matrixes from prices. Approach = 'corr'/'KL'/'custom'
# original matrixes - 3D quartal-company-prices
def calculate_price_distances(original_matrixes, approach='corr'):
        # matrixes of distances between companies based on price and default approach (correlation)
    price_distance_matrixes = []
    for matrix in original_matrixes:
        distance_matrix = create_distance_matrix(historical_prices=matrix, approach=approach)
        price_distance_matrixes.append(distance_matrix)
    return price_distance_matrixes
    

# Anomaly detection

In [298]:
# matrix: 2D array of distance measure of variables
# trevial approach based on mean distance 
# return the list of indexes with detected anomalies
def find_anomaly_custom(matrix, per_out):
    result = []
    mean_distance = np.mean(matrix) / 3
    # if cloud_number of nearest points located father than mean_distance, then it's anomaly  
    cloud_number = 10
    avg_distances = []
    for i, stock in enumerate(matrix):
        A = np.array(stock)
        idx = np.argpartition(A, cloud_number)
        avg_distances.append(np.mean(A[idx[:cloud_number]]))
        
    number_anomaly = int(per_out * len(matrix[0]))
    return np.argpartition(np.array(avg_distances), number_anomaly)[:number_anomaly]

# base on local outlier factor
def find_anomaly_lof(matrix, per_out):
    detector = LOF(metric='precomputed', contamination=per_out)
    inlines = detector.fit_predict(matrix)
    result = []
    for i, res in enumerate(inlines):
        if res == -1:
            result.append(i)
    return result

# based on SVM
def find_anomaly_svm(matrix, per_out):
    detector = OneClassSVM(kernel='precomputed', nu=per_out)
    inlines = detector.fit_predict(matrix)
    result = []
    for i, res in enumerate(inlines):
        if res == -1:
            result.append(i)
    return result

# use pre-computed matrix of distances!!
# general function to find outliers, approach one of ['custom', 'lof', 'svm']
def find_anomaly(matrix, approach, per_out):
    if approach=='custom':
        return find_anomaly_custom(matrix, per_out)
    if approach=='lof':
        return find_anomaly_lof(matrix, per_out)
    if approach=='svm':
        return find_anomaly_svm(matrix, per_out)

Combine two distance(kernel) matrixes. Statistical and price. 

stat_1, price_1 - training period.
stat_2, price_2 - test period

1. Apply sum or multiply operator for kernels. 
2. Based on new kernel, apply anomaly detector approach.
3. Compare two arrays of anomalieys. 
4. Return two values: the proporation of anomalies from the first (training) set which are also exist in the second (test) set. And the number of detected anomalies in the first set. 

In [322]:
def compare(stat_1, price_1, stat_2, price_2, detector, per_out, operation='sum'):
    K = (stat_1 + price_1) / 2
    K_test = (stat_2 + price_2) / 2
    if operation == 'mult':
        K = stat_1 * price_1
        K_test = stat_2 * price_2
    train_anomaly = find_anomaly(K, detector, per_out)
    test_anomaly = find_anomaly(K_test, detector, per_out)
    predicted_anomalies = 0
    correct_anomalies = []
    for anomaly in train_anomaly:
        if anomaly in test_anomaly:
            correct_anomalies.append(keys_list[anomaly])
            predicted_anomalies += 1
    return predicted_anomalies/len(train_anomaly), len(train_anomaly), correct_anomalies

In [346]:
# print the information about copmanies (name/sector/industry)
# the first argument - the postfix for filename
# the second - list of tickets of companies 
def print_table(number, companies):
#   get keys
    url = "https://financialmodelingprep.com/api/v3/company/profile/AAPL"
    default_data = get_jsonparsed_data(url)['profile']
    with open('companies_{}.csv'.format(number), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='!',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(default_data.keys())
        for symbol in companies:
            url = "https://financialmodelingprep.com/api/v3/company/profile/{}".format(symbol)
            data = get_jsonparsed_data(url)['profile']
            writer.writerow([data[x] for x in default_data.keys()])
#                 [data['companyName'], data['sector'], data['industry']])

# print_table(8, ['AAP', 'ALXN', 'AMZN', 'AAL', 'ABC', 'ADP', 'BLL', 'BAC', 'BBBY', 'CNP', 'CTXS', 'DE', 'DISCA', 'EMR', 'FITB', 'F', 'FCX', 'GPC', 'GILD', 'IP', 'MDLZ', 'MCO', 'NTAP', 'NWL', 'NSC', 'PFE', 'O', 'SJM', 'SWN', 'TEL', 'VRSN'])

In [334]:
price_original_matrixes, adjusted_statistical_matrix = load_prices_information()

KeyboardInterrupt: 

In [321]:
# calculate and create a huge table of results
with open('resutls_3.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',',
                        quotechar='|', quoting=csv.QUOTE_MINIMAL)
    steps = 10
    writer.writerow([
        'per_out', 
        'det-dist-oper', 
        'avg_len',
        *range(1, steps+1),
        *range(1, steps+1)])

    for distance_measure in ['corr', 'custom']:
        print(distance_measure)
        price_matrixes = calculate_price_distances(price_original_matrixes, distance_measure)
        for operation in ['sum', 'mult']:
            for detector in ['svm', 'lof', 'custom']:
                for per_out in np.arange(0.05, 0.4, 0.01):
                    step_means = []
                    step_std = []
                    avg_len = 0
                    for step in range(steps):
                        predicted_anomylies_percentage = []
                        predicted_length = []
                        for i in range(1, 40 - step):
                            a, b, c = compare(adjusted_statistical_matrix[i-1], 
                                           price_matrixes[i], 
                                           adjusted_statistical_matrix[i + step], 
                                           price_matrixes[i+1 + step],
                                           detector,
                                           per_out,
                                           operation)
                            predicted_anomylies_percentage.append(a)
                            predicted_length.append(b)
    #                     avg_len = np.mean(predicted_length)
                        avg_per = np.mean(predicted_anomylies_percentage)
                        avg_len = np.mean(predicted_length)
                        std = np.std(predicted_anomylies_percentage)
                        step_means.append(round(avg_per, 3))
                        step_std.append(round(std, 3))
                    writer.writerow([
                        round(per_out, 3), 
                        "{}-{}-{}".format(detector, distance_measure, operation), 
                        avg_len,
                        *step_means,
                        *step_std
                    ]),

# exampled list of "true" anomalies
# 'AAP', 'ADS', 'AIG', 'ABC', 'COF', 'CB', 'CVS', 'DE', 'EMR', 'FITB', 'FCX', 'FTR', 'GNW', 'GILD', 'MCD', 'NTAP', 'NWL', 'PNR', 'PSA', 'PVH', 'SWN', 'TXN'

corr
custom


In [349]:
# the following code in order to work with specific values/investigate companies

step_means = []
step_std = []
avg_len = 0
step = 0

predicted_anomylies_percentage = []
predicted_length = []
companies = []
# price_matrixes = calculate_price_distances(price_original_matrixes, "custom")
for i in range(3, 10 - step):
    a, b, c = compare(adjusted_statistical_matrix[i-1], 
                   price_matrixes[i], 
                   adjusted_statistical_matrix[i + step], 
                   price_matrixes[i+1 + step],
                   detector="svm",
                   per_out=0.12,
                   operation="sum")
    predicted_anomylies_percentage.append(a)
    predicted_length.append(b)
    companies.append(c)
    print(c)
#     print_table("{}-{}".format(i, round(a, 3)), c)

# avg_per = np.mean(predicted_anomylies_percentage)
# avg_len = np.mean(predicted_length)
# std = np.std(predicted_anomylies_percentage)
# step_means.append(round(avg_per, 3))
# step_std.append(round(std, 3))

['ABC', 'BAC', 'COF', 'CERN', 'CVS', 'DE', 'DG', 'EIX', 'EMR', 'FITB', 'F', 'FCX', 'FTR', 'GILD', 'MCD', 'MDLZ', 'MCO', 'MS', 'NWL', 'SWN', 'VRTX']
['ALXN', 'ADS', 'AAL', 'ABC', 'BAC', 'COF', 'CVS', 'DE', 'EIX', 'EMR', 'FITB', 'FCX', 'FTR', 'GILD', 'HES', 'MAS', 'MCD', 'MDLZ', 'MCO', 'NTAP', 'NWL', 'NI', 'PSA', 'RHT', 'SWN']
['ADS', 'AAL', 'AIG', 'ABC', 'BAC', 'COF', 'CMI', 'CVS', 'DE', 'DFS', 'EIX', 'EMR', 'FITB', 'FCX', 'FTR', 'GILD', 'INTC', 'IP', 'MCD', 'MDLZ', 'MCO', 'NTAP', 'NWL', 'NI', 'PSA', 'ROST', 'SWN', 'TGT', 'VRSN']
['AAL', 'AIG', 'ABC', 'BLL', 'BAC', 'BBBY', 'COF', 'CVX', 'CSCO', 'CVS', 'DE', 'ECL', 'EMR', 'FITB', 'F', 'FCX', 'FTR', 'GILD', 'IP', 'MKC', 'MCD', 'MDLZ', 'MCO', 'NTAP', 'NWL', 'NI', 'NUE', 'PSA', 'SWN', 'VRSN']
['AAP', 'ALXN', 'AAL', 'ABC', 'BLL', 'BAC', 'BBBY', 'CERN', 'CVX', 'CTXS', 'CVS', 'DE', 'EMR', 'ES', 'FITB', 'F', 'FCX', 'FTR', 'GILD', 'IP', 'KLAC', 'MAS', 'MCD', 'MDLZ', 'MCO', 'NTAP', 'NWL', 'PSA', 'SWN', 'VRSN']
['ALXN', 'AAL', 'ABC', 'ADP', 'BLL',

In [362]:
for per_out in np.arange(0.05, 0.4, 0.01):
    avg = []
    for i in range(100):
        k = int(len(keys_list) * per_out)
        a = random.choices(keys_list, k=k)
        b = random.choices(keys_list, k=k)

        random_predictor = 0
        for com_a in a:
            if com_a in b:
                random_predictor += 1

        avg.append(random_predictor / k)
    print(round(per_out, 2), "---", round(np.mean(avg),2))

0.05 --- 0.04
0.06 --- 0.05
0.07 --- 0.07
0.08 --- 0.08
0.09 --- 0.09
0.1 --- 0.1
0.11 --- 0.1
0.12 --- 0.11
0.13 --- 0.12
0.14 --- 0.13
0.15 --- 0.14
0.16 --- 0.15
0.17 --- 0.15
0.18 --- 0.15
0.19 --- 0.17
0.2 --- 0.18
0.21 --- 0.19
0.22 --- 0.2
0.23 --- 0.2
0.24 --- 0.21
0.25 --- 0.23
0.26 --- 0.24
0.27 --- 0.23
0.28 --- 0.24
0.29 --- 0.25
0.3 --- 0.26
0.31 --- 0.26
0.32 --- 0.27
0.33 --- 0.27
0.34 --- 0.29
0.35 --- 0.3
0.36 --- 0.3
0.37 --- 0.3
0.38 --- 0.32
0.39 --- 0.32
