In [1]:
import financedatabase as fd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import collections
import json
import csv
from pathlib import Path
import shutil

## Finance Database to Similar Stocks Industries

In [2]:
# One-off bootstrap: the snippet kept in the string literal below is what
# originally generated fd_to_ss_dict.json. It is not executed; run it manually
# only if the mapping file has to be rebuilt.
'''
fd_to_ss_dict = {}
for industry in fd.show_options("equities")["industries"]:
    if industry:
        fd_to_ss_dict[industry] = industry.replace(" - ", "-").replace(" ", "_").replace("&", "and")

with open('fd_to_ss_dict.json', 'w') as fp:
    json.dump(fd_to_ss_dict, fp, indent=4)
'''

# Load the pre-saved mapping from industry name to folder name
with open('fd_to_ss_dict.json', 'r') as mapping_file:
    fd_to_ss_dict = json.load(mapping_file)

# Preview the first six conversions, with the industry name padded to 40 columns
for fd_name, ss_name in list(fd_to_ss_dict.items())[:6]:
    print(f"{fd_name:<40} : {ss_name}")

Advertising Agencies                     : Advertising_Agencies
Aerospace & Defense                      : Aerospace_and_Defense
Aerospace Defense - Major Diversified    : Aerospace_Defense-Major_Diversified
Aerospace Defense Products & Services    : Aerospace_Defense_Products_and_Services
Agricultural Chemicals                   : Agricultural_Chemicals
Agricultural Inputs                      : Agricultural_Inputs


## Create Algorithm folder

In [3]:
# Root output directory; one sub-folder per industry is created beneath it.
algorithm_folder = "cosine_similarity"

# When True, industry folders that already exist on disk are processed again
# instead of being skipped.
override_existing = True

In [4]:
# Create the algorithm folder, then one sub-folder per industry. Inside each
# industry folder, write one "<ticker>.json" per equity that maps every ticker
# of that industry to its cosine similarity with <ticker>, sorted from most to
# least similar. Similarity is computed on TF-IDF vectors of the "summary" text.
Path(algorithm_folder).mkdir(parents=True, exist_ok=True)
for industry in fd.show_options("equities")["industries"]:
    # The options list can contain empty entries; skip them
    if not industry:
        continue

    print(f"\n{industry} ({algorithm_folder})")
    industry_path = os.path.join(algorithm_folder, fd_to_ss_dict[industry])

    # Leave already-generated industries alone unless a rebuild was requested
    if os.path.isdir(industry_path) and not override_existing:
        continue

    # Create industry folder
    Path(industry_path).mkdir(parents=True, exist_ok=True)

    try:
        df = pd.DataFrame(fd.select_equities(industry=industry)).T

        # A ranking needs at least two equities to compare
        if df.empty or len(df) <= 1:
            shutil.rmtree(industry_path)
            continue

        # NLP algorithm: the TF-IDF matrix depends only on the industry's
        # summaries, so fit it once here rather than once per ticker.
        tfidf_matrix = TfidfVectorizer().fit_transform(df["summary"].values)

        for idx, ticker in enumerate(df.index):
            print(f"{ticker}, ", end="")

            # One row of similarities at a time keeps memory proportional to
            # the number of tickers instead of its square.
            similarity = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)

            similarity_by_symbol = dict(zip(df.index, similarity[0]))
            similarity_ranked = dict(
                sorted(similarity_by_symbol.items(), key=lambda item: item[1], reverse=True)
            )

            # Save files
            ticker_path = os.path.join(industry_path, ticker)
            with open(f"{ticker_path}.json", 'w') as fp:
                json.dump(similarity_ranked, fp, indent=4)

        print("")

    except ValueError:
        # Raised when the vectorizer cannot process the summaries; drop the
        # (possibly partially written) industry folder in that case.
        shutil.rmtree(industry_path)


Advertising Agencies (cosine_similarity)
AATV, ACGX, ACTL, ADMG, ADV, BAOS, BLAB, BOMN, BXNG, BYOC, CCO, CKPDY, CMGO, CMPR, CNET, CNFN, COGV, CQER, CRTO, CYAGF, CYGIY, CYGT, DBMM, DLX, DMS, DNTUF, DNTUY, EEX, ENGA, EPGC, EZOO, FBCD, FLNT, GOOLF, GTNM, GTRL, HKUOY, HRTH, ICLK, IFUS, INEOF, INUV, IPG, ISIG, ISML, ITKH, JCDXF, KHZM, KNIT, LIGA, LOGX, MCHX, MDCA, MGNI, MMDDF, MMND, MOBO, MOBQ, MYRY, MYSL, NCMI, NHLPF, NWCN, NXMR, OMC, OPESF, PGPEF, PRKI, PROM, PSYC, PUBGY, PVSP, QNST, QUOT, RFII, SCPPF, SNIPF, SRAX, STCB, SWWI, TRBO, TRKA, TSQ, TTTPF, UCPA, VIZC, VPSN, WIMI, WPP, WPPGF, WWIO, XNET, YUGVF, ZULU, 

Aerospace & Defense (cosine_similarity)
AAIIQ, AAZZF, ADFS, AERG, AIR, AIRI, AJRD, AKRRF, APLD, ASDN, ASTC, ATRO, ATROB, AUTLF, AVAV, AVPFF, AXON, BA, BAESF, BAESY, BANT, BDRAF, BDRBF, BRSI, BUKS, BWXT, BYRND, CAE, CMGMY, CODA, CUB, CVU, DCO, DEWY, DFLYF, DLRWF, DSUS, DUAVF, EADSF, EADSY, EH, ERJ, ESLT, ETCC, FINMF, FINMY, FLY, FLYLF, FTGFF, HEI-A, HEI, HERXF, HII, HWKE, HXL, IPT