Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

In [2]:
def read_data():
    # Read in the s&p dataset
    stocks = pd.read_csv('sp500.csv')

    # Create array X of the all long business summaries
    X = np.array(stocks.longbusinesssummary)

    # Encode the textual data from X into vectors so that we can compute the cosine distance
    text_data = X
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    embeddings = model.encode(text_data, show_progress_bar=True)
    embed_data = embeddings
    X = np.array(embed_data)
    n_comp = 5
    pca = PCA(n_components=n_comp)
    pca.fit(X)
    pca_data = pd.DataFrame(pca.transform(X))
    pca_data.head()

    # Recommender function taken in modified form from:https://towardsdatascience.com/hands-on-content-based-recommender-system-using-python-1d643bf314e4
    cos_sim_data = pd.DataFrame(cosine_similarity(X))
    # Write cosine similarity dataframe to .csv
    cos_sim_data.to_csv("cosine_sim_data.csv", encoding='utf-8', index=False)
    stocks.to_csv("stock_data.csv", encoding='utf-8', index=False)
    return cos_sim_data, X, stocks

In [3]:
def give_recommendations(index, cos_sim_data, stocks,  print_recommendation=False, print_recommendation_longbusinesssummary=False, print_sectors=False):
    index_recomm = cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:21]
    stocks_recomm = stocks['symbol'].loc[[index_recomm]].values
    result = {'Stocks': stocks_recomm, 'Index': index_recomm}
    if print_recommendation == True:
        print('The watched stock is this one: %s \n' %(stocks['symbol'].loc[index]))
        k = 1
        for stock in stocks_recomm:
            print('The number %i recommended stock is this one: %s \n' %(k, stock))
    if print_recommendation_longbusinesssummary == True:
        print('The longbusinesssummary of the watched stock is this one:\n %s \n' %(stocks['longbusinesssummary'].loc[index]))
        k = 1
        for q in range(len(stocks_recomm)):
            plot_q = stocks['longbusinesssummary'].loc[index_recomm[q]]
            print('The longbusinesssummary of the number %i recommended stock is this one:\n %s \n' % (
                k, plot_q))
            k = k+1
    if print_sectors == True:
        print('The sector of the watched stock is this one:\n %s \n' %
              (stocks['sector'].loc[index]))
        k = 1
        for q in range(len(stocks_recomm)):
            plot_q = stocks['sector'].loc[index_recomm[q]]
            print('The sector of the number %i recommended stock is this one:\n %s \n' % (
                k, plot_q))
            k = k+1
    return result

In [4]:
def retrieve_data():
    stocks = pd.read_csv('stock_data.csv')
    cos_sim_data = pd.read_csv('cosine_sim_data.csv')
    return stocks, cos_sim_data

In [5]:
cos_sim_data, X, stocks = read_data()

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [6]:
print("Using Read Data (From Vectors)")
print(cos_sim_data)
print("-"*40)
print(stocks)

Using Read Data (From Vectors)
          0         1         2         3         4         5         6    \
0    1.000000  0.839805  0.688621  0.653047  0.811482  0.685936  0.715968   
1    0.839805  1.000000  0.853749  0.791327  0.750751  0.680766  0.718450   
2    0.688621  0.853749  1.000000  0.861346  0.673313  0.666379  0.677037   
3    0.653047  0.791327  0.861346  1.000000  0.613879  0.600303  0.629626   
4    0.811482  0.750751  0.673313  0.613879  1.000000  0.657426  0.749447   
..        ...       ...       ...       ...       ...       ...       ...   
488  0.686056  0.700278  0.621001  0.598488  0.548673  0.687927  0.668160   
489  0.802010  0.781172  0.795191  0.758079  0.815782  0.786920  0.807681   
490  0.783420  0.888580  0.863532  0.832415  0.766614  0.647149  0.638198   
491  0.728397  0.738428  0.668322  0.654986  0.666000  0.716688  0.719008   
492  0.669107  0.763582  0.801618  0.752104  0.621262  0.664196  0.622523   

          7         8         9    ...      

In [7]:
stocks,cos_sim_data = retrieve_data()

In [8]:
print("Using Retrieve Data (From CSV)")
print(cos_sim_data)
print("-"*40)
print(stocks)

Using Retrieve Data (From CSV)
            0         1         2         3         4         5         6  \
0    1.000000  0.839805  0.688621  0.653047  0.811482  0.685936  0.715968   
1    0.839805  1.000000  0.853749  0.791327  0.750751  0.680766  0.718450   
2    0.688621  0.853749  1.000000  0.861346  0.673313  0.666379  0.677037   
3    0.653047  0.791327  0.861346  1.000000  0.613879  0.600303  0.629626   
4    0.811482  0.750751  0.673313  0.613879  1.000000  0.657426  0.749447   
..        ...       ...       ...       ...       ...       ...       ...   
488  0.686056  0.700278  0.621001  0.598488  0.548673  0.687927  0.668160   
489  0.802010  0.781172  0.795191  0.758079  0.815782  0.786920  0.807681   
490  0.783420  0.888580  0.863532  0.832415  0.766614  0.647149  0.638198   
491  0.728397  0.738428  0.668322  0.654986  0.666000  0.716688  0.719008   
492  0.669107  0.763582  0.801618  0.752104  0.621262  0.664196  0.622523   

            7         8         9  ...      

In [9]:
index = 0
index_recomm = cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:21]
stocks_recomm = stocks['symbol'].loc[[index_recomm]].values
print(stocks['symbol'].loc[index_recomm].values)

KeyError: "None of [Index([('246', '417', '201', '220', '109', '383', '54', '144', '469', '230', '143', '66', '396', '152', '482', '149', '191', '159', '394', '83')], dtype='object')] are in the [index]"

In [None]:
index_recomm = cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:21]
print(give_recommendations(0, cos_sim_data, stocks))