# Find two stocks with high correlation

From [Find Highly Correlated Stocks with Python!](https://towardsdatascience.com/find-highly-correlated-stocks-with-python-77eba4fd061b)

In [11]:
# Uncomment the following lines to install the required packages
# !pip install yfinance
# !pip install pandas_datareader
# !pip install yahoo-fin

In [12]:
import numpy as np
import pandas as pd

import warnings
from pandas_datareader import data as pdr

# to override deprecations in pandas-datareader
import yfinance as yf

# To set up the start/end dates for the prices
import datetime as dt

# to quickly get access to a list of the tickers in different indices
from yahoo_fin import stock_info as si

# Settings
# Lift the row limit so printed tables are shown in full instead of being truncated.
pd.set_option('display.max_rows', None)
# Suppress all Python warnings raised after this point.
warnings.filterwarnings("ignore")
# NOTE(review): presumably makes pandas_datareader's get_data_yahoo (used below)
# go through yfinance — confirm the installed yfinance version still provides this hook.
yf.pdr_override()

In [13]:
# Set up the start/end dates for the prices
num_of_years = 1
# Read the current date once so both ends of the window derive from the same day
# (two separate today() calls could disagree if the cell runs across midnight).
end = dt.date.today()
start = end - dt.timedelta(days=int(365.25 * num_of_years))

# Get the list of tickers in the Dow Jones
tickers = si.tickers_dow()

Since we are focusing on the correlation between the movements of stocks, we can concentrate on the Adjusted Close column and then create a new Pandas DataFrame with the daily log returns, i.e. the natural logarithm of each day's price divided by the previous day's price. With the Pandas method `.corr()`, we can create a correlation matrix of the new dataframe.

In [14]:
# Download the adjusted closing prices of the tickers over the chosen window
dataset = pdr.get_data_yahoo(tickers, start, end)['Adj Close']
# Daily log returns: natural log of each price relative to the previous row
stocks_returns = np.log(dataset.div(dataset.shift(1)))

print('\nCorrelation Matrix')
# Pairwise correlation between the tickers' return series
corr_matrix = stocks_returns.corr()
print(corr_matrix)

[*********************100%***********************]  30 of 30 completed

Correlation Matrix
          AAPL      AMGN       AXP        BA       CAT       CRM      CSCO   
AAPL  1.000000  0.293887  0.672441  0.511271  0.464976  0.636557  0.631905  \
AMGN  0.293887  1.000000  0.256105  0.183186  0.274294  0.124244  0.286965   
AXP   0.672441  0.256105  1.000000  0.528860  0.603351  0.544533  0.537049   
BA    0.511271  0.183186  0.528860  1.000000  0.513951  0.416603  0.391762   
CAT   0.464976  0.274294  0.603351  0.513951  1.000000  0.392100  0.424282   
CRM   0.636557  0.124244  0.544533  0.416603  0.392100  1.000000  0.461028   
CSCO  0.631905  0.286965  0.537049  0.391762  0.424282  0.461028  1.000000   
CVX   0.405413  0.129876  0.375785  0.348660  0.634403  0.361904  0.325226   
DIS   0.645363  0.220448  0.634683  0.500348  0.494488  0.585753  0.464608   
DOW   0.505752  0.256414  0.612777  0.483485  0.704978  0.414713  0.440797   
GS    0.578323  0.247603  0.669191  0.531251  0.630

The correlation matrix includes redundant pairs, such as a stock paired with itself (AAPL to AAPL) or the same pair showing up twice (AAPL to MSFT and MSFT to AAPL). We can drop these and sort the remaining pairs to get the top absolute correlations with the functions defined below.

In [15]:
def get_redundant_pairs(df):
    """Collect the column pairs of a correlation matrix that add no information.

    For every column, the pair of that column with itself and with each column
    positioned before it is included, i.e. the diagonal and the lower triangle
    of the column-by-column grid.

    Args:
        df (pd.DataFrame): table whose columns are the series being compared

    Returns:
        set: ``(column, same_or_earlier_column)`` label tuples
    """
    labels = df.columns
    return {
        (labels[row], labels[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def get_top_abs_correlations(df):
    """Rank every distinct pair of columns by absolute correlation.

    Args:
        df (pd.DataFrame): table whose columns are the series to correlate

    Returns:
        pd.Series: absolute correlations indexed by (column, column) pairs,
            largest first, with self-pairs and mirrored duplicates removed
    """
    # Flatten the absolute correlation matrix into one entry per ordered pair.
    pairwise = df.corr().abs().unstack()
    # Discard the diagonal and the lower triangle so each pair appears once.
    unique_pairs = pairwise.drop(labels=get_redundant_pairs(df))
    return unique_pairs.sort_values(ascending=False)

# Print every remaining pair, strongest absolute correlation first.
print("\nTop Absolute Correlations")
print(get_top_abs_correlations(stocks_returns))


Top Absolute Correlations
AAPL  MSFT    0.789751
GS    JPM     0.757971
KO    PG      0.739009
AXP   V       0.717757
AAPL  V       0.707100
CAT   DOW     0.704978
KO    MCD     0.698351
HON   MMM     0.690901
AAPL  INTC    0.683594
CAT   HON     0.683140
AAPL  NKE     0.679650
CRM   MSFT    0.673953
AAPL  AXP     0.672441
AXP   GS      0.669191
INTC  MSFT    0.667468
DOW   GS      0.664303
GS    HON     0.648406
AXP   JPM     0.647334
HD    MSFT    0.646946
AAPL  DIS     0.645363
HD    NKE     0.643605
NKE   V       0.637521
AAPL  CRM     0.636557
AXP   DIS     0.634683
CAT   CVX     0.634403
AAPL  CSCO    0.631905
CAT   GS      0.630276
AAPL  HD      0.628148
HD    HON     0.627266
MSFT  V       0.626460
JNJ   MRK     0.625669
MSFT  NKE     0.624210
MCD   PG      0.624018
DIS   GS      0.623130
      MSFT    0.623014
DOW   HON     0.620700
DIS   V       0.620694
CAT   JPM     0.618780
DOW   JPM     0.616674
HON   JPM     0.615769
GS    V       0.615479
AXP   DOW     0.612777
DOW   M

To check these results, we can use [TradingView](https://www.tradingview.com/) to compare the stocks. In our case, Apple and Microsoft have a high correlation, maybe because they operate in the same type of market, i.e. technology.