In [1]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt

In [2]:
#Open the SQLite file holding the S&P 500 statistics and create the cursor used by the queries below
DB_FILE = "SP500_stats_5y.db"
conn = sqlite3.connect(DB_FILE)
c = conn.cursor()

In [3]:
#Read the candidate pairs; the first line of the file is skipped (presumably a header row)
#The context manager closes the file even if reading fails
with open('LikelyPairStocks.txt','r') as f:
    lines = f.readlines()[1:]

#Split every line once and drop blank lines (e.g. a trailing newline) instead of failing on them
rows = [fields for fields in (line.split() for line in lines) if fields]

#Whitespace-separated columns, in order: Symbol1, Symbol2, AvgDiff, S1Scale, S2Scale
Symbol1s = [ fields[0] for fields in rows]
Symbol2s = [ fields[1] for fields in rows]
AvgDiffs = [ float(fields[2]) for fields in rows]
S1Scales = [ float(fields[3]) for fields in rows]
S2Scales = [ float(fields[4]) for fields in rows]


In [4]:
#Load the S&P 500 index rows (table GSPC_stats) for comparison, using the same
#date_int > 1460 filter that is applied to the individual stocks
c.execute("SELECT * FROM GSPC_stats WHERE date_int > 1460")
sp500_columns = [col[0] for col in c.description]
SP500_df = pd.DataFrame(c.fetchall(), columns = sp500_columns)

In [5]:
def DailyStockPlot(Symbol, min_date_int=1460, cursor=None):
    """Fetch the stored daily statistics of one stock as a DataFrame.

    Despite its name this function does not plot anything; it only queries
    the table "<Symbol>_stats" (with '.' in the symbol replaced by '_').

    Parameters
    ----------
    Symbol : str
        Ticker symbol, e.g. 'AAL' or 'BRK.B'.
    min_date_int : int, optional
        Only rows with date_int strictly greater than this value are returned.
        Defaults to 1460, the threshold previously hard-coded here.
    cursor : sqlite3.Cursor, optional
        Cursor to run the query on. Defaults to the notebook-level cursor `c`.

    Returns
    -------
    pandas.DataFrame
        One row per matching table row, with the table's column names.
        NOTE(review): the query has no ORDER BY, so rows come back in whatever
        order SQLite yields them -- confirm this is chronological.

    Raises
    ------
    sqlite3.OperationalError
        If the table does not exist.
    """
    if cursor is None:
        cursor = c

    #Table names cannot be bound as parameters, so quote the identifier
    #(doubling any embedded quote) to keep a malformed symbol from altering the SQL
    table_name = Symbol.replace('.','_')+"_stats"
    quoted_table = '"'+table_name.replace('"','""')+'"'

    cursor.execute("SELECT * FROM "+quoted_table+" WHERE date_int > ?", (min_date_int,))
    stats_df = pd.DataFrame(cursor.fetchall(), columns = [x[0] for x in cursor.description])

    return stats_df

def _close_leg(leg, price):
    """Close the position held in `leg` at `price`; return the cash credited to the account."""
    if leg['position'] == 'Long':
        #Selling the shares returns their value at the given price
        proceeds = price*leg['shares']
    elif leg['position'] == 'Short':
        #shares is negative for a short: the reserved funds come back together with
        #the original sale proceeds, less the cost of buying the shares back
        proceeds = 2*leg['short_funds']+leg['shares']*price
    else:
        return 0.0
    leg['shares'] = 0.0
    leg['short_funds'] = 0.0
    leg['position'] = 'None'
    return proceeds


def _adjust_leg(leg, desired, stake, price):
    """Move `leg` one step toward `desired` ('Long' or 'Short'); return the change in free cash.

    A flat leg opens the desired position worth `stake`; a leg holding the
    opposite position is only closed (it re-opens on a later day); a leg that
    already holds the desired position is left untouched.
    """
    if leg['position'] == 'None':
        if desired == 'Short':
            leg['shares'] = -stake/price
            leg['short_funds'] = stake
        else:
            leg['shares'] = stake/price
        leg['position'] = desired
        return -stake
    if leg['position'] != desired:
        return _close_leg(leg, price)
    return 0.0


def PairTradeCalc(Symbol1,Symbol2,AvgDiff,S1Scale,S2Scale,Nostradamus=False):
    """Back-test a simple pair-trading rule on two stocks.

    Each day i the normalized spread Close1[i]/S1Scale - Close2[i]/S2Scale is
    compared with AvgDiff. Above it, the strategy wants to be short Symbol1 and
    long Symbol2; below it, long Symbol1 and short Symbol2. Trades are executed
    at the next day's open. A flat leg is opened with half of the funds that
    were free at the start of the day; a leg on the wrong side is closed and is
    re-opened on a later day. All positions are closed at the final open.
    Shares are fractional and no transaction costs are modelled.

    Parameters
    ----------
    Symbol1, Symbol2 : str
        Ticker symbols, passed to DailyStockPlot to load each price history.
    AvgDiff : float
        Threshold for the normalized spread.
    S1Scale, S2Scale : float
        Divisors applied to the closes of Symbol1 / Symbol2.
    Nostradamus : bool, optional
        If true, S1Scale, S2Scale and AvgDiff are recomputed from the very data
        being traded. This uses "future" information unavailable for real-time
        decision making and is only meant as a best-case reference.

    Returns
    -------
    tuple of four floats, each a percentage of the initial funds:
        (strategy final value,
         buy-and-hold of Symbol1 from first to last open,
         buy-and-hold of Symbol2 from first to last open,
         buy-and-hold of the S&P 500 taken from the notebook-level SP500_df)

    Notes
    -----
    NOTE(review): the two histories are matched by row position, not by date,
    and the final close uses the length of the second frame for both symbols --
    this presumably expects both tables to cover identical trading days; confirm.
    """
    stats1_df = DailyStockPlot(Symbol1)
    stats2_df = DailyStockPlot(Symbol2)

    close1, open1 = stats1_df['Close'], stats1_df['Open']
    close2, open2 = stats2_df['Close'], stats2_df['Open']

    #Look-ahead variant: derive the scales and threshold from the evaluation data itself
    if Nostradamus:
        S1Scale = np.mean(close1)
        S2Scale = np.mean(close2)
        AvgDiff = np.mean(close1/S1Scale - close2/S2Scale)

    InitFunds = 100000.0
    AvailableFunds = InitFunds

    #Per-symbol state: position is 'None', 'Long' or 'Short'; shares is negative while short;
    #short_funds is the cash set aside when a short was opened
    leg1 = {'position': 'None', 'shares': 0.0, 'short_funds': 0.0}
    leg2 = {'position': 'None', 'shares': 0.0, 'short_funds': 0.0}

    for i in range(len(stats1_df)-1):
        spread = close1[i]/S1Scale - close2[i]/S2Scale
        if spread > AvgDiff:
            #Symbol1 is expected to drop relative to Symbol2: short S1 / long S2
            want1, want2 = 'Short', 'Long'
        elif spread < AvgDiff:
            #Symbol1 is expected to rise relative to Symbol2: long S1 / short S2
            want1, want2 = 'Long', 'Short'
        else:
            continue

        #Both legs are sized from the funds free at the start of the day
        stake = 0.5*AvailableFunds

        for leg, want, opens in ((leg1, want1, open1), (leg2, want2, open2)):
            #Only look up the next open when this leg actually trades
            if leg['position'] != want:
                AvailableFunds += _adjust_leg(leg, want, stake, opens[i+1])

    #Close out all positions at the final open (long positions first, then shorts)
    i = len(stats2_df)-1
    for held in ('Long', 'Short'):
        for leg, opens in ((leg1, open1), (leg2, open2)):
            if leg['position'] == held:
                AvailableFunds += _close_leg(leg, opens[i])

    #Buy-and-hold references, each valued from the first to the last open
    BaselineFund  =  InitFunds*SP500_df['Open'][len(SP500_df)-1]/SP500_df['Open'][0]
    BaselineFund1 =  0.5*InitFunds*open1[len(stats1_df)-1]/open1[0]
    BaselineFund2 =  0.5*InitFunds*open2[len(stats2_df)-1]/open2[0]

    return(100.0*AvailableFunds/InitFunds, 100.0*2*BaselineFund1/InitFunds, 100.0*2*BaselineFund2/InitFunds, 100.0*(BaselineFund)/InitFunds)

In [6]:
#Upper limit on how many candidate pairs are back-tested
MaxNumPairs = 100

#Running sums of the per-pair results (each a percentage of the starting funds)
PairTot, SP500Tot = 0.0, 0.0

#Number of pairs actually evaluated
NumPairs = len(Symbol1s[0:MaxNumPairs])

RowTemplate = '{}\t{}\t{}%\t\t{}%\t\t{}%\t\t\t{}%'

print('Symbol1\tSymbol2\tPair Trading\tS&P 500\t\tIndividual Ratio\tCumulative Portfolio Ratio')
for sym1, sym2, avg_diff, scale1, scale2 in zip(Symbol1s[0:MaxNumPairs], Symbol2s, AvgDiffs, S1Scales, S2Scales):
    PairTrade,Symbol1Trade,Symbol2Trade,SP500Trade = PairTradeCalc(sym1,sym2,avg_diff,scale1,scale2)
    PairTot  += PairTrade
    SP500Tot += SP500Trade
    #One row per pair: its own result, the index result, their ratio, and the running portfolio ratio
    print(RowTemplate.format(sym1,
                             sym2,
                             round(PairTrade,1),
                             round(SP500Trade,1),
                             round(100.0*PairTrade/SP500Trade,1),
                             round(100.0*PairTot/SP500Tot,2)))

#Summary: equal-weighted averages over all evaluated pairs
print()
print("Pair trading portfolio value after one year: "+str(round(PairTot/NumPairs,2))+"%")
print("Had we just invested in the S&P 500 writ large: "+str(round(SP500Tot/NumPairs,2))+"%")
print()
print("Thus, while we gained "+str(round(PairTot/NumPairs-100.0,2))+"%, we underperformed the market by "+str(round(-(PairTot-SP500Tot)/NumPairs,2))+"%")


Symbol1	Symbol2	Pair Trading	S&P 500		Individual Ratio	Cumulative Portfolio Ratio
AAL	APA	101.3%		117.8%		86.0%			86.01%
UAA	VTR	145.2%		117.8%		123.3%			104.65%
PXD	WYNN	62.8%		117.8%		53.3%			87.53%
HES	ULTA	126.1%		117.8%		107.1%			92.41%
BKR	WYNN	89.3%		117.8%		75.8%			89.09%
CCL	FANG	45.0%		117.8%		38.2%			80.61%
FLT	PEAK	96.5%		117.8%		81.9%			80.79%
VLO	WYNN	118.2%		117.8%		100.3%			83.23%
CVX	UAA	111.9%		117.8%		95.0%			84.54%
AAL	DVN	22.4%		117.8%		19.0%			77.99%
LUMN	WDC	168.5%		117.8%		143.1%			83.9%
AIG	WYNN	86.3%		117.8%		73.2%			83.01%
K	MU	102.4%		117.8%		87.0%			83.32%
CVX	UA	138.8%		117.8%		117.8%			85.78%
GWW	TWTR	101.7%		117.8%		86.3%			85.82%
APA	SLB	123.3%		117.8%		104.7%			86.99%
CBOE	WDC	114.8%		117.8%		97.5%			87.61%
LKQ	WY	102.4%		117.8%		86.9%			87.57%
LRCX	POOL	125.3%		117.8%		106.4%			88.56%
BKR	NLSN	82.2%		117.8%		69.7%			87.62%
MAS	FB	115.8%		117.8%		98.3%			88.12%
NLSN	PXD	86.8%		117.8%		73.7%			87.47%
BIIB	KR	67.6%		117.8%		57.4%			86.16%
DAL	UAA	105.7%	

In [7]:
#Now let's see how the ML-identified pairs perform

#The first line of the file is skipped (presumably a header row)
#The context manager closes the file even if reading fails
with open('LikelyPairStocks_ML.txt','r') as f:
    lines = f.readlines()[1:]

#Split every line once and drop blank lines (e.g. a trailing newline) instead of failing on them
rows = [fields for fields in (line.split() for line in lines) if fields]

#Whitespace-separated columns, in order: Symbol1, Symbol2, AvgDiff, S1Scale, S2Scale
#These replace the lists loaded earlier from LikelyPairStocks.txt
Symbol1s = [ fields[0] for fields in rows]
Symbol2s = [ fields[1] for fields in rows]
AvgDiffs = [ float(fields[2]) for fields in rows]
S1Scales = [ float(fields[3]) for fields in rows]
S2Scales = [ float(fields[4]) for fields in rows]

In [8]:
#Running sums of the per-pair results for the ML-identified pairs (percentages of the starting funds)
PairTotML, SP500Tot = 0.0, 0.0

#Number of pairs actually evaluated
NumPairs = len(Symbol1s[0:MaxNumPairs])

RowTemplate = '{}\t{}\t{}%\t\t{}%\t\t{}%\t\t\t{}%'

print('Symbol1\tSymbol2\tPair Trading\tS&P 500\t\tIndividual Ratio\tCumulative Portfolio Ratio')
for sym1, sym2, avg_diff, scale1, scale2 in zip(Symbol1s[0:MaxNumPairs], Symbol2s, AvgDiffs, S1Scales, S2Scales):
    PairTrade,Symbol1Trade,Symbol2Trade,SP500Trade = PairTradeCalc(sym1,sym2,avg_diff,scale1,scale2)
    PairTotML  += PairTrade
    SP500Tot += SP500Trade
    #One row per pair: its own result, the index result, their ratio, and the running portfolio ratio
    print(RowTemplate.format(sym1,
                             sym2,
                             round(PairTrade,1),
                             round(SP500Trade,1),
                             round(100.0*PairTrade/SP500Trade,1),
                             round(100.0*PairTotML/SP500Tot,2)))

#Summary: equal-weighted averages over all evaluated pairs
print()
print("Pair trading portfolio value after one year: "+str(round(PairTotML/NumPairs,2))+"%")
print("Had we just invested in the S&P 500 writ large: "+str(round(SP500Tot/NumPairs,2))+"%")
print()
print("Thus, while we gained "+str(round(PairTotML/NumPairs-100.0,2))+"%, we underperformed the market by "+str(round(-(PairTotML-SP500Tot)/NumPairs,2))+"%")


Symbol1	Symbol2	Pair Trading	S&P 500		Individual Ratio	Cumulative Portfolio Ratio
BKR	WYNN	89.3%		117.8%		75.8%			75.79%
DLTR	RHI	66.4%		117.8%		56.3%			66.06%
BKNG	DLTR	118.3%		117.8%		100.4%			77.51%
CHRW	NVR	86.0%		117.8%		73.0%			76.38%
DLTR	WAB	142.1%		117.8%		120.6%			85.22%
DLTR	NTRS	116.5%		117.8%		98.9%			87.49%
FB	ROK	107.8%		117.8%		91.5%			88.07%
ANET	ROST	77.2%		117.8%		65.5%			85.25%
LUMN	WDC	168.5%		117.8%		143.1%			91.67%
HOLX	URI	85.7%		117.8%		72.7%			89.78%
PSA	WDC	88.5%		117.8%		75.1%			88.44%
DLTR	UHS	102.8%		117.8%		87.3%			88.34%
GWW	TWTR	101.7%		117.8%		86.3%			88.19%
TTWO	URI	81.1%		117.8%		68.8%			86.81%
HPE	HST	99.4%		117.8%		84.4%			86.64%
DLTR	IPG	89.4%		117.8%		75.9%			85.97%
LKQ	WY	102.4%		117.8%		86.9%			86.02%
CAT	TWTR	119.7%		117.8%		101.6%			86.89%
LRCX	POOL	125.3%		117.8%		106.4%			87.92%
BKR	NLSN	82.2%		117.8%		69.7%			87.01%
BIIB	K	127.8%		117.8%		108.5%			88.03%
BIIB	CNC	95.7%		117.8%		81.2%			87.72%
MAS	FB	115.8%		117.8%		98.3%			88.18%
NLSN	PXD	

In [9]:
#Gap between the ML-selected and the conventionally selected pairs, per evaluated pair
#NOTE(review): both totals are divided by the number of ML pairs; this equals the difference
#of the two averages only if both pair lists contributed the same number of pairs -- confirm
#This cell has to run before PairTotML is reset for the look-ahead experiment
MLAdvantage = (PairTotML-PairTot)/len(Symbol1s[0:MaxNumPairs])
print('Notably, while neither set of stocks outperform the market, the pairs identified using machine learning outperform those identified using more "conventional" means by '+str(round(MLAdvantage,2))+'%')


Notably, while neither set of stocks outperform the market, the pairs identified using machine learning outperform those identified using more "conventional" means by 3.29%


In [10]:
#Let's now look at how the portfolio does if we know the future AvgDiff values
#(Nostradamus=True makes PairTradeCalc derive the scales and threshold from the traded data itself)
#Note that PairTotML and SP500Tot are restarted from zero here
PairTotML, SP500Tot = 0.0, 0.0

#Number of pairs actually evaluated
NumPairs = len(Symbol1s[0:MaxNumPairs])

RowTemplate = '{}\t{}\t{}%\t\t{}%\t\t{}%\t\t\t{}%'

print('Symbol1\tSymbol2\tPair Trading\tS&P 500\t\tIndividual Ratio\tCumulative Portfolio Ratio')
for sym1, sym2, avg_diff, scale1, scale2 in zip(Symbol1s[0:MaxNumPairs], Symbol2s, AvgDiffs, S1Scales, S2Scales):
    PairTrade,Symbol1Trade,Symbol2Trade,SP500Trade = PairTradeCalc(sym1,sym2,avg_diff,scale1,scale2,Nostradamus=True)
    PairTotML  += PairTrade
    SP500Tot += SP500Trade
    #One row per pair: its own result, the index result, their ratio, and the running portfolio ratio
    print(RowTemplate.format(sym1,
                             sym2,
                             round(PairTrade,1),
                             round(SP500Trade,1),
                             round(100.0*PairTrade/SP500Trade,1),
                             round(100.0*PairTotML/SP500Tot,2)))

#Summary: equal-weighted averages over all evaluated pairs
print()
print("Assuming we know the average price differential beforehand...")
print()
print("Pair trading portfolio value after one year: "+str(round(PairTotML/NumPairs,2))+"%")
print("Had we just invested in the S&P 500 writ large: "+str(round(SP500Tot/NumPairs,2))+"%")
print()
print("Thus, not only did we gain "+str(round(PairTotML/NumPairs-100.0,2))+"%, we also outperformed the market by "+str(round((PairTotML-SP500Tot)/NumPairs,2))+"%")



Symbol1	Symbol2	Pair Trading	S&P 500		Individual Ratio	Cumulative Portfolio Ratio
BKR	WYNN	96.8%		117.8%		82.2%			82.2%
DLTR	RHI	148.4%		117.8%		125.9%			104.06%
BKNG	DLTR	142.3%		117.8%		120.8%			109.63%
CHRW	NVR	154.9%		117.8%		131.5%			115.09%
DLTR	WAB	108.4%		117.8%		92.0%			110.46%
DLTR	NTRS	118.7%		117.8%		100.7%			108.84%
FB	ROK	124.1%		117.8%		105.3%			108.34%
ANET	ROST	83.1%		117.8%		70.6%			103.62%
LUMN	WDC	167.1%		117.8%		141.8%			107.86%
HOLX	URI	190.4%		117.8%		161.6%			113.24%
PSA	WDC	103.9%		117.8%		88.2%			110.96%
DLTR	UHS	108.2%		117.8%		91.8%			109.36%
GWW	TWTR	93.7%		117.8%		79.5%			107.07%
TTWO	URI	180.6%		117.8%		153.3%			110.37%
HPE	HST	140.5%		117.8%		119.3%			110.96%
DLTR	IPG	121.0%		117.8%		102.7%			110.44%
LKQ	WY	116.4%		117.8%		98.8%			109.76%
CAT	TWTR	105.7%		117.8%		89.7%			108.64%
LRCX	POOL	127.9%		117.8%		108.5%			108.64%
BKR	NLSN	88.3%		117.8%		75.0%			106.95%
BIIB	K	133.4%		117.8%		113.2%			107.25%
BIIB	CNC	127.0%		117.8%		107.8%			107.28%
MAS	FB	103.9%