In [269]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() #this is used for plot styling
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

# Begin by reading the original data
df1 = pd.read_csv("2018_Financial_Data.csv")

# Create the financial data frame we need with extra features taken directly from the real data (df1)
financial_data = pd.DataFrame(df1, columns = ['Symbol', 'Revenue', 'Revenue Growth', 'Cost of Revenue', 'Gross Profit', 
                     'Operating Expenses', 'Operating Income', 'Earnings before Tax', 'Net Income', 'Net Debt']) 


# Calculate earnings per share ratio
financial_data['EarningsPerShare'] = (df1['Net Income'] - df1['Preferred Dividends']) / df1['Weighted Average Shs Out']

# Get the P/E ratio directly from data
financial_data['PriceEarningsRatio'] = df1['PE ratio']

# Calculate debt to equity ratio
financial_data['DebtEquityRatio'] = (df1['Short-term debt'] + df1['Long-term debt']) / df1['Total shareholders equity']

# Get the return on equity ratio
financial_data['ReturnOnEquity'] = df1['returnOnEquity']

# Get the quick ratio
financial_data['QuickRatio'] = df1['quickRatio']

# Calculate the working capital ratio
financial_data['WorkingCapitalRatio'] = df1['Total current assets'] / df1['Total current liabilities']



# Get rid of empty values from the date given
nan_value = float("NaN")
financial_data.replace("", nan_value, inplace=True) #change empty data to nan
financial_data.replace(0, nan_value, inplace=True) #change zero data to nan
financial_data.dropna(axis=0, inplace = True) #get rid of nans

# Save the original data frame as target data frame
target_data = financial_data

# Drop the symbols for KMeans
financial_data = financial_data.drop('Symbol', 1)
#display(target_data)

In [259]:
# KMeans with 3 groups
kmeans = KMeans(n_clusters = 3,  random_state=0)
kmeans.fit(financial_data)
y_k3means = kmeans.predict(financial_data)
print(kmeans.labels_)

# Now find what stock is performing in the best group
portfolio_k3 = pd.DataFrame(target_data.loc[y_k3means > 1, 'Symbol'])
display(portfolio_k3)
portfolio_k3.to_csv("2018_Portoflio_3K.csv", index = None) #create csv file wih stock tickers (companies)

[1 0 1 ... 1 1 1]


Unnamed: 0,Symbol
6,AAPL
36,XOM
90,WMT
169,UNH
172,AMZN
427,BP
905,MCK
1482,SNP
2059,TM
2204,PTR


In [260]:
# KMeans with 4 groups
kmeans = KMeans(n_clusters = 4,  random_state=None)
kmeans.fit(financial_data)
y_k4means = kmeans.predict(financial_data)

# Now find what stock is performing in the best group
portfolio_k4 = pd.DataFrame(target_data.loc[y_k4means > 2, 'Symbol'])
display(portfolio_k4)
portfolio_k4.to_csv("2018_Portoflio_4K.csv", index = None) #create csv file wih stock tickers (companies)

Unnamed: 0,Symbol
6,AAPL
36,XOM
90,WMT
427,BP
1482,SNP
2059,TM
2204,PTR


In [261]:
portfolio_k3 = portfolio_k3.to_numpy()
portfolio_k4 = portfolio_k4.to_numpy()
np.intersect1d(portfolio_k3, portfolio_k4)

array(['AAPL', 'BP', 'PTR', 'SNP', 'TM', 'WMT', 'XOM'], dtype=object)

In [270]:
for i in range(50):
    kmeans = KMeans(n_clusters = 3,  random_state=0)
    kmeans.fit(financial_data)
    y_k3means = kmeans.predict(financial_data)
    portfolio_k3 = pd.DataFrame(target_data.loc[y_k3means > 1, 'Symbol'])
    
    kmeans = KMeans(n_clusters = 4,  random_state=None)
    kmeans.fit(financial_data)
    y_k4means = kmeans.predict(financial_data)
    portfolio_k4 = pd.DataFrame(target_data.loc[y_k4means > 2, 'Symbol'])
    
    portfolio_k3 = portfolio_k3.to_numpy()
    portfolio_k4 = portfolio_k4.to_numpy()
    print(np.intersect1d(portfolio_k3, portfolio_k4))

['AAPL' 'BP' 'PTR' 'SNP' 'TM' 'WMT' 'XOM']
[]
['AAPL' 'BP' 'PTR' 'SNP' 'TM' 'WMT' 'XOM']
[]
[]
['AAPL' 'BP' 'PTR' 'SNP' 'TM' 'WMT' 'XOM']
[]
[]
['AMZN' 'MCK' 'UNH']
['AMZN' 'MCK' 'UNH']
[]
['AMZN' 'MCK' 'UNH']
['AMZN' 'MCK' 'UNH']
[]
['AMZN' 'MCK' 'UNH']
[]
['AMZN' 'MCK' 'UNH']
[]
['AMZN' 'MCK' 'UNH']
['AMZN' 'MCK' 'UNH']
[]
['AMZN' 'MCK' 'UNH']
['AAPL' 'BP' 'PTR' 'SNP' 'TM' 'WMT' 'XOM']
['AMZN' 'MCK' 'UNH']
['AAPL' 'BP' 'PTR' 'SNP' 'TM' 'WMT' 'XOM']
[]
['AMZN' 'MCK' 'UNH']
[]
[]
[]
[]
['AMZN' 'MCK' 'UNH']
['AMZN' 'MCK' 'UNH']
[]
[]
[]
[]
[]
[]
['AMZN' 'MCK' 'UNH']
[]
['MCK']
[]
['AMZN' 'MCK' 'UNH']
['AAPL' 'BP' 'PTR' 'SNP' 'TM' 'WMT' 'XOM']
['AAPL' 'BP' 'PTR' 'SNP' 'TM' 'WMT' 'XOM']
[]
[]
['AMZN' 'MCK' 'UNH']
[]
