In [6]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() #this is used for plot styling
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

# Begin by reading the original data
df1 = pd.read_csv("2018_Financial_Data.csv")

# Columns copied unchanged from the raw data (df1) into the working frame.
raw_columns = ['Symbol', 'Revenue', 'Revenue Growth', 'Cost of Revenue', 'Gross Profit',
               'Operating Expenses', 'Operating Income', 'Earnings before Tax', 'Net Income', 'Net Debt']

# Explicit selection fails loudly (KeyError) if the CSV lacks one of these columns,
# instead of silently creating an all-NaN column that would later drop every row.
# .copy() keeps the new columns below from being written into a view of df1.
financial_data = df1[raw_columns].copy()

# Derived and pass-through ratio features (rows align with df1 through the shared index).
financial_data = financial_data.assign(
    # Earnings per share: income available to common shareholders per weighted share
    EarningsPerShare=(df1['Net Income'] - df1['Preferred Dividends']) / df1['Weighted Average Shs Out'],
    # P/E ratio taken directly from the data
    PriceEarningsRatio=df1['PE ratio'],
    # Debt to equity: total (short-term + long-term) debt over shareholders' equity
    DebtEquityRatio=(df1['Short-term debt'] + df1['Long-term debt']) / df1['Total shareholders equity'],
    # Return on equity taken directly from the data
    ReturnOnEquity=df1['returnOnEquity'],
    # Quick ratio taken directly from the data
    QuickRatio=df1['quickRatio'],
    # Working capital ratio: current assets over current liabilities
    WorkingCapitalRatio=df1['Total current assets'] / df1['Total current liabilities'],
)

# Get rid of empty values from the data given.
# Empty strings, zeros and infinities are all converted to NaN, then every row that
# contains a NaN in any column is dropped. Infinities arise when one of the ratio
# denominators above is zero; dropna() alone would keep them, and scikit-learn
# estimators reject non-finite input.
# NOTE(review): treating every 0 as "missing" also discards rows whose true value is
# zero (e.g. a zero 'Net Debt') - confirm this is intended.
nan_value = float("NaN")
financial_data = (
    financial_data
    .replace("", nan_value)                 # change empty data to nan
    .replace(0, nan_value)                  # change zero data to nan
    .replace([np.inf, -np.inf], nan_value)  # change division-by-zero results to nan
    .dropna(axis=0)                         # get rid of nans
)

# Save the cleaned frame (still including 'Symbol') as the target data frame.
# An explicit copy keeps it independent of the feature frame built next.
target_data = financial_data.copy()

# Drop the symbols for KMeans: the clustering input must be purely numeric.
# NOTE(review): the features are on very different scales (currency amounts vs. ratios)
# and are not standardised here - confirm whether scaling is wanted before clustering.
financial_data = financial_data.drop(columns='Symbol')

#display(target_data)

In [15]:
# In this section we run the K-Means algorithm with 3 different cluster counts: 4, 5 and 6.
# From every fit we keep the companies assigned to the highest-numbered cluster
# (label 3 with 4 clusters, label 4 with 5 clusters, label 5 with 6 clusters) and record
# which symbols are shared between each pair of fits. Symbols that keep showing up in
# these intersections are the candidates for the portfolio.
#
# NOTE(review): K-Means label numbers are arbitrary cluster indices, not a ranking, so
# "highest label" does not by itself identify the best-performing group. Confirm the
# selection rule (e.g. rank clusters by a centroid metric) before relying on the result.

# K-Means starts from random centroids, so a single run is not stable; the clustering
# is repeated this many times and the per-run results are accumulated.
N_RUNS = 100


def _top_label_symbols(features, labelled, n_clusters, random_state):
    """Fit K-Means and return the symbols assigned to the highest-numbered cluster.

    Parameters
    ----------
    features : numeric DataFrame passed to KMeans.
    labelled : DataFrame with the same row order as `features` plus a 'Symbol' column.
    n_clusters : number of clusters to fit.
    random_state : forwarded to KMeans (None = different initialisation on every call).

    Returns
    -------
    (model, labels, symbols) : the fitted estimator, the per-row cluster labels, and an
        (n, 1) array holding the symbols whose label is `n_clusters - 1`.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = model.fit(features).predict(features)
    # The boolean NumPy mask is applied positionally, so it relies on `labelled`
    # having exactly the same row order as `features`.
    symbols = labelled.loc[labels == n_clusters - 1, ['Symbol']].to_numpy()
    return model, labels, symbols


# The 4-cluster fit uses random_state=0, so it gives the same result on every repetition;
# it is therefore computed once instead of inside the loop.
# NOTE(review): the 5- and 6-cluster fits are unseeded (random_state=None), so the overall
# output is not reproducible between runs - confirm whether that asymmetry is intended.
kmeans, y_k3means, portfolio_k3 = _top_label_symbols(financial_data, target_data, 4, 0)

# Per-run intersections, collected in lists and combined once after the loop
# (growing a DataFrame row block by row block is quadratic, and DataFrame.append
# no longer exists in pandas 2.x).
shared_k3_k4 = []
shared_k4_k5 = []
shared_k3_k5 = []

for _ in range(N_RUNS):
    kmeans, y_k4means, portfolio_k4 = _top_label_symbols(financial_data, target_data, 5, None)
    kmeans, y_k5means, portfolio_k5 = _top_label_symbols(financial_data, target_data, 6, None)

    # intersect1d flattens its inputs and returns the sorted unique common symbols
    shared_k3_k4.append(np.intersect1d(portfolio_k3, portfolio_k4))
    shared_k4_k5.append(np.intersect1d(portfolio_k4, portfolio_k5))
    shared_k3_k5.append(np.intersect1d(portfolio_k3, portfolio_k5))

# One single-column frame (column label 0, fresh RangeIndex) per pair of cluster counts;
# a symbol appears once for every run in which it was in the intersection.
temp_df1 = pd.DataFrame(np.concatenate(shared_k3_k4))
temp_df2 = pd.DataFrame(np.concatenate(shared_k4_k5))
temp_df3 = pd.DataFrame(np.concatenate(shared_k3_k5))

In [16]:
# Place the three per-pair result frames side by side.
# reset_index() turns each frame's row index into an explicit 'index' column; it is
# applied to temporaries rather than written back to temp_df1/2/3, so re-running this
# cell gives the same result instead of stacking up extra 'level_0' columns.
port_df = [frame.reset_index() for frame in (temp_df1, temp_df2, temp_df3)]

# Column-wise concat aligns rows on the index; shorter frames are padded with NaN.
df_final = pd.concat(port_df, axis=1)

In [17]:
# Remove every helper 'index' column so only the symbol columns remain.
# `columns=` replaces the positional axis argument (removed in pandas 2.0), and
# errors='ignore' lets the cell be re-run after the columns are already gone.
df_final = df_final.drop(columns='index', errors='ignore')

# Persist the symbols; the row index is omitted, while the column labels are still
# written as the first line of the file (to_csv's default header=True).
df_final.to_csv("hello.csv", index=False)

In [18]:
# Load the saved symbols, one column per pair of cluster counts.
# The file's first line is a header row of column labels, so it is consumed as the
# header (header=0) and replaced by readable names; reading it with header=None would
# count that row as an extra bogus symbol in every column.
portfolio_dff = pd.read_csv("hello.csv", header=0, names=['one', 'two', 'three'])
#display(portfolio_dff)

# How often each symbol occurs per column (NaN padding is ignored by value_counts),
# plus the most frequent symbol of each column.
tally_one = portfolio_dff['one'].value_counts()
countsOne = tally_one.to_dict()
mostOne = tally_one.idxmax()

tally_two = portfolio_dff['two'].value_counts()
countsTwo = tally_two.to_dict()
mostTwo = tally_two.idxmax()

tally_three = portfolio_dff['three'].value_counts()
countsThree = tally_three.to_dict()
mostThree = tally_three.idxmax()

print(mostOne)
print(mostTwo)
print(mostThree)

# Replace the raw symbol table by a symbol x column table of occurrence counts;
# a symbol that never occurs in a column gets 0 there.
# (Series.value_counts is used because the top-level pd.value_counts is deprecated.)
portfolio_dff = portfolio_dff.apply(lambda column: column.value_counts()).fillna(0)

# Ten most frequent symbols according to each column in turn.
for ranking_column in ('one', 'two', 'three'):
    display(portfolio_dff.nlargest(10, [ranking_column]))

GOOGL
C
GOOGL


Unnamed: 0,one,two,three
GOOGL,20.0,4,21.0
MSFT,16.0,1,9.0
ANTM,15.0,3,9.0
LFC,15.0,3,9.0
PBR,13.0,3,9.0
HD,12.0,3,9.0
ABC,11.0,1,8.0
BA,11.0,2,9.0
CAH,11.0,2,9.0
COST,11.0,2,9.0


Unnamed: 0,one,two,three
C,0.0,35,0.0
GOOGL,20.0,4,21.0
RY,0.0,4,0.0
ANTM,15.0,3,9.0
E,0.0,3,0.0
HD,12.0,3,9.0
LFC,15.0,3,9.0
PBR,13.0,3,9.0
BA,11.0,2,9.0
CAH,11.0,2,9.0


Unnamed: 0,one,two,three
GOOGL,20.0,4,21.0
AMZN,10.0,1,13.0
MCK,11.0,1,13.0
UNH,10.0,1,13.0
ANTM,15.0,3,9.0
BA,11.0,2,9.0
CAH,11.0,2,9.0
COST,11.0,2,9.0
FCAU,11.0,2,9.0
HD,12.0,3,9.0
