In [85]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() #this is used for plot styling
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

# Begin by reading the original data
df1 = pd.read_csv("2018_Financial_Data.csv")

# Start the feature frame from the columns that are taken from the raw data (df1) as-is
financial_data = pd.DataFrame(
    df1,
    columns=['Symbol', 'Revenue', 'Revenue Growth', 'Cost of Revenue', 'Gross Profit',
             'Operating Expenses', 'Operating Income', 'Earnings before Tax', 'Net Income', 'Net Debt'],
)

# Earnings per share: (net income - preferred dividends) / weighted average shares outstanding
financial_data['EarningsPerShare'] = (df1['Net Income'] - df1['Preferred Dividends']) / df1['Weighted Average Shs Out']

# P/E ratio is copied directly from the data
financial_data['PriceEarningsRatio'] = df1['PE ratio']

# Debt to equity: (short-term debt + long-term debt) / total shareholders equity
financial_data['DebtEquityRatio'] = (df1['Short-term debt'] + df1['Long-term debt']) / df1['Total shareholders equity']

# Return on equity is copied directly from the data
financial_data['ReturnOnEquity'] = df1['returnOnEquity']

# Quick ratio is copied directly from the data
financial_data['QuickRatio'] = df1['quickRatio']

# Working capital ratio: total current assets / total current liabilities
financial_data['WorkingCapitalRatio'] = df1['Total current assets'] / df1['Total current liabilities']

# Get rid of unusable values from the data given.
# Empty strings and zeros are treated as missing. The ratios above divide by raw columns that may
# contain zero, which yields +/-inf; those are treated as missing too, because they would survive
# dropna() and KMeans refuses to fit a matrix containing infinities.
nan_value = float("NaN")
financial_data = (
    financial_data
    .replace("", nan_value)                  # change empty data to nan
    .replace(0, nan_value)                   # change zero data to nan
    .replace([np.inf, -np.inf], nan_value)   # change division-by-zero results to nan
    .dropna(axis=0)                          # get rid of every row with a nan
)

# Keep the cleaned frame (including 'Symbol') so cluster labels can be mapped back to tickers
target_data = financial_data

# Drop the symbols for KMeans: only the numeric features are clustered.
# `columns=` is used because the positional axis argument (drop('Symbol', 1)) is rejected by pandas 2.x.
financial_data = financial_data.drop(columns='Symbol')

#display(target_data)


In [86]:
# In this section we run the K-Means algorithm with 3 different numbers of clusters: 4, 5 and 6.
# We consider a company to be really good when it lands in group 3 (with 4 clusters),
# in group 4 (with 5 clusters), and in group 5 (with 6 clusters). These groups perform the best, therefore our portfolio
# will contain the stocks of these companies.

# Since the K-Means algorithm picks arbitrary starting points and then adjusts the means, we get different portfolios
# each time we run K-Means. This is not good for us, as we need the groups to be the same so that we can create our
# final portfolio. If we run it many times, we notice that after 3 or 4 iterations the algorithm goes back to the first
# groups, and this happens over and over again. We get similar stocks in each iteration, so we add them to an array and keep them
# for further analysis.
# In a nutshell, we run the K-Means algorithm many times, we get groups in each iteration, and we take the stocks that are
# repeated most often and put them in our final portfolio.

# Number of times the randomly initialised K-Means runs are repeated
N_RUNS = 200


def _best_cluster_symbols(features, labelled, n_clusters, random_state=None):
    """Fit K-Means on `features` and return the symbols that fall in the highest-numbered cluster.

    Parameters
    ----------
    features : DataFrame of numeric features passed to KMeans.
    labelled : DataFrame with the same rows as `features` plus a 'Symbol' column.
    n_clusters : number of clusters to fit.
    random_state : forwarded to KMeans (None = a fresh random initialisation on every call).

    Returns
    -------
    (labels, symbols) : the per-row cluster labels and a 1-D array with the 'Symbol' values of
    the rows whose label is `n_clusters - 1`.

    NOTE(review): the notebook treats the highest label id as "the best group". K-Means label ids
    look arbitrary rather than ranked by performance - confirm this selection is what is intended.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(features)
    labels = model.predict(features)
    # Labels are integers 0..n_clusters-1, so "== n_clusters - 1" selects the same rows as "> n_clusters - 2"
    symbols = labelled.loc[labels == n_clusters - 1, 'Symbol'].to_numpy()
    return labels, symbols


# K-Means with 4 clusters. It is seeded (random_state=0), so it gives the same result on every
# run; fit it once here instead of refitting the identical model inside the loop.
y_k3means, portfolio_k3 = _best_cluster_symbols(financial_data, target_data, 4, random_state=0)

# Per-run overlaps between the best groups; collected in lists and concatenated once at the end
# (DataFrame.append was removed in pandas 2.x and re-copied the whole frame on every call).
overlaps_k3_k4 = []
overlaps_k4_k5 = []
overlaps_k3_k5 = []

# Loop to run K-Means many times with the unseeded cluster sizes.
for i in range(N_RUNS):

    # K-Means with 5 clusters (new random initialisation each run)
    y_k4means, portfolio_k4 = _best_cluster_symbols(financial_data, target_data, 5)

    # K-Means with 6 clusters (new random initialisation each run)
    y_k5means, portfolio_k5 = _best_cluster_symbols(financial_data, target_data, 6)

    # compare the portfolios pairwise and keep the stocks they have in common
    port1 = pd.DataFrame(np.intersect1d(portfolio_k3, portfolio_k4))
    port2 = pd.DataFrame(np.intersect1d(portfolio_k4, portfolio_k5))
    port3 = pd.DataFrame(np.intersect1d(portfolio_k3, portfolio_k5))

    # save the list of these stocks
    overlaps_k3_k4.append(port1)
    overlaps_k4_k5.append(port2)
    overlaps_k3_k5.append(port3)

# One long single-column frame per pair of cluster sizes, re-indexed 0..n-1
temp_df1 = pd.concat(overlaps_k3_k4, ignore_index=True)
temp_df2 = pd.concat(overlaps_k4_k5, ignore_index=True)
temp_df3 = pd.concat(overlaps_k3_k5, ignore_index=True)

In [87]:
# Turn each frame's row index into an explicit 'index' column, then place the three
# frames next to each other in one wide frame
port_df = [frame.reset_index() for frame in (temp_df1, temp_df2, temp_df3)]
temp_df1, temp_df2, temp_df3 = port_df
df_final = pd.concat(port_df, axis=1)

In [88]:
# Remove every helper 'index' column, leaving only the symbol columns.
# `columns=` is used because the positional axis argument (drop('index', 1)) is rejected by pandas 2.x.
df_final = df_final.drop(columns='index')

# May not need to create semi-portfolio
# The file is written without the row index; the column labels are still written as a header row.
df_final.to_csv("semi_final_portfolio.csv", index=False)

In [90]:
# semi_final_portfolio.csv is written with a header row (to_csv(..., index=False) keeps the column
# labels). Reading it with header=None would turn that header line into a data row and count it as
# a stock symbol, so consume it with header=0 and replace it with our own column names.
portfolio_dff = pd.read_csv("semi_final_portfolio.csv", header=0, names=['one', 'two', 'three'])

## OBJECTIVE: join/concat all columns together-> combine totals(add)-> group top 10

# Place all columns into the same list
# NOTE(review): `df1` is also the name of the raw financial data frame loaded at the top of the
# notebook; the names are kept here for compatibility, but this assignment shadows that frame.
df1 = portfolio_dff['one']
df2 = portfolio_dff['two']
df3 = portfolio_dff['three']
# pd.concat replaces the chained Series.append calls (removed in pandas 2.x); same order as before
dff = pd.concat([df3, df2, df1])

# Get top 10 total counts (value_counts skips the NaN padding of the shorter columns)
countsTotal = dff.value_counts().head(10)

print(countsTotal)

# Output result to final_portfolio.csv
countsTotal.to_csv("final_portfolio.csv", header=False)

#portfolio_dff=portfolio_dff.apply(pd.value_counts).fillna(0)
#display(portfolio_dff.nlargest(5, ['one']))
#display(portfolio_dff.nlargest(10, ['two']))
#display(portfolio_dff.nlargest(10, ['three']))
#portfolio_dff = portfolio_dff.to_numpy()



GOOGL    116
C         77
MSFT      58
MCK       42
UNH       39
AMZN      39
LFC       37
ANTM      37
PBR       29
HD        27
dtype: int64
