# Exploratory Data Analysis

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

## Read Dataset to Pandas Dataframe

In [2]:
# load the fundamentals / F-SCORE data set from the working directory
fscore_csv_path = 'fundamentals_fscore.csv'
funda_df = pd.read_csv(fscore_csv_path)
# preview the first five rows of funda_df
display(funda_df.head(5))

Unnamed: 0,CIK,Ticker_Symbol,Company_Name,Sector,Data_Date,Market_Capitalization,Total_Assets,Total_Assets_PY1,Total_Assets_PY2,Total_Liabilities,...,F_CLEVER,CLIQUID,F_CLIQUID,EQ_OFFER,F_EQ_OFFER,CMARGIN,F_CMARGIN,CTURN,F_CTURN,F_SCORE
0,1750,AIR,AAR CORP,Capital Goods,2011-05-31,1049.8206,1703.727,1501.042,1377.511,868.438,...,1,-0.45519,0,0.297,0,-0.004629,0,0.201443,1,6
1,1750,AIR,AAR CORP,Capital Goods,2012-05-31,485.2897,2195.653,1703.727,1501.042,1329.631,...,0,0.049832,1,0.492,0,-0.008424,0,0.034591,1,5
2,1750,AIR,AAR CORP,Capital Goods,2013-05-31,790.0029,2136.9,2195.653,1703.727,1217.4,...,1,0.410468,1,-0.891,1,0.010205,1,-0.230628,0,7
3,1750,AIR,AAR CORP,Capital Goods,2014-05-31,961.308,2199.5,2136.9,2195.653,1198.8,...,1,0.120341,1,0.178,0,0.014049,1,-0.034682,0,7
4,1750,AIR,AAR CORP,Capital Goods,2015-05-31,1046.3954,1515.0,2199.5,2136.9,669.9,...,1,-0.461891,0,-4.137,1,-0.065087,0,-0.227468,0,3


In [3]:
# note: the cik of each company is usually 10 digits in length
# note: the csv stores some ciks without their leading 0's
# note: so every cik is left-padded with 0's back up to a length of 10

# '{:0>10}' right-aligns the value in a field of width 10 and fills with '0'
funda_df['CIK'] = funda_df['CIK'].map('{:0>10}'.format)

# parse the Data_Date column of funda_df into datetime values
funda_df['Data_Date'] = pd.to_datetime(funda_df['Data_Date'])

## Filter High Book-to-Market Companies

In [4]:
# note: piotroski's study focuses on companies with high b/m ratio values,
# note: i.e. the companies that fall in the highest b/m quintile
# note: so the exploratory data analysis below is restricted to those companies

# keep only the rows labelled 'Very High' in BM_Quintile and renumber the index from 0
is_very_high_bm = funda_df['BM_Quintile'].eq('Very High')
funda_df = funda_df.loc[is_very_high_bm].reset_index(drop=True)
# number of rows left in funda_df after filtering
print('funda_df row count:', len(funda_df))

funda_df row count: 1457


## Basic Data Statistics

In [17]:
# summary statistics of the numerical variables in funda_df
# note: the column display limit is lifted so that every column of the summary is shown
pd.set_option('display.max_columns', None)
funda_df.describe()

Unnamed: 0,Market_Capitalization,Total_Assets,Total_Assets_PY1,Total_Assets_PY2,Total_Liabilities,Net_Income_Before_Extra_Items,Cash_Flow_From_Operations,Total_Long_Term_Debt,Total_Long_Term_Debt_PY1,Current_Assets,Current_Assets_PY1,Current_Liabilities,Current_Liabilities_PY1,Common_Shares_Outstanding,Common_Shares_Outstanding_PY1,Total_Sales,Total_Sales_PY1,Cost_Of_Goods_Sold,Cost_Of_Goods_Sold_PY1,BM_Ratio,ROA,F_ROA,CFO,F_CFO,CROA,F_CROA,ACCRUAL,F_ACCRUAL,CLEVER,F_CLEVER,CLIQUID,F_CLIQUID,EQ_OFFER,F_EQ_OFFER,CMARGIN,F_CMARGIN,CTURN,F_CTURN,F_SCORE
count,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0,1457.0
mean,3808.380671,10426.221568,9980.533977,9306.162421,6141.761528,11.593026,743.728166,3180.745658,2948.664066,1956.609402,1961.349509,1521.562669,1486.38013,216.942767,210.080103,5459.87006,5338.14596,4121.956987,3967.230993,1.852489,-0.006867,0.577213,0.067304,0.912835,-0.016721,0.400824,-0.074172,0.910776,0.005066,0.479753,-0.060125,0.479067,6.862664,0.323953,-0.015321,0.4221,-0.020301,0.471517,4.978037
std,10009.256182,24983.923979,22997.926179,21131.412206,16417.678891,1213.538602,1967.069219,8012.942231,7123.634985,5409.743381,5636.227182,5242.362445,5144.264764,425.646456,403.281381,14206.8549,14151.688566,11714.025913,11504.463624,5.828545,0.091012,0.494172,0.057708,0.282174,0.104098,0.490234,0.08728,0.285165,0.099382,0.499761,1.672566,0.499733,77.860883,0.468143,0.410295,0.494064,0.33675,0.499359,1.56483
min,3.1909,69.123,53.961,49.039,16.065,-14459.0,-4784.0,0.0,0.0,2.152,4.628,2.809,2.809,0.621,1.201,17.697,5.966,0.668,0.192,0.816972,-0.834983,0.0,-0.31907,0.0,-0.890394,0.0,-0.903233,0.0,-0.685533,0.0,-23.757453,0.0,-634.011,0.0,-6.611349,0.0,-6.09036,0.0,0.0
25%,356.5624,1223.527,1195.469,1112.216,565.027,-68.753,50.728,240.541,220.629,228.465,231.833,126.537,123.7,40.712,39.673,480.821,470.484,285.282,267.136,0.935353,-0.032001,0.0,0.037641,1.0,-0.041538,0.0,-0.101514,1.0,-0.027051,0.0,-0.250645,0.0,-0.145,0.0,-0.0328,0.0,-0.068404,0.0,4.0
50%,1061.4134,3174.016,3042.79,2930.559,1786.819,13.558,185.665,882.572,827.41,614.0,623.836,370.568,369.888,88.583,86.336,1558.758,1564.342,976.892,965.25,1.14195,0.008117,1.0,0.064927,1.0,-0.007053,0.0,-0.059885,1.0,0.0,0.0,-0.016311,0.0,0.3,0.0,-0.004556,0.0,-0.004322,0.0,5.0
75%,3122.4202,9518.1,9456.0,9160.0,5315.3,128.0,636.0,3107.0,2853.095,1837.921,1790.7,1198.854,1165.0,228.246,219.3,4503.0,4443.0,3183.5,2997.0,1.6467,0.035312,1.0,0.094123,1.0,0.017428,1.0,-0.034433,1.0,0.035301,1.0,0.228133,1.0,2.193,1.0,0.017709,1.0,0.050286,1.0,6.0
max,207817.6917,531864.0,444097.0,403821.0,337980.0,19370.0,43602.0,166250.0,125972.0,114649.0,115902.0,95569.0,94600.0,7281.629,6139.425,170805.0,192308.0,132907.0,149310.0,199.680871,1.224404,1.0,0.466842,1.0,1.141538,1.0,0.757563,1.0,0.66782,1.0,19.648207,1.0,1540.223,1.0,7.581915,1.0,3.671603,1.0,9.0


In [18]:
# proportion of observations whose binary signal column F_<measure> equals 1,
# printed for each of the nine fundamental measures
total_rows = funda_df.shape[0]
for measure in ('ROA', 'CFO', 'CROA', 'ACCRUAL', 'CLEVER', 'CLIQUID', 'EQ_OFFER', 'CMARGIN', 'CTURN'):
    # count the rows with a positive signal for this measure
    positive_rows = int((funda_df['F_' + measure] == 1).sum())
    print(measure + ' - proportion with positive signal:', positive_rows / total_rows)
    print('\n')

ROA - proportion with positive signal: 0.577213452299245


CFO - proportion with positive signal: 0.91283459162663


CROA - proportion with positive signal: 0.4008236101578586


ACCRUAL - proportion with positive signal: 0.9107755662319835


CLEVER - proportion with positive signal: 0.47975291695264244


CLIQUID - proportion with positive signal: 0.4790665751544269


EQ_OFFER - proportion with positive signal: 0.32395332875772137


CMARGIN - proportion with positive signal: 0.42210020590253944


CTURN - proportion with positive signal: 0.47151681537405626




In [19]:
# note: given the basic data statistics in the table above, the data will have to be standardized or normalized
# note: this is due to some numerical variables having different scales
# note: and the variation in minimum and maximum values
# note: standardization / normalization will be applied where necessary below

## F-SCORE Distribution

In [26]:
# histogram of F-SCORE distribution
# note: F_SCORE takes integer values (0 to 9 in this data set), so the unit-wide bins
# note: are shifted by half a unit ([-0.5, 0.5), [0.5, 1.5), ...) to centre each bar
# note: on its integer tick; the counts per score are the same as with bins starting at 0
serif_font = dict(family='Times New Roman, serif')
fig = go.Figure()
fig.add_trace(go.Histogram(x=funda_df['F_SCORE'], name='F_SCORE',
                           xbins=dict(start=-0.5, end=9.5, size=1),
                           marker=dict(color='red', line=dict(color='darkred', width=1)),
                           opacity=0.6))
fig.update_layout(template='plotly_white',
                  title=dict(text='Histogram - Distribution of F_SCORE', font=serif_font, x=0.5),
                  xaxis=dict(title=dict(text='F_SCORE', font=serif_font), tickfont=serif_font),
                  yaxis=dict(title=dict(text='Frequency', font=serif_font), tickfont=serif_font))
fig.show()

In [21]:
# count and percentage share of each F-SCORE class
score_counts = funda_df['F_SCORE'].value_counts()
score_percentages = funda_df['F_SCORE'].value_counts(normalize=True) * 100
# assemble the statistics into one dataframe, expose the score as a regular column
# named F_SCORE and order the rows by ascending score
F_SCORE_dist_df = (
    pd.DataFrame({'Count': score_counts, 'Percentage_(%)': score_percentages})
    .rename_axis('F_SCORE')
    .reset_index()
    .sort_values(by='F_SCORE')
    .reset_index(drop=True)
)
# display F_SCORE_dist_df
display(F_SCORE_dist_df)

Unnamed: 0,F_SCORE,Count,Percentage_(%)
0,0,1,0.068634
1,1,7,0.480439
2,2,70,4.804393
3,3,185,12.697323
4,4,300,20.590254
5,5,355,24.365134
6,6,285,19.560741
7,7,175,12.010981
8,8,70,4.804393
9,9,9,0.617708


## Profitability Fundamental Measures - Analysis

In [27]:
#----------ROA by company size----------

# dataframe holding ROA together with the company size tercile
ROA_df = funda_df[['ROA', 'Size_Tercile']]
# min-max normalize ROA separately within each company size tercile
normalized_ROA = {}
for tercile in ('Small', 'Medium', 'Large'):
    tercile_ROA = ROA_df.loc[ROA_df['Size_Tercile'] == tercile, 'ROA']
    lowest_ROA = tercile_ROA.min()
    normalized_ROA[tercile] = (tercile_ROA - lowest_ROA) / (tercile_ROA.max() - lowest_ROA)
small_ROA = normalized_ROA['Small']
medium_ROA = normalized_ROA['Medium']
large_ROA = normalized_ROA['Large']
# one horizontal box plot of normalized ROA per company size tercile
serif_font = dict(family='Times New Roman, serif')
fig = go.Figure()
for tercile, tercile_values in normalized_ROA.items():
    fig.add_trace(go.Box(x=tercile_values, name=tercile, boxpoints='outliers',
                         marker_color='red', opacity=0.6))
fig.update_layout(template='plotly_white', showlegend=False,
                  title=dict(text='Boxplots - Normalized ROA by Company Size Tercile',
                             font=serif_font, x=0.5),
                  xaxis=dict(title=dict(text='Normalized ROA', font=serif_font),
                             tickfont=serif_font),
                  yaxis=dict(title=dict(text='Company Size Tercile', font=serif_font),
                             tickfont=serif_font))
fig.show()

In [28]:
#----------CFO by company size----------

# dataframe holding CFO together with the company size tercile
CFO_df = funda_df[['CFO', 'Size_Tercile']]
# min-max normalize CFO separately within each company size tercile
normalized_CFO = {}
for tercile in ('Small', 'Medium', 'Large'):
    tercile_CFO = CFO_df.loc[CFO_df['Size_Tercile'] == tercile, 'CFO']
    lowest_CFO = tercile_CFO.min()
    normalized_CFO[tercile] = (tercile_CFO - lowest_CFO) / (tercile_CFO.max() - lowest_CFO)
small_CFO = normalized_CFO['Small']
medium_CFO = normalized_CFO['Medium']
large_CFO = normalized_CFO['Large']
# one horizontal box plot of normalized CFO per company size tercile
serif_font = dict(family='Times New Roman, serif')
fig = go.Figure()
for tercile, tercile_values in normalized_CFO.items():
    fig.add_trace(go.Box(x=tercile_values, name=tercile, boxpoints='outliers',
                         marker_color='red', opacity=0.6))
fig.update_layout(template='plotly_white', showlegend=False,
                  title=dict(text='Boxplots - Normalized CFO by Company Size Tercile',
                             font=serif_font, x=0.5),
                  xaxis=dict(title=dict(text='Normalized CFO', font=serif_font),
                             tickfont=serif_font),
                  yaxis=dict(title=dict(text='Company Size Tercile', font=serif_font),
                             tickfont=serif_font))
fig.show()

In [29]:
#----------ACCRUAL relative to ROA and CFO----------

# dataframe holding ACCRUAL, ROA and CFO
ACCRUAL_df = funda_df[['ACCRUAL', 'ROA', 'CFO']]
# min-max normalize each of the three columns over all rows of ACCRUAL_df
column_min = ACCRUAL_df.min()
normalized_df = (ACCRUAL_df - column_min) / (ACCRUAL_df.max() - column_min)
norm_ACCRUAL = normalized_df['ACCRUAL']
norm_ROA = normalized_df['ROA']
norm_CFO = normalized_df['CFO']
# 3d scatter plot of ACCRUAL relative to ROA and CFO
# note: axis title fonts are set through title.font; the older 'titlefont' axis
# note: property is deprecated and is not accepted by newer plotly releases
serif_font = dict(family='Times New Roman, serif')
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=norm_ROA, y=norm_CFO, z=norm_ACCRUAL,
                           mode='markers', marker_color='red', opacity=0.6))
fig.update_layout(template='plotly_white', showlegend=False,
                  scene=dict(xaxis=dict(title=dict(text='Normalized ROA', font=serif_font),
                                        tickfont=serif_font),
                             yaxis=dict(title=dict(text='Normalized CFO', font=serif_font),
                                        tickfont=serif_font),
                             zaxis=dict(title=dict(text='Normalized ACCRUAL', font=serif_font),
                                        tickfont=serif_font)),
                  title=dict(text='3D Scatter Plot - Normalized ACCRUAL',
                             font=serif_font, x=0.5),
                  height=700)
fig.show()

## Leverage, Liquidity and Source of Funds' Fundamental Measures - Analysis

In [30]:
#----------CLEVER by company size----------

# dataframe holding CLEVER together with the company size tercile
CLEVER_df = funda_df[['CLEVER', 'Size_Tercile']]
# min-max normalize CLEVER separately within each company size tercile
normalized_CLEVER = {}
for tercile in ('Small', 'Medium', 'Large'):
    tercile_CLEVER = CLEVER_df.loc[CLEVER_df['Size_Tercile'] == tercile, 'CLEVER']
    lowest_CLEVER = tercile_CLEVER.min()
    normalized_CLEVER[tercile] = ((tercile_CLEVER - lowest_CLEVER) /
                                  (tercile_CLEVER.max() - lowest_CLEVER))
small_CLEVER = normalized_CLEVER['Small']
medium_CLEVER = normalized_CLEVER['Medium']
large_CLEVER = normalized_CLEVER['Large']
# one horizontal box plot of normalized CLEVER per company size tercile
serif_font = dict(family='Times New Roman, serif')
fig = go.Figure()
for tercile, tercile_values in normalized_CLEVER.items():
    fig.add_trace(go.Box(x=tercile_values, name=tercile, boxpoints='outliers',
                         marker_color='red', opacity=0.6))
fig.update_layout(template='plotly_white', showlegend=False,
                  title=dict(text='Boxplots - Normalized CLEVER by Company Size Tercile',
                             font=serif_font, x=0.5),
                  xaxis=dict(title=dict(text='Normalized CLEVER', font=serif_font),
                             tickfont=serif_font),
                  yaxis=dict(title=dict(text='Company Size Tercile', font=serif_font),
                             tickfont=serif_font))
fig.show()

In [31]:
#----------CLIQUID by market capitalization----------

# dataframe holding CLIQUID, company size tercile and market capitalization
CLIQUID_df = funda_df[['CLIQUID', 'Size_Tercile', 'Market_Capitalization']]
# per company size tercile: min-max normalized CLIQUID and the matching market capitalization
size_terciles = ('Small', 'Medium', 'Large')
normalized_CLIQUID = {}
tercile_market_cap = {}
for tercile in size_terciles:
    in_tercile = CLIQUID_df['Size_Tercile'] == tercile
    tercile_CLIQUID = CLIQUID_df.loc[in_tercile, 'CLIQUID']
    lowest_CLIQUID = tercile_CLIQUID.min()
    normalized_CLIQUID[tercile] = ((tercile_CLIQUID - lowest_CLIQUID) /
                                   (tercile_CLIQUID.max() - lowest_CLIQUID))
    tercile_market_cap[tercile] = CLIQUID_df.loc[in_tercile, 'Market_Capitalization']
small_CLIQUID = normalized_CLIQUID['Small']
medium_CLIQUID = normalized_CLIQUID['Medium']
large_CLIQUID = normalized_CLIQUID['Large']
small_market_cap = tercile_market_cap['Small']
medium_market_cap = tercile_market_cap['Medium']
large_market_cap = tercile_market_cap['Large']
# scatter plots of CLIQUID by market capitalization, one subplot per company size tercile
# note: market capitalization is multiplied by 1,000,000 for the '$' axis
# note: (presumably the column is stored in millions - confirm against the data source)
serif_font = dict(family='Times New Roman, serif')
tercile_colors = {'Small': 'green', 'Medium': 'orange', 'Large': 'red'}
fig = make_subplots(rows=1, cols=3)
for subplot_col, tercile in enumerate(size_terciles, start=1):
    fig.add_trace(go.Scatter(x=tercile_market_cap[tercile]*1000000, y=normalized_CLIQUID[tercile],
                             name=tercile, mode='markers', marker_color=tercile_colors[tercile],
                             opacity=0.6), row=1, col=subplot_col)
# note: 'title_font' replaces the deprecated 'titlefont' axis property,
# note: which is not accepted by newer plotly releases
fig.update_xaxes(title_text='Market Capitalization ($)', title_font=serif_font,
                 tickfont=serif_font)
# the y-axis title and fonts are applied to the first subplot only
fig.update_yaxes(title_text='Normalized CLIQUID', title_font=serif_font,
                 tickfont=serif_font, row=1, col=1)
fig.update_layout(template='plotly_white', showlegend=True,
                  title=dict(text='Scatter Plots - Normalized CLIQUID by Market Capitalization',
                             font=serif_font, x=0.5),
                  legend=dict(font=serif_font))
fig.show()

In [32]:
#----------EQ_OFFER by market capitalization----------

# dataframe holding EQ_OFFER, company size tercile and market capitalization
EQ_OFFER_df = funda_df[['EQ_OFFER', 'Size_Tercile', 'Market_Capitalization']]
# per company size tercile: min-max normalized EQ_OFFER and the matching market capitalization
size_terciles = ('Small', 'Medium', 'Large')
normalized_EQ_OFFER = {}
tercile_market_cap = {}
for tercile in size_terciles:
    in_tercile = EQ_OFFER_df['Size_Tercile'] == tercile
    tercile_EQ_OFFER = EQ_OFFER_df.loc[in_tercile, 'EQ_OFFER']
    lowest_EQ_OFFER = tercile_EQ_OFFER.min()
    normalized_EQ_OFFER[tercile] = ((tercile_EQ_OFFER - lowest_EQ_OFFER) /
                                    (tercile_EQ_OFFER.max() - lowest_EQ_OFFER))
    tercile_market_cap[tercile] = EQ_OFFER_df.loc[in_tercile, 'Market_Capitalization']
small_EQ_OFFER = normalized_EQ_OFFER['Small']
medium_EQ_OFFER = normalized_EQ_OFFER['Medium']
large_EQ_OFFER = normalized_EQ_OFFER['Large']
small_market_cap = tercile_market_cap['Small']
medium_market_cap = tercile_market_cap['Medium']
large_market_cap = tercile_market_cap['Large']
# scatter plots of EQ_OFFER by market capitalization, one subplot per company size tercile
# note: market capitalization is multiplied by 1,000,000 for the '$' axis
# note: (presumably the column is stored in millions - confirm against the data source)
serif_font = dict(family='Times New Roman, serif')
tercile_colors = {'Small': 'green', 'Medium': 'orange', 'Large': 'red'}
fig = make_subplots(rows=1, cols=3)
for subplot_col, tercile in enumerate(size_terciles, start=1):
    fig.add_trace(go.Scatter(x=tercile_market_cap[tercile]*1000000, y=normalized_EQ_OFFER[tercile],
                             name=tercile, mode='markers', marker_color=tercile_colors[tercile],
                             opacity=0.6), row=1, col=subplot_col)
# note: 'title_font' replaces the deprecated 'titlefont' axis property,
# note: which is not accepted by newer plotly releases
fig.update_xaxes(title_text='Market Capitalization ($)', title_font=serif_font,
                 tickfont=serif_font)
# the y-axis title and fonts are applied to the first subplot only
fig.update_yaxes(title_text='Normalized EQ_OFFER', title_font=serif_font,
                 tickfont=serif_font, row=1, col=1)
fig.update_layout(template='plotly_white', showlegend=True,
                  title=dict(text='Scatter Plots - Normalized EQ_OFFER by Market Capitalization',
                             font=serif_font, x=0.5),
                  legend=dict(font=serif_font))
fig.show()

## Operating Efficiency Fundamental Measures - Analysis

In [33]:
#----------CMARGIN by company size and year----------

# NOTE(review): this cell now works on an explicit copy and does not need the warning
# silenced; the setting is kept only so that later cells see the same global option
pd.options.mode.chained_assignment = None

# independent copy of data date, company size tercile and CMARGIN
# note: copying avoids writing a new column into a slice of funda_df
CMARGIN_df = funda_df[['Data_Date', 'Size_Tercile', 'CMARGIN']].copy()
# min-max normalize each CMARGIN observation with respect to its company size tercile
# note: the tercile minimum / maximum are computed once per group and broadcast to the rows,
# note: instead of being recomputed for every row
# note: assumes Size_Tercile only holds 'Small', 'Medium' and 'Large' - rows with any other
# note: label are never plotted below
CMARGIN_by_tercile = CMARGIN_df.groupby('Size_Tercile')['CMARGIN']
tercile_min = CMARGIN_by_tercile.transform('min')
tercile_max = CMARGIN_by_tercile.transform('max')
CMARGIN_df['Normalized_CMARGIN'] = (CMARGIN_df['CMARGIN'] - tercile_min) / (tercile_max - tercile_min)

# average normalized CMARGIN for each year (2011 to 2019) by company size tercile
# note: Year is stored as a string, one row per year; a year without observations
# note: for a tercile yields NaN in that tercile's column
size_terciles = ('Small', 'Medium', 'Large')
plot_years = list(range(2011, 2020))
data_year = CMARGIN_df['Data_Date'].dt.year
av_CMARGIN_df = pd.DataFrame({'Year': [str(plot_year) for plot_year in plot_years]})
for tercile in size_terciles:
    in_tercile = CMARGIN_df['Size_Tercile'] == tercile
    yearly_mean = (CMARGIN_df.loc[in_tercile, 'Normalized_CMARGIN']
                   .groupby(data_year[in_tercile])
                   .mean())
    # align on the fixed list of years so every column has one value per plotted year
    av_CMARGIN_df[tercile + '_Norm_CMARGIN'] = yearly_mean.reindex(plot_years).to_numpy()

# line plots of average normalized CMARGIN by company size tercile and year
serif_font = dict(family='Times New Roman, serif')
tercile_colors = {'Small': 'green', 'Medium': 'orange', 'Large': 'red'}
fig = go.Figure()
for tercile in size_terciles:
    fig.add_trace(go.Scatter(mode='markers+lines', x=av_CMARGIN_df['Year'],
                             y=av_CMARGIN_df[tercile + '_Norm_CMARGIN'], name=tercile,
                             line=dict(color=tercile_colors[tercile], width=2, dash='dot'),
                             opacity=0.6, marker_size=10))
fig.update_layout(template='plotly_white', showlegend=True,
                  title=dict(text='Line Plots - Normalized Average CMARGIN by Year',
                             font=serif_font, x=0.5),
                  xaxis=dict(title=dict(text='Year', font=serif_font), tickfont=serif_font),
                  yaxis=dict(title=dict(text='Normalized Average CMARGIN', font=serif_font),
                             tickfont=serif_font),
                  legend=dict(font=serif_font))
fig.show()

In [34]:
#----------CTURN by company size and year----------

# create dataframe to store data date, company size tercile and CTURN
# note: .copy() makes CTURN_df independent of funda_df, so the new column below can be
# note: added without silencing pandas' chained assignment warning for the whole notebook
CTURN_df = funda_df[['Data_Date', 'Size_Tercile', 'CTURN']].copy()
# create new column to store normalized value of each CTURN observation in CTURN_df
# note: each CTURN will be min-max normalized in respect to its company size tercile
# note: i.e. (CTURN - tercile minimum) / (tercile maximum - tercile minimum)
# note: the column starts as NaN so that it has a float dtype from the beginning
# note: rows whose Size_Tercile is not Small, Medium or Large are left as NaN
CTURN_df['Normalized_CTURN'] = np.nan
for tercile in ['Small', 'Medium', 'Large']:
    in_tercile = CTURN_df['Size_Tercile'] == tercile
    tercile_CTURN = CTURN_df.loc[in_tercile, 'CTURN']
    # note: the minimum and range of the tercile are calculated once, not once per row
    tercile_min = tercile_CTURN.min()
    tercile_range = tercile_CTURN.max() - tercile_min
    CTURN_df.loc[in_tercile, 'Normalized_CTURN'] = (tercile_CTURN - tercile_min) / tercile_range
# create dataframe to store average normalized CTURN for each year by company size tercile
# note: one row per year from 2011 to 2019, with the year stored as a string
years = list(range(2011, 2020))
av_CTURN_df = pd.DataFrame({'Year': [str(year) for year in years]})
# calculate average normalized CTURN for small, medium and large companies by year
for tercile in ['Small', 'Medium', 'Large']:
    tercile_df = CTURN_df.loc[CTURN_df['Size_Tercile'] == tercile]
    # note: grouping on the year of Data_Date ties each average directly to its own year
    # note: so no separate counter has to be kept in step with the year labels
    yearly_mean = tercile_df.groupby(tercile_df['Data_Date'].dt.year)['Normalized_CTURN'].mean()
    # note: reindex keeps the 2011-2019 row order and leaves NaN for a year without observations
    av_CTURN_df[tercile + '_Norm_CTURN'] = yearly_mean.reindex(years).to_numpy()
# line plots of normalized average CTURN by company size tercile and year
serif_font = dict(family='Times New Roman, serif')
# trace name, av_CTURN_df column and line color for each company size tercile
tercile_traces = [('Small', 'Small_Norm_CTURN', 'green'),
                  ('Medium', 'Medium_Norm_CTURN', 'orange'),
                  ('Large', 'Large_Norm_CTURN', 'red')]
fig = go.Figure()
# add one dotted, semi-transparent line with markers per tercile
for trace_name, value_column, line_color in tercile_traces:
    fig.add_trace(go.Scatter(x=av_CTURN_df['Year'], y=av_CTURN_df[value_column],
                             mode='markers+lines', name=trace_name, opacity=0.6,
                             marker=dict(size=10),
                             line=dict(color=line_color, width=2, dash='dot')))
# white template, centred title and the same serif font on every text element
fig.update_layout(template='plotly_white', showlegend=True,
                  title=dict(text='Line Plots - Normalized Average CTURN by Year',
                             font=serif_font,
                             x=0.5),
                  xaxis=dict(title=dict(text='Year', font=serif_font),
                             tickfont=serif_font),
                  yaxis=dict(title=dict(text='Normalized Average CTURN', font=serif_font),
                             tickfont=serif_font),
                  legend=dict(font=serif_font))
fig.show()

## Fundamental Measures - Correlation Analysis

In [35]:
#----------correlation analysis of nine fundamental measures----------

# note: every pair of fundamental measures is compared using the pearson correlation coefficient

# collect the nine fundamental measures in their own dataframe
measure_names = ['ROA', 'CFO', 'CROA', 'ACCRUAL',
                 'CLEVER', 'CLIQUID', 'EQ_OFFER',
                 'CMARGIN', 'CTURN']
funda_measures_df = funda_df[measure_names]
# pearson correlation matrix, rounded to two decimal places
corr_matrix = funda_measures_df.corr(method='pearson').round(2).to_numpy()
# stepped greyscale: ten equal-width bands from rgb(0, 0, 0) up to rgb(180, 180, 180)
# note: each band contributes two stops of the same color so the bands do not blend
grey_bands = []
for band in range(10):
    band_color = 'rgb({0}, {0}, {0})'.format(20 * band)
    grey_bands.append([band / 10, band_color])
    grey_bands.append([(band + 1) / 10, band_color])
serif_font = dict(family='Times New Roman, serif')
# plot correlation heatmap
fig = ff.create_annotated_heatmap(z=corr_matrix,
                                  x=funda_measures_df.columns.tolist(),
                                  y=funda_measures_df.columns.tolist(),
                                  colorscale=grey_bands,
                                  colorbar=dict(tickfont=serif_font),
                                  showscale=True)
# replace the font of every cell annotation with white serif text
for annotation in fig.layout.annotations:
    annotation.font = dict(family='Times New Roman, serif', color='white')
# white template, centred title and the same serif font on the title and both axes
fig.update_layout(template='plotly_white',
                  title=dict(text='Pearson Correlation Heatmap - Fundamental Measures',
                             font=serif_font,
                             x=0.5),
                  xaxis=dict(title=dict(text='Fundamental Measure', font=serif_font),
                             tickfont=serif_font),
                  yaxis=dict(title=dict(text='Fundamental Measure', font=serif_font),
                             tickfont=serif_font))
fig.show()