In [2]:
import pandas as pd
import numpy as np

In [62]:
# Three small toy frames reused by every concat/merge example below.
# df1 and df2 share only the "name" and "age" columns; df1 and df3 share
# "name" and "education" and overlap on two names (Emma, Jack).
df1 = pd.DataFrame(
    dict(
        name=["Oliver", "Emma", "Jack"],
        age=[21, 24, 36],
        education=["BSc", "BA", "MSc"],
    )
)

df2 = pd.DataFrame(
    dict(
        name=["Jimmy", "Frank", "Sue", "Ava"],
        age=[20, 18, 65, 40],
        income=[40000, 0, 120000, 300000],
    )
)

df3 = pd.DataFrame(
    dict(
        name=["Emma", "Jack", "Mia"],
        position=["manager", "CFO", "analyst"],
        salary=[78000, 160000, 56000],
        education=["BA", "PhD", "BSc"],
    )
)

In [63]:
# Inspect df1: one row per person with name, age and education.
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [64]:
# Inspect df2: different people, with income in place of education.
df2

Unnamed: 0,name,age,income
0,Jimmy,20,40000
1,Frank,18,0
2,Sue,65,120000
3,Ava,40,300000


In [65]:
# Inspect df3: job details keyed by name; it also carries an "education" column.
df3

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


## 1) Vertically combine datasets (concat)

In [66]:
# Stack df2 underneath df1 with concat's default settings: columns missing
# from one frame are filled with NaN and each frame keeps its own row labels.
pd.concat(objs=(df1, df2))

Unnamed: 0,name,age,education,income
0,Oliver,21,BSc,
1,Emma,24,BA,
2,Jack,36,MSc,
0,Jimmy,20,,40000.0
1,Frank,18,,0.0
2,Sue,65,,120000.0
3,Ava,40,,300000.0


In [67]:
# join="outer" keeps the union of the columns of both frames; gaps become NaN.
pd.concat(objs=(df1, df2), join="outer")

Unnamed: 0,name,age,education,income
0,Oliver,21,BSc,
1,Emma,24,BA,
2,Jack,36,MSc,
0,Jimmy,20,,40000.0
1,Frank,18,,0.0
2,Sue,65,,120000.0
3,Ava,40,,300000.0


In [68]:
# join="inner" keeps only the columns present in both frames (name, age).
pd.concat(objs=(df1, df2), join="inner")

Unnamed: 0,name,age
0,Oliver,21
1,Emma,24
2,Jack,36
0,Jimmy,20
1,Frank,18
2,Sue,65
3,Ava,40


In [70]:
# sort=True orders the resulting columns alphabetically; row order is unchanged.
pd.concat(objs=(df1, df2), join="outer", sort=True)

Unnamed: 0,age,education,income,name
0,21,BSc,,Oliver
1,24,BA,,Emma
2,36,MSc,,Jack
0,20,,40000.0,Jimmy
1,18,,0.0,Frank
2,65,,120000.0,Sue
3,40,,300000.0,Ava


## 2) Horizontally Join/Merge Datasets

### 2.1) Inner Join

In [71]:
# Left table of the join examples in this section.
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [73]:
# Right table of the join examples; only Emma and Jack also appear in df1.
df3

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [74]:
# Join on "name" with merge's default join type. Only names present in both
# frames survive, and the shared "education" column is split into
# education_x (from df1) and education_y (from df3).
df1.merge(right=df3, on="name")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


In [75]:
# Same join with the join type spelled out explicitly.
df1.merge(right=df3, how="inner", on="name")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


In [78]:
# Suffixes: label the overlapping "education" column per source frame
# (first suffix -> left frame df1, second suffix -> right frame df3).
df1.merge(right=df3, how="inner", on="name", suffixes=("_1", "_2"))

Unnamed: 0,name,age,education_1,position,salary,education_2
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


In [81]:
# Alternative syntax: the top-level function instead of the DataFrame method.
pd.merge(left=df1, right=df3, how="inner", on="name", suffixes=("_1", "_2"))


Unnamed: 0,name,age,education_1,position,salary,education_2
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


In [47]:
## Sort the resulting dataframe

In [85]:
# sort=True orders the result rows by the join key ("name").
# The default, sort=False, leaves the row order to be determined by the join
# type, so it does not demonstrate sorting.
pd.merge(df3, df1, on="name", sort=True)

Unnamed: 0,name,position,salary,education_x,age,education_y
0,Emma,manager,78000,BA,24,BA
1,Jack,CFO,160000,PhD,36,MSc


### 2.2) Outer Join

In [86]:
# Left table again: Oliver has no matching row in df3.
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [87]:
# Right table again: Mia has no matching row in df1.
df3

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [88]:
# Outer join: keep every name from either frame; the side without a match is
# filled with NaN (which is why age and salary are shown as floats).
df1.merge(right=df3, how="outer", on="name")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Oliver,21.0,BSc,,,
1,Emma,24.0,BA,manager,78000.0,BA
2,Jack,36.0,MSc,CFO,160000.0,PhD
3,Mia,,,analyst,56000.0,BSc


In [89]:
# Sort the rows as well: sort=True orders the result by the join key "name".
df1.merge(right=df3, how="outer", on="name", sort=True)

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24.0,BA,manager,78000.0,BA
1,Jack,36.0,MSc,CFO,160000.0,PhD
2,Mia,,,analyst,56000.0,BSc
3,Oliver,21.0,BSc,,,


### 2.3) Left Outer Join

In [90]:
# Left join: keep every row of df1; df3 columns are NaN where no match exists.
df1.merge(right=df3, how="left", on="name", sort=True)

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24,BA,manager,78000.0,BA
1,Jack,36,MSc,CFO,160000.0,PhD
2,Oliver,21,BSc,,,


### 2.4) Right Outer Join

In [92]:
# Right join: keep every row of df3; df1 columns are NaN where no match exists.
pd.merge(left=df1, right=df3, how="right", on="name", sort=True)

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24.0,BA,manager,78000,BA
1,Jack,36.0,MSc,CFO,160000,PhD
2,Mia,,,analyst,56000,BSc


### 2.5) `left_on` & `right_on` Parameters

In [93]:
# Starting point for this section: df3 with "name" as its key column.
df3

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [94]:
# Work on an independent copy so that changing df4 leaves df3 untouched.
df4 = df3.copy(deep=True)
df4

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [96]:
# Give the key column a different name than in df1, so the join below needs
# left_on / right_on. Reassign the result instead of using inplace=True, so
# the cell follows the usual "new object from old" pattern.
df4 = df4.rename(columns={"name": "first_name"})
df4

Unnamed: 0,first_name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [97]:
# df1 still uses "name" as its key column.
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [101]:
# The key columns are named differently in the two frames, so each side names
# its own key. Both key columns ("name" and "first_name") appear in the result.
pd.merge(
    left=df1,
    right=df4,
    how="inner",
    left_on="name",
    right_on="first_name",
    sort=True,
)

Unnamed: 0,name,age,education_x,first_name,position,salary,education_y
0,Emma,24,BA,Emma,manager,78000,BA
1,Jack,36,MSc,Jack,CFO,160000,PhD


## Case Study: Financial Statements for S&P 500 Constituents

In [3]:
# Load the three S&P 500 parquet files (paths are relative to the working directory).
BS = pd.read_parquet(path="SP500_balance_sheet.parquet")  # balance-sheet items
IS = pd.read_parquet(path="SP500_income_statement.parquet")  # income-statement items
companies = pd.read_parquet(path="SP500_constituents.parquet")  # constituent list

In [4]:
# Browse the constituents in ticker order (returns a sorted copy; `companies` is unchanged).
companies.sort_values(by="symbol")

Unnamed: 0,symbol,name,sector,subSector,headQuarter,dateFirstAdded,cik,founded
308,A,Agilent Technologies,Health Care,Health Care Equipment,"Santa Clara, California",2000-06-05,1090872,1999
149,AAL,American Airlines Group,Industrials,Passenger Airlines,"Fort Worth, Texas",2015-03-23,6201,1934
396,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30,320193,1977
173,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013
294,ABC,AmerisourceBergen,Health Care,Health Care Distributors,"Conshohocken, Pennsylvania",2001-08-30,1140859,1985
...,...,...,...,...,...,...,...,...
331,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
295,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
57,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
296,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [109]:
# Peek at three randomly drawn balance-sheet rows (the draw differs on every run).
BS.sample(n=3)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,PPENet,TotalReceivablesCurrent,RestrictedCashAndInvestmentsCurrent,RestrictedCashAndInvestmentsNoncurrent,RetainedEarnings,ShortTermBorrowings,ShortTermInvestments,StockholdersEquity,Equity,TreasuryStockValue
435925,RCL,2016-12-31,2016-12-31,USD,USD,884887.0,2016,FY,Royal Caribbean Cruises Ltd.,2016.0,...,20161430000.0,291899000.0,0.0,0.0,7860341000.0,0.0,0.0,9121412000.0,9121412000.0,1153308000.0
366036,MA,2018-12-31,2018-12-31,USD,USD,1141391.0,2018,FY,Mastercard Incorporated,2018.0,...,921000000.0,2276000000.0,553000000.0,0.0,27283000000.0,0.0,0.0,5395000000.0,5418000000.0,25750000000.0
187004,AOS,2020-12-31,2020-12-31,USD,USD,91142.0,2020,FY,A. O. Smith Corporation,2020.0,...,541300000.0,585000000.0,0.0,0.0,2509600000.0,0.0,116500000.0,1848300000.0,1848300000.0,1155900000.0


In [110]:
# Peek at three randomly drawn income-statement rows (the draw differs on every run).
IS.sample(n=3)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,Revenues,SellingAndMarketingExpense,SGAExpense,TangibleAssetImpairmentCharges,WaSharesDiluted,WaSharesBasic,cum_ratio,adj_factor,adj_EPSBasic,adj_EPSDiluted
277459,DPZ,2022-01-02,2021-12-31,USD,USD,1286681.0,2021,FY,"Domino's Pizza, Inc.",2021.0,...,4357373000.0,479501000.0,907834000.0,0.0,0.0,0.0,1.0,1.0,13.72,13.54
373542,LEN,2015-11-30,2015-11-30,USD,USD,920760.0,2015,FY,Lennar Corporation,2015.0,...,9474008000.0,0.0,216244000.0,0.0,0.0,0.0,12.0,1.017,3.80531,3.402163
358413,JKHY,2021-06-30,2021-06-30,USD,USD,779152.0,2021,FY,"Jack Henry & Associates, Inc.",2021.0,...,1758225000.0,0.0,187060000.0,0.0,0.0,0.0,18.0,1.0,4.12,4.12


In [74]:
## ROE and ROA
# ROE = NI / average(Total Equity) --> simplified version: NI / Equity
# ROA = NI / average(Total Assets) --> simplified version: NI / Assets

In [111]:
# Denominator of ROE: the Equity column of the balance sheet.
BS.loc[:, "Equity"]

162586    5.185000e+09
162587    5.289000e+09
162588    5.301000e+09
162589    4.170000e+09
162590    4.246000e+09
              ...     
529284    2.185000e+09
529285    2.708000e+09
529286    3.773000e+09
529287    4.544000e+09
529288    4.403000e+09
Name: Equity, Length: 5434, dtype: float64

In [112]:
# Numerator of both ratios: the NI column of the income statement.
IS.loc[:, "NI"]

172117    1.153000e+09
172118    7.240000e+08
172119    5.040000e+08
172120    4.010000e+08
172121    4.620000e+08
              ...     
551486    1.428000e+09
551487    1.500000e+09
551488    1.638000e+09
551489    2.037000e+09
551490    2.114000e+09
Name: NI, Length: 5455, dtype: float64

In [113]:
# First five balance-sheet rows, to look for columns usable as join keys.
BS.head(n=5)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,PPENet,TotalReceivablesCurrent,RestrictedCashAndInvestmentsCurrent,RestrictedCashAndInvestmentsNoncurrent,RetainedEarnings,ShortTermBorrowings,ShortTermInvestments,StockholdersEquity,Equity,TreasuryStockValue
162586,A,2012-10-31,2012-10-31,USD,USD,1090872.0,2012,FY,"Agilent Technologies, Inc.",2012.0,...,1164000000.0,923000000.0,0.0,0.0,5505000000.0,250000000.0,0.0,5182000000.0,5185000000.0,8707000000.0
162587,A,2013-10-31,2013-10-31,USD,USD,1090872.0,2013,FY,"Agilent Technologies, Inc.",2013.0,...,1134000000.0,899000000.0,0.0,0.0,6073000000.0,0.0,0.0,5286000000.0,5289000000.0,9607000000.0
162588,A,2014-10-31,2014-10-31,USD,USD,1090872.0,2014,FY,"Agilent Technologies, Inc.",2014.0,...,1101000000.0,983000000.0,0.0,0.0,6466000000.0,0.0,0.0,5298000000.0,5301000000.0,9807000000.0
162589,A,2015-10-31,2015-10-31,USD,USD,1090872.0,2015,FY,"Agilent Technologies, Inc.",2015.0,...,604000000.0,606000000.0,242000000.0,0.0,5581000000.0,0.0,0.0,4167000000.0,4170000000.0,10074000000.0
162590,A,2016-10-31,2016-10-31,USD,USD,1090872.0,2016,FY,"Agilent Technologies, Inc.",2016.0,...,639000000.0,631000000.0,0.0,0.0,6089000000.0,0.0,0.0,4243000000.0,4246000000.0,10508000000.0


In [114]:
# First five income-statement rows, for comparison with the balance sheet.
IS.head(n=5)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,Revenues,SellingAndMarketingExpense,SGAExpense,TangibleAssetImpairmentCharges,WaSharesDiluted,WaSharesBasic,cum_ratio,adj_factor,adj_EPSBasic,adj_EPSDiluted
172117,A,2012-10-31,2012-10-31,USD,USD,1090872.0,2012,FY,"Agilent Technologies, Inc.",2012.0,...,6858000000.0,0.0,1817000000.0,0.0,0.0,0.0,1.0,1.398,2.367668,2.339056
172118,A,2013-10-31,2013-10-31,USD,USD,1090872.0,2013,FY,"Agilent Technologies, Inc.",2013.0,...,6782000000.0,0.0,1880000000.0,0.0,0.0,0.0,1.0,1.398,1.516452,1.502146
172119,A,2014-10-31,2014-10-31,USD,USD,1090872.0,2014,FY,"Agilent Technologies, Inc.",2014.0,...,6981000000.0,0.0,2043000000.0,0.0,0.0,0.0,1.0,1.398,1.080114,1.065808
172120,A,2015-10-31,2015-10-31,USD,USD,1090872.0,2015,FY,"Agilent Technologies, Inc.",2015.0,...,4038000000.0,0.0,1189000000.0,0.0,0.0,0.0,1.398,1.0,1.2,1.2
172121,A,2016-10-31,2016-10-31,USD,USD,1090872.0,2016,FY,"Agilent Technologies, Inc.",2016.0,...,4202000000.0,0.0,1253000000.0,0.0,0.0,0.0,1.398,1.0,1.42,1.4


In [116]:
# Number of balance-sheet rows that repeat an earlier (symbol, date) pair;
# 0 means the pair identifies each row uniquely.
BS.duplicated(["symbol", "date"]).sum()

0

In [117]:
# Number of income-statement rows that repeat an earlier (symbol, date) pair;
# 0 means the pair identifies each row uniquely.
IS.duplicated(["symbol", "date"]).sum()

0

In [123]:
# Inner-join the balance sheet and the income statement on (symbol, date);
# only key pairs present in both statements are kept.
# validate="one_to_one" makes pandas raise a MergeError if the key pair is
# duplicated on either side, so the uniqueness assumption is enforced by the
# merge itself rather than only checked by eye.
# Overlapping non-key columns keep their plain name for the balance-sheet copy;
# the income-statement copy gets the "_is" suffix.
combined_data = BS.merge(
    IS,
    on=["symbol", "date"],
    how="inner",
    suffixes=("", "_is"),
    validate="one_to_one",
)

In [124]:
# Summarise the merged frame: row count, column count, dtypes and memory use.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5433 entries, 0 to 5432
Columns: 157 entries, symbol to adj_EPSDiluted
dtypes: category(8), datetime64[us](3), float64(141), int64(2), object(3)
memory usage: 7.5+ MB


In [125]:
# Column labels of the merged frame.
combined_data.columns

Index(['symbol', 'date', 'ddate', 'reportedCurrency', 'uom', 'cik',
       'calendarYear', 'period', 'companyName', 'fy',
       ...
       'Revenues', 'SellingAndMarketingExpense', 'SGAExpense',
       'TangibleAssetImpairmentCharges', 'WaSharesDiluted', 'WaSharesBasic',
       'cum_ratio', 'adj_factor', 'adj_EPSBasic', 'adj_EPSDiluted'],
      dtype='object', length=157)

In [126]:
# Spot-check the merge on a single company: all merged rows for Apple.
combined_data.loc[combined_data["symbol"] == "AAPL"]

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,Revenues,SellingAndMarketingExpense,SGAExpense,TangibleAssetImpairmentCharges,WaSharesDiluted,WaSharesBasic,cum_ratio,adj_factor,adj_EPSBasic,adj_EPSDiluted
22,AAPL,2012-09-29,2012-09-30,USD,USD,320193.0,2012,FY,Apple Inc.,2012.0,...,156508000000.0,0.0,10040000000.0,0.0,0.0,0.0,8.0,28.0,1.594286,1.576786
23,AAPL,2013-09-28,2013-09-30,USD,USD,320193.0,2013,FY,Apple Inc.,2013.0,...,170910000000.0,0.0,10830000000.0,0.0,0.0,0.0,8.0,28.0,1.429643,1.419643
24,AAPL,2014-09-27,2014-09-30,USD,USD,320193.0,2014,FY,Apple Inc.,2014.0,...,182795000000.0,0.0,11993000000.0,0.0,0.0,0.0,56.0,4.0,1.6225,1.6125
25,AAPL,2015-09-26,2015-09-30,USD,USD,320193.0,2015,FY,Apple Inc.,2015.0,...,233715000000.0,0.0,14329000000.0,0.0,0.0,0.0,56.0,4.0,2.32,2.305
26,AAPL,2016-09-24,2016-09-30,USD,USD,320193.0,2016,FY,Apple Inc.,2016.0,...,215639000000.0,0.0,14194000000.0,0.0,0.0,0.0,56.0,4.0,2.0875,2.0775
27,AAPL,2017-09-30,2017-09-30,USD,USD,320193.0,2017,FY,Apple Inc.,2017.0,...,229234000000.0,0.0,15261000000.0,0.0,0.0,0.0,56.0,4.0,2.3175,2.3025
28,AAPL,2018-09-29,2018-09-30,USD,USD,320193.0,2018,FY,Apple Inc.,2018.0,...,265595000000.0,0.0,16705000000.0,0.0,0.0,0.0,56.0,4.0,3.0025,2.9775
29,AAPL,2019-09-28,2019-09-30,USD,USD,320193.0,2019,FY,Apple Inc.,2019.0,...,260174000000.0,0.0,18245000000.0,0.0,0.0,0.0,56.0,4.0,2.9925,2.9725
30,AAPL,2020-09-26,2020-09-30,USD,USD,320193.0,2020,FY,Apple Inc.,2020.0,...,274515000000.0,0.0,19916000000.0,0.0,0.0,0.0,224.0,1.0,3.31,3.28
31,AAPL,2021-09-25,2021-09-30,USD,USD,320193.0,2021,FY,Apple Inc.,2021.0,...,365817000000.0,0.0,21973000000.0,0.0,0.0,0.0,224.0,1.0,5.67,5.61


In [127]:
# Simplified return ratios: net income over the same-row Equity / Assets
# balance (no averaging across periods).
combined_data["ROE"] = combined_data["NI"].div(combined_data["Equity"])
combined_data["ROA"] = combined_data["NI"].div(combined_data["Assets"])

In [130]:
# A zero denominator yields +inf for a positive numerator and -inf for a
# negative one; treat both as missing.
combined_data = combined_data.replace([np.inf, -np.inf], np.nan)

In [132]:
# Inspect Boeing: rows with negative Equity and negative NI (2019-2021) still
# show a positive ROE at this point.
combined_data.loc[
    combined_data["symbol"] == "BA",
    ["symbol", "companyName", "date", "fy", "Equity", "Assets", "NI", "ROE", "ROA"],
]

Unnamed: 0,symbol,companyName,date,fy,Equity,Assets,NI,ROE,ROA
572,BA,The Boeing Company,2012-12-31,2012.0,5967000000.0,88896000000.0,3900000000.0,0.653595,0.043871
573,BA,The Boeing Company,2013-12-31,2013.0,14997000000.0,92663000000.0,4585000000.0,0.305728,0.04948
574,BA,The Boeing Company,2014-12-31,2014.0,8790000000.0,99198000000.0,5446000000.0,0.619568,0.0549
575,BA,The Boeing Company,2015-12-31,2015.0,6397000000.0,94408000000.0,5176000000.0,0.809129,0.054826
576,BA,The Boeing Company,2016-12-31,2016.0,877000000.0,89997000000.0,4895000000.0,5.581528,0.054391
577,BA,The Boeing Company,2017-12-31,2017.0,412000000.0,92333000000.0,8197000000.0,19.895631,0.088776
578,BA,The Boeing Company,2018-12-31,2018.0,410000000.0,117359000000.0,10460000000.0,25.512195,0.089128
579,BA,The Boeing Company,2019-12-31,2019.0,-8300000000.0,133625000000.0,-636000000.0,0.076627,-0.00476
580,BA,The Boeing Company,2020-12-31,2020.0,-18075000000.0,152136000000.0,-11873000000.0,0.656874,-0.078042
581,BA,The Boeing Company,2021-12-31,2021.0,-14846000000.0,138552000000.0,-4202000000.0,0.283039,-0.030328


In [133]:
# A ratio over a zero or negative denominator is not meaningful (a loss over
# negative equity would show up as a positive ROE), so mark those as missing.
combined_data.loc[combined_data["Equity"].le(0), "ROE"] = np.nan
combined_data.loc[combined_data["Assets"].le(0), "ROA"] = np.nan

In [134]:
# Inspect Boeing again: ROE is now missing for the rows with negative Equity.
combined_data.loc[
    combined_data["symbol"] == "BA",
    ["symbol", "companyName", "date", "fy", "Equity", "Assets", "NI", "ROE", "ROA"],
]

Unnamed: 0,symbol,companyName,date,fy,Equity,Assets,NI,ROE,ROA
572,BA,The Boeing Company,2012-12-31,2012.0,5967000000.0,88896000000.0,3900000000.0,0.653595,0.043871
573,BA,The Boeing Company,2013-12-31,2013.0,14997000000.0,92663000000.0,4585000000.0,0.305728,0.04948
574,BA,The Boeing Company,2014-12-31,2014.0,8790000000.0,99198000000.0,5446000000.0,0.619568,0.0549
575,BA,The Boeing Company,2015-12-31,2015.0,6397000000.0,94408000000.0,5176000000.0,0.809129,0.054826
576,BA,The Boeing Company,2016-12-31,2016.0,877000000.0,89997000000.0,4895000000.0,5.581528,0.054391
577,BA,The Boeing Company,2017-12-31,2017.0,412000000.0,92333000000.0,8197000000.0,19.895631,0.088776
578,BA,The Boeing Company,2018-12-31,2018.0,410000000.0,117359000000.0,10460000000.0,25.512195,0.089128
579,BA,The Boeing Company,2019-12-31,2019.0,-8300000000.0,133625000000.0,-636000000.0,,-0.00476
580,BA,The Boeing Company,2020-12-31,2020.0,-18075000000.0,152136000000.0,-11873000000.0,,-0.078042
581,BA,The Boeing Company,2021-12-31,2021.0,-14846000000.0,138552000000.0,-4202000000.0,,-0.030328


In [135]:
# Summary statistics of both ratios. NOTE(review): the extreme ROE values are
# presumably driven by very small positive Equity (cf. the BA rows) - confirm
# before using the mean/std.
combined_data.loc[:, ["ROE", "ROA"]].describe()

Unnamed: 0,ROE,ROA
count,5206.0,5430.0
mean,0.710554,0.069352
std,13.696224,0.349673
min,-51.893846,-1.226993
25%,0.080116,0.023098
50%,0.142719,0.054738
75%,0.25047,0.098931
max,701.857143,24.983985
