In [1]:
import pandas as pd
import numpy as np

In [3]:
df1 = pd.DataFrame(
    {
        "name": ["Oliver", "Emma", "Jack"],
        "age": [21, 24, 36],
        "education": ["BSc", "BA", "MSc"],
    }
)

df2 = pd.DataFrame(
    {
        "name": ["Jimmy", "Frank", "Sue", "Ava"],
        "age": [20, 18, 65, 40],
        "income": [40000, 0, 120000, 300000]
    }
)

df3 = pd.DataFrame(
    {
        "name": ["Emma", "Jack", "Mia"],
        "position": ["manager", "CFO", "analyst"],
        "salary": [78000, 160000, 56000],
        "education": ["BA", "PhD", "BSc"],
    }
)

In [4]:
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [5]:
df2

Unnamed: 0,name,age,income
0,Jimmy,20,40000
1,Frank,18,0
2,Sue,65,120000
3,Ava,40,300000


In [6]:
df3

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


# vertical combination


In [9]:
pd.concat([df1,df2])

Unnamed: 0,name,age,education,income
0,Oliver,21,BSc,
1,Emma,24,BA,
2,Jack,36,MSc,
0,Jimmy,20,,40000.0
1,Frank,18,,0.0
2,Sue,65,,120000.0
3,Ava,40,,300000.0


In [11]:
pd.concat([df1,df2], join="outer") # outer padrão

Unnamed: 0,name,age,education,income
0,Oliver,21,BSc,
1,Emma,24,BA,
2,Jack,36,MSc,
0,Jimmy,20,,40000.0
1,Frank,18,,0.0
2,Sue,65,,120000.0
3,Ava,40,,300000.0


In [12]:
pd.concat([df1,df2], join="inner") # somente colunas comuns

Unnamed: 0,name,age
0,Oliver,21
1,Emma,24
2,Jack,36
0,Jimmy,20
1,Frank,18
2,Sue,65
3,Ava,40


In [13]:
pd.concat([df1,df2], join="outer",sort=True) #ordem colunas

Unnamed: 0,age,education,income,name
0,21,BSc,,Oliver
1,24,BA,,Emma
2,36,MSc,,Jack
0,20,,40000.0,Jimmy
1,18,,0.0,Frank
2,65,,120000.0,Sue
3,40,,300000.0,Ava


# Horizontal merge/join

In [14]:
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [15]:
df2

Unnamed: 0,name,age,income
0,Jimmy,20,40000
1,Frank,18,0
2,Sue,65,120000
3,Ava,40,300000


In [17]:
df1.merge(df3, on="name")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


In [18]:
df1.merge(df3, on="name", how="inner")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


In [19]:
df1.merge(df3, on="name", how="inner", suffixes=["_1", "_2"])

Unnamed: 0,name,age,education_1,position,salary,education_2
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


In [28]:
# sintaxe alternativa
pd.merge(df1, df3, on="name", sort=False)

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24,BA,manager,78000,BA
1,Jack,36,MSc,CFO,160000,PhD


# outer join

In [30]:
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [31]:
df3

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [32]:
df1.merge(df3, on="name", how="outer")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Oliver,21.0,BSc,,,
1,Emma,24.0,BA,manager,78000.0,BA
2,Jack,36.0,MSc,CFO,160000.0,PhD
3,Mia,,,analyst,56000.0,BSc


# left outer join

In [33]:
df1.merge(df3, on="name", how="left")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Oliver,21,BSc,,,
1,Emma,24,BA,manager,78000.0,BA
2,Jack,36,MSc,CFO,160000.0,PhD


In [34]:
df1.merge(df3, on="name", how="right")

Unnamed: 0,name,age,education_x,position,salary,education_y
0,Emma,24.0,BA,manager,78000,BA
1,Jack,36.0,MSc,CFO,160000,PhD
2,Mia,,,analyst,56000,BSc


# left_on right_on

In [35]:
df4 =df3.copy()

In [36]:
df4

Unnamed: 0,name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [38]:
df4.rename({"name":"first_name"}, inplace=True,axis=1)

In [39]:
df4

Unnamed: 0,first_name,position,salary,education
0,Emma,manager,78000,BA
1,Jack,CFO,160000,PhD
2,Mia,analyst,56000,BSc


In [40]:
df1

Unnamed: 0,name,age,education
0,Oliver,21,BSc
1,Emma,24,BA
2,Jack,36,MSc


In [41]:
df1.merge(df4, left_on="name", right_on="first_name", how="outer")

Unnamed: 0,name,age,education_x,first_name,position,salary,education_y
0,Oliver,21.0,BSc,,,,
1,Emma,24.0,BA,Emma,manager,78000.0,BA
2,Jack,36.0,MSc,Jack,CFO,160000.0,PhD
3,,,,Mia,analyst,56000.0,BSc


# Case Study 1

In [44]:
BS =pd.read_parquet("data+files/SP500_balance_sheet.parquet")
IS =pd.read_parquet("data+files/SP500_income_statement.parquet")
companies = pd.read_parquet("data+files/SP500_constituents.parquet")

In [46]:
companies.sort_values("symbol")

Unnamed: 0,symbol,name,sector,subSector,headQuarter,dateFirstAdded,cik,founded
308,A,Agilent Technologies,Health Care,Health Care Equipment,"Santa Clara, California",2000-06-05,1090872,1999
149,AAL,American Airlines Group,Industrials,Passenger Airlines,"Fort Worth, Texas",2015-03-23,6201,1934
396,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30,320193,1977
173,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013
294,ABC,AmerisourceBergen,Health Care,Health Care Distributors,"Conshohocken, Pennsylvania",2001-08-30,1140859,1985
...,...,...,...,...,...,...,...,...
331,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
295,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
57,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
296,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [48]:
BS.sample(3)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,PPENet,TotalReceivablesCurrent,RestrictedCashAndInvestmentsCurrent,RestrictedCashAndInvestmentsNoncurrent,RetainedEarnings,ShortTermBorrowings,ShortTermInvestments,StockholdersEquity,Equity,TreasuryStockValue
505924,USB,2022-12-31,2022-12-31,USD,USD,36104.0,2022,FY,U.S. Bancorp,2022.0,...,3858000000.0,388213000000.0,0.0,0.0,71901000000.0,31216000000.0,161650000000.0,50766000000.0,51232000000.0,25269000000.0
257490,DAL,2016-12-31,2016-12-31,USD,USD,27904.0,2016,FY,"Delta Air Lines, Inc.",2016.0,...,24375000000.0,2064000000.0,0.0,0.0,7903000000.0,0.0,487000000.0,12287000000.0,12287000000.0,274000000.0
180033,ALLE,2016-12-31,2016-12-31,USD,USD,1579241.0,2016,FY,Allegion plc,2016.0,...,226600000.0,260000000.0,0.0,0.0,376600000.0,0.0,0.0,113300000.0,116400000.0,0.0


In [49]:
IS.sample(3)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,Revenues,SellingAndMarketingExpense,SGAExpense,TangibleAssetImpairmentCharges,WaSharesDiluted,WaSharesBasic,cum_ratio,adj_factor,adj_EPSBasic,adj_EPSDiluted
318442,GIS,2014-05-25,2014-05-31,USD,USD,40704.0,2014,FY,"General Mills, Inc.",2014.0,...,0.0,0.0,3474300000.0,0.0,0.0,0.0,19.358742,1.0,2.9,2.83
436634,PHM,2013-12-31,2013-12-31,USD,USD,822416.0,2013,FY,"PulteGroup, Inc.",2013.0,...,5679595000.0,0.0,568500000.0,0.0,0.0,0.0,96.0,1.0,6.79,6.72
208345,AVB,2013-12-31,2013-12-31,USD,USD,915912.0,2013,FY,"AvalonBay Communities, Inc.",2013.0,...,1462921000.0,0.0,39573000.0,0.0,0.0,0.0,0.888889,1.0,2.78,2.78


In [55]:
# ROE = Net Income/average(total equity)
# ROA = NI/avg Total Assets

In [52]:
BS["Equity"]

162586    5.185000e+09
162587    5.289000e+09
162588    5.301000e+09
162589    4.170000e+09
162590    4.246000e+09
              ...     
529284    2.185000e+09
529285    2.708000e+09
529286    3.773000e+09
529287    4.544000e+09
529288    4.403000e+09
Name: Equity, Length: 5434, dtype: float64

In [54]:
IS["NI"]

172117    1.153000e+09
172118    7.240000e+08
172119    5.040000e+08
172120    4.010000e+08
172121    4.620000e+08
              ...     
551486    1.428000e+09
551487    1.500000e+09
551488    1.638000e+09
551489    2.037000e+09
551490    2.114000e+09
Name: NI, Length: 5455, dtype: float64

In [56]:
BS.head(5)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,PPENet,TotalReceivablesCurrent,RestrictedCashAndInvestmentsCurrent,RestrictedCashAndInvestmentsNoncurrent,RetainedEarnings,ShortTermBorrowings,ShortTermInvestments,StockholdersEquity,Equity,TreasuryStockValue
162586,A,2012-10-31,2012-10-31,USD,USD,1090872.0,2012,FY,"Agilent Technologies, Inc.",2012.0,...,1164000000.0,923000000.0,0.0,0.0,5505000000.0,250000000.0,0.0,5182000000.0,5185000000.0,8707000000.0
162587,A,2013-10-31,2013-10-31,USD,USD,1090872.0,2013,FY,"Agilent Technologies, Inc.",2013.0,...,1134000000.0,899000000.0,0.0,0.0,6073000000.0,0.0,0.0,5286000000.0,5289000000.0,9607000000.0
162588,A,2014-10-31,2014-10-31,USD,USD,1090872.0,2014,FY,"Agilent Technologies, Inc.",2014.0,...,1101000000.0,983000000.0,0.0,0.0,6466000000.0,0.0,0.0,5298000000.0,5301000000.0,9807000000.0
162589,A,2015-10-31,2015-10-31,USD,USD,1090872.0,2015,FY,"Agilent Technologies, Inc.",2015.0,...,604000000.0,606000000.0,242000000.0,0.0,5581000000.0,0.0,0.0,4167000000.0,4170000000.0,10074000000.0
162590,A,2016-10-31,2016-10-31,USD,USD,1090872.0,2016,FY,"Agilent Technologies, Inc.",2016.0,...,639000000.0,631000000.0,0.0,0.0,6089000000.0,0.0,0.0,4243000000.0,4246000000.0,10508000000.0


In [57]:
IS.head(5)

Unnamed: 0,symbol,date,ddate,reportedCurrency,uom,cik,calendarYear,period,companyName,fy,...,Revenues,SellingAndMarketingExpense,SGAExpense,TangibleAssetImpairmentCharges,WaSharesDiluted,WaSharesBasic,cum_ratio,adj_factor,adj_EPSBasic,adj_EPSDiluted
172117,A,2012-10-31,2012-10-31,USD,USD,1090872.0,2012,FY,"Agilent Technologies, Inc.",2012.0,...,6858000000.0,0.0,1817000000.0,0.0,0.0,0.0,1.0,1.398,2.367668,2.339056
172118,A,2013-10-31,2013-10-31,USD,USD,1090872.0,2013,FY,"Agilent Technologies, Inc.",2013.0,...,6782000000.0,0.0,1880000000.0,0.0,0.0,0.0,1.0,1.398,1.516452,1.502146
172119,A,2014-10-31,2014-10-31,USD,USD,1090872.0,2014,FY,"Agilent Technologies, Inc.",2014.0,...,6981000000.0,0.0,2043000000.0,0.0,0.0,0.0,1.0,1.398,1.080114,1.065808
172120,A,2015-10-31,2015-10-31,USD,USD,1090872.0,2015,FY,"Agilent Technologies, Inc.",2015.0,...,4038000000.0,0.0,1189000000.0,0.0,0.0,0.0,1.398,1.0,1.2,1.2
172121,A,2016-10-31,2016-10-31,USD,USD,1090872.0,2016,FY,"Agilent Technologies, Inc.",2016.0,...,4202000000.0,0.0,1253000000.0,0.0,0.0,0.0,1.398,1.0,1.42,1.4


In [59]:
# verificar se existe conbinacao de symbol e date
BS.duplicated(subset=["symbol","date"]).sum()

0

In [60]:
IS.duplicated(subset=["symbol","date"]).sum()

0

In [65]:
combined_data= BS.merge(IS, on=["symbol","date"], how="inner",suffixes=["","_is"])

In [66]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5433 entries, 0 to 5432
Columns: 157 entries, symbol to adj_EPSDiluted
dtypes: category(8), datetime64[ns](3), float64(141), int64(2), object(3)
memory usage: 7.5+ MB


In [67]:
combined_data.columns

Index(['symbol', 'date', 'ddate', 'reportedCurrency', 'uom', 'cik',
       'calendarYear', 'period', 'companyName', 'fy',
       ...
       'Revenues', 'SellingAndMarketingExpense', 'SGAExpense',
       'TangibleAssetImpairmentCharges', 'WaSharesDiluted', 'WaSharesBasic',
       'cum_ratio', 'adj_factor', 'adj_EPSBasic', 'adj_EPSDiluted'],
      dtype='object', length=157)

In [68]:
combined_data["ROE"] = combined_data.NI/combined_data.Equity
combined_data["ROA"] = combined_data.NI/combined_data.Assets

In [69]:
combined_data[["ROE","ROA"]].describe()

Unnamed: 0,ROE,ROA
count,5432.0,5432.0
mean,inf,inf
std,,
min,-1051.410377,-1.226993
25%,0.073824,0.023098
50%,0.138613,0.054762
75%,0.248144,0.099201
max,inf,inf


In [71]:
combined_data =combined_data.replace(np.inf, np.nan) # tirar o infinito

In [72]:
combined_data[["ROE","ROA"]].describe()

Unnamed: 0,ROE,ROA
count,5429.0,5430.0
mean,0.373219,0.069352
std,19.720843,0.349673
min,-1051.410377,-1.226993
25%,0.073739,0.023098
50%,0.138532,0.054738
75%,0.247682,0.098931
max,701.857143,24.983985


In [76]:
# exemplo com valoras negativos

combined_data[
    ["symbol",
      "companyName",
        "date",
          "fy",
            "Equity",
              "Assets",
                "NI","ROE","ROA"]].loc[combined_data["symbol"] == "BA"]

Unnamed: 0,symbol,companyName,date,fy,Equity,Assets,NI,ROE,ROA
572,BA,The Boeing Company,2012-12-31,2012.0,5967000000.0,88896000000.0,3900000000.0,0.653595,0.043871
573,BA,The Boeing Company,2013-12-31,2013.0,14997000000.0,92663000000.0,4585000000.0,0.305728,0.04948
574,BA,The Boeing Company,2014-12-31,2014.0,8790000000.0,99198000000.0,5446000000.0,0.619568,0.0549
575,BA,The Boeing Company,2015-12-31,2015.0,6397000000.0,94408000000.0,5176000000.0,0.809129,0.054826
576,BA,The Boeing Company,2016-12-31,2016.0,877000000.0,89997000000.0,4895000000.0,5.581528,0.054391
577,BA,The Boeing Company,2017-12-31,2017.0,412000000.0,92333000000.0,8197000000.0,19.895631,0.088776
578,BA,The Boeing Company,2018-12-31,2018.0,410000000.0,117359000000.0,10460000000.0,25.512195,0.089128
579,BA,The Boeing Company,2019-12-31,2019.0,-8300000000.0,133625000000.0,-636000000.0,0.076627,-0.00476
580,BA,The Boeing Company,2020-12-31,2020.0,-18075000000.0,152136000000.0,-11873000000.0,0.656874,-0.078042
581,BA,The Boeing Company,2021-12-31,2021.0,-14846000000.0,138552000000.0,-4202000000.0,0.283039,-0.030328


In [77]:
combined_data.loc[combined_data.Equity<=0, "ROE"] = np.nan
combined_data.loc[combined_data.Equity<=0, "ROA"] = np.nan

In [78]:
combined_data[["ROE","ROA"]].describe()

Unnamed: 0,ROE,ROA
count,5206.0,5204.0
mean,0.710554,0.067664
std,13.696224,0.354252
min,-51.893846,-1.226993
25%,0.080116,0.022949
50%,0.142719,0.053806
75%,0.25047,0.096431
max,701.857143,24.983985
