In [12]:
import pandas as pd
import numpy as np

In [22]:
# Load the three example datasets used throughout this notebook.
personal_data = pd.read_csv("personal_data.csv")  # long format: one row per person and measurement
sales = pd.read_csv("sales.csv")  # one row per sale record: date, item, quantity, sales
BS = pd.read_parquet("balance_sheet.parquet")  # wide format: one column per fiscal year

## a) Reshape from Long to Wide: pivot()

In [26]:
# Order rows by first name, then last name, so each person's measurements sit together.
personal_data = personal_data.sort_values(by=["first", "last"])

In [27]:
# Display the sorted long-format frame: one row per (first, last, measurement).
personal_data

Unnamed: 0,first,last,measurement,value
0,John,Doe,height,188
4,John,Doe,weight,200
8,John,Doe,eye_color,blue
12,John,Doe,age,29
3,John,Lee,height,169
7,John,Lee,weight,178
11,John,Lee,eye_color,brown
15,John,Lee,age,80
1,Mary,Bo,height,160
5,Mary,Bo,weight,120


In [29]:
# Long -> wide: each (first, last) pair becomes one row and every distinct
# `measurement` becomes its own column, filled from `value`.
personal_data_wide = personal_data.pivot(
    index=["first", "last"],
    columns="measurement",
    values="value",
)

In [30]:
# Inspect the wide result: one row per person, one column per measurement.
personal_data_wide

Unnamed: 0_level_0,measurement,age,eye_color,height,weight
first,last,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
John,Doe,29,blue,188,200
John,Lee,80,brown,169,178
Mary,Bo,38,brown,160,120
Sean,Kay,67,black,175,166


In [36]:
# `value` mixed text and numbers in the long frame, so cast the numeric
# measurements to float before computing statistics on them.
personal_data_wide[["age", "height", "weight"]] = (
    personal_data_wide[["age", "height", "weight"]].astype(float)
)

In [38]:
# Summary statistics across people for the numeric measurements.
(
    personal_data_wide[["age", "height", "weight"]]
    .agg(["mean", "median", "max", "min"])
)

measurement,age,height,weight
mean,53.5,173.0,166.0
median,52.5,172.0,172.0
max,80.0,188.0,200.0
min,29.0,160.0,120.0


In [42]:
# Add a second value column to pivot on. `value` holds strings, so `* 2`
# repeats the text ("188" -> "188188") instead of doubling a number.
personal_data = personal_data.assign(value2=personal_data["value"] * 2)
personal_data

Unnamed: 0,first,last,measurement,value,value2
0,John,Doe,height,188,188188
4,John,Doe,weight,200,200200
8,John,Doe,eye_color,blue,blueblue
12,John,Doe,age,29,2929
3,John,Lee,height,169,169169
7,John,Lee,weight,178,178178
11,John,Lee,eye_color,brown,brownbrown
15,John,Lee,age,80,8080
1,Mary,Bo,height,160,160160
5,Mary,Bo,weight,120,120120


In [70]:
# Without `values=`, every remaining column (`value` and `value2`) is pivoted,
# which produces two-level column labels.
(
    personal_data
    .pivot(index=["first", "last"], columns="measurement")
    .reset_index()
)

Unnamed: 0_level_0,first,last,value,value,value,value,value2,value2,value2,value2
measurement,Unnamed: 1_level_1,Unnamed: 2_level_1,age,eye_color,height,weight,age,eye_color,height,weight
0,John,Doe,29,blue,188,200,2929,blueblue,188188,200200
1,John,Lee,80,brown,169,178,8080,brownbrown,169169,178178
2,Mary,Bo,38,brown,160,120,3838,brownbrown,160160,120120
3,Sean,Kay,67,black,175,166,6767,blackblack,175175,166166


## b) Reshape from Long to Wide with Aggregation: pivot_table()

In [56]:
# Preview the raw sales records (columns: date, item, quantity, sales).
sales

Unnamed: 0,date,item,quantity,sales
0,2023-09-01,PC,4,1802
1,2023-09-01,Washer,1,1741
2,2023-09-01,Fridge,1,1654
3,2023-09-01,Washer,1,715
4,2023-09-01,Fridge,1,762
5,2023-09-01,PC,5,604
6,2023-09-01,Washer,3,3640
7,2023-09-01,Fridge,2,3612
8,2023-09-02,Fridge,1,1629
9,2023-09-02,TV,2,2155


In [71]:
# Long -> wide with aggregation: one row per date and one column per
# (aggfunc, "sales", item) combination; combinations with no sales become 0.
result_2 = sales.pivot_table(
    values=["sales"],
    index="date",
    columns="item",
    aggfunc=["sum", "mean", "max"],
    fill_value=0,
)
result_2

Unnamed: 0_level_0,sum,sum,sum,sum,mean,mean,mean,mean,max,max,max,max
Unnamed: 0_level_1,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
item,Fridge,PC,TV,Washer,Fridge,PC,TV,Washer,Fridge,PC,TV,Washer
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
2023-09-01,6028,2406,0,6096,2009.333333,1203.0,0.0,2032.0,3612,1802,0,3640
2023-09-02,3343,1963,6022,4622,1671.5,1963.0,2007.333333,2311.0,1714,1963,3281,3203
2023-09-03,0,9107,886,4103,0.0,2276.75,886.0,1367.666667,0,3632,886,2316


In [63]:
# Inspect the column labels: a three-level MultiIndex of (aggfunc, value column, item).
result_2.columns

MultiIndex([( 'sum', 'sales', 'Fridge'),
            ( 'sum', 'sales',     'PC'),
            ( 'sum', 'sales',     'TV'),
            ( 'sum', 'sales', 'Washer'),
            ('mean', 'sales', 'Fridge'),
            ('mean', 'sales',     'PC'),
            ('mean', 'sales',     'TV'),
            ('mean', 'sales', 'Washer'),
            ( 'max', 'sales', 'Fridge'),
            ( 'max', 'sales',     'PC'),
            ( 'max', 'sales',     'TV'),
            ( 'max', 'sales', 'Washer')],
           names=[None, None, 'item'])

In [72]:
# Flatten each (aggfunc, value, item) column tuple into a single
# "value-item-aggfunc" string label.
result_2.columns = [
    "-".join((value_name, item, agg_name))
    for agg_name, value_name, item in result_2.columns
]

In [73]:
# Move `date` out of the index into a regular column.
# NOTE(review): this cell overwrites its own input, so it is not idempotent —
# re-running it would likely add a spurious `index` column; run it once.
result_2 = result_2.reset_index()

In [74]:
# Flat result: `date` plus one column per "value-item-aggfunc" label.
result_2

Unnamed: 0,date,sales-Fridge-sum,sales-PC-sum,sales-TV-sum,sales-Washer-sum,sales-Fridge-mean,sales-PC-mean,sales-TV-mean,sales-Washer-mean,sales-Fridge-max,sales-PC-max,sales-TV-max,sales-Washer-max
0,2023-09-01,6028,2406,0,6096,2009.333333,1203.0,0.0,2032.0,3612,1802,0,3640
1,2023-09-02,3343,1963,6022,4622,1671.5,1963.0,2007.333333,2311.0,1714,1963,3281,3203
2,2023-09-03,0,9107,886,4103,0.0,2276.75,886.0,1367.666667,0,3632,886,2316


In [78]:
# Simplest case: total sales per date and item, with absent combinations shown as 0.
(
    sales
    .pivot_table(values="sales", index="date", columns="item", aggfunc="sum", fill_value=0)
    .reset_index()
)

item,date,Fridge,PC,TV,Washer
0,2023-09-01,6028,2406,0,6096
1,2023-09-02,3343,1963,6022,4622
2,2023-09-03,0,9107,886,4103


## c) Reshape from Wide to Long: melt()

In [81]:
# Rebuild the wide frame, this time turning the (first, last) index back into
# ordinary columns so they can serve as identifiers when melting.
personal_data_wide = (
    personal_data
    .pivot(index=["first", "last"], columns="measurement", values="value")
    .reset_index()
)

In [82]:
# Inspect the wide frame: `first` and `last` are now regular columns next to the measurements.
personal_data_wide

measurement,first,last,age,eye_color,height,weight
0,John,Doe,29,blue,188,200
1,John,Lee,80,brown,169,178
2,Mary,Bo,38,brown,160,120
3,Sean,Kay,67,black,175,166


In [88]:
# Wide -> long with melt(): keep the name columns as identifiers and stack the
# measurement columns into (metrics, actual_values) pairs.
personal_data_wide.melt(
    id_vars=["first", "last"],
    value_vars=["age", "eye_color", "height", "weight"],
    var_name="metrics",
    value_name="actual_values",
)

Unnamed: 0,first,last,metrics,actual_values
0,John,Doe,age,29
1,John,Lee,age,80
2,Mary,Bo,age,38
3,Sean,Kay,age,67
4,John,Doe,eye_color,blue
5,John,Lee,eye_color,brown
6,Mary,Bo,eye_color,brown
7,Sean,Kay,eye_color,black
8,John,Doe,height,188
9,John,Lee,height,169


## d) Case Study: Balance Sheets Data Reshaped

In [90]:
# Summarize the raw balance-sheet frame: column names, dtypes and non-null counts per year column.
BS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55149 entries, 0 to 55148
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   cik     55149 non-null  float64 
 1   name    55149 non-null  category
 2   metric  55149 non-null  object  
 3   2008    12 non-null     float64 
 4   2009    1443 non-null   float64 
 5   2010    4200 non-null   float64 
 6   2011    21414 non-null  float64 
 7   2012    22305 non-null  float64 
 8   2013    22272 non-null  float64 
 9   2014    21870 non-null  float64 
 10  2015    20712 non-null  float64 
 11  2016    19590 non-null  float64 
 12  2017    20166 non-null  float64 
 13  2018    19956 non-null  float64 
 14  2019    19512 non-null  float64 
 15  2020    20190 non-null  float64 
 16  2021    22758 non-null  float64 
 17  2022    21570 non-null  float64 
 18  2023    84 non-null     float64 
dtypes: category(1), float64(17), object(1)
memory usage: 8.3+ MB


In [93]:
# Replace the row index with a fresh default one, discarding the old index (drop=True).
BS = BS.reset_index(drop=True)
# Display the wide frame: one row per (cik, name, metric), one column per fiscal year.
BS

fy,cik,name,metric,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1750.0,AAR CORP,Assets,,,,1.703727e+09,2.195653e+09,2.136900e+09,2.199500e+09,1.515000e+09,1.442100e+09,1.504100e+09,1.524700e+09,1.517200e+09,2.079000e+09,1.539700e+09,1.573900e+09,
1,1750.0,AAR CORP,Equity,,,,8.352890e+08,8.660220e+08,9.195000e+08,1.000700e+09,8.451000e+08,8.658000e+08,9.142000e+08,9.363000e+08,9.059000e+08,9.026000e+08,9.744000e+08,1.034500e+09,
2,1750.0,AAR CORP,Liabilities,,,,8.684380e+08,1.329631e+09,1.217400e+09,1.198800e+09,6.699000e+08,5.763000e+08,5.899000e+08,5.884000e+08,6.113000e+08,1.176400e+09,5.653000e+08,5.394000e+08,
3,1800.0,ABBOTT LABORATORIES,Assets,,5.241662e+10,5.946227e+10,6.027689e+10,6.723494e+10,4.295300e+10,4.127500e+10,4.124700e+10,5.266600e+10,7.625000e+10,6.717300e+10,6.788700e+10,7.254800e+10,7.519600e+10,7.443800e+10,
4,1800.0,ABBOTT LABORATORIES,Equity,,2.289873e+10,2.247646e+10,2.452614e+10,2.681323e+10,2.526700e+10,2.163900e+10,2.132600e+10,2.071700e+10,3.109800e+10,3.072200e+10,3.130100e+10,3.300300e+10,3.602400e+10,3.690500e+10,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55144,1953530.0,OCULIS HOLDING AG,Equity,,,,,,,,,,,,,,,0.000000e+00,
55145,1953530.0,OCULIS HOLDING AG,Liabilities,,,,,,,,,,,,,,,1.350510e+08,
55146,1962738.0,"CASI PHARMACEUTICALS, INC.",Assets,,,,,,,,,,,,,,,9.623400e+07,
55147,1962738.0,"CASI PHARMACEUTICALS, INC.",Equity,,,,,,,,,,,,,,,4.457400e+07,


In [98]:
# Step 1 (wide -> long): stack every year column into (year, value) pairs,
# keeping cik / name / metric as identifier columns.
temp = BS.melt(
    id_vars=["cik", "name", "metric"],
    var_name="year",
    value_name="value",
)

In [105]:
# Step 2 (long -> wide): spread `metric` back out into columns so each row is
# one company-year, then drop every row that contains a missing value.
BS_clean = (
    temp
    .pivot(index=["cik", "name", "year"], columns="metric", values="value")
    .reset_index()
    .dropna()
)

In [106]:
# One row per (cik, name, year) with Assets, Equity and Liabilities as separate columns.
BS_clean # Tidy data format!

metric,cik,name,year,Assets,Equity,Liabilities
3,1750.0,AAR CORP,2011,1.703727e+09,8.352890e+08,8.684380e+08
4,1750.0,AAR CORP,2012,2.195653e+09,8.660220e+08,1.329631e+09
5,1750.0,AAR CORP,2013,2.136900e+09,9.195000e+08,1.217400e+09
6,1750.0,AAR CORP,2014,2.199500e+09,1.000700e+09,1.198800e+09
7,1750.0,AAR CORP,2015,1.515000e+09,8.451000e+08,6.699000e+08
...,...,...,...,...,...,...
294062,1948056.0,KKR INFRASTRUCTURE CONGLOMERATE LLC,2022,6.701163e+06,1.000000e+03,6.700163e+06
294078,1949543.0,SITIO ROYALTIES CORP.,2022,5.170902e+09,3.886595e+09,1.284307e+09
294094,1951378.0,QILUN GROUP INC.,2022,1.534338e+07,6.345809e+06,8.997570e+06
294110,1953530.0,OCULIS HOLDING AG,2022,3.706000e+07,0.000000e+00,1.350510e+08


In [107]:
# Distribution summary (count, mean, std, quartiles, extremes) of the three metrics.
(
    BS_clean[["Assets", "Equity", "Liabilities"]]
    .describe()
)

metric,Assets,Equity,Liabilities
count,86018.0,86018.0,86018.0
mean,217399900000.0,11515240000.0,182135800000.0
std,7535202000000.0,324230800000.0,6796709000000.0
min,-1654400000.0,-3582720000000.0,-928000000.0
25%,18169600.0,0.0,6155153.0
50%,352909000.0,71814500.0,134427500.0
75%,2584576000.0,663385200.0,1578875000.0
max,675884300000000.0,20737680000000.0,624753900000000.0


In [108]:
# Hand-picked statistics for the three metrics, in the order requested.
(
    BS_clean[["Assets", "Equity", "Liabilities"]]
    .agg(["std", "mean", "median"])
)

metric,Assets,Equity,Liabilities
std,7535202000000.0,324230800000.0,6796709000000.0
mean,217399900000.0,11515240000.0,182135800000.0
median,352909000.0,71814500.0,134427500.0


In [109]:
# The ten company-years with the largest reported Assets, biggest first.
(
    BS_clean
    .sort_values(by="Assets", ascending=False)
    .head(10)
)

metric,cik,name,year,Assets,Equity,Liabilities
109518,1263043.0,SHINHAN FINANCIAL GROUP CO LTD,2022,675884300000000.0,-3582720000000.0,624753900000000.0
109517,1263043.0,SHINHAN FINANCIAL GROUP CO LTD,2021,648152200000000.0,-984936000000.0,598613800000000.0
109516,1263043.0,SHINHAN FINANCIAL GROUP CO LTD,2020,605234100000000.0,-404181000000.0,558877200000000.0
109515,1263043.0,SHINHAN FINANCIAL GROUP CO LTD,2019,552419600000000.0,-260156000000.0,510489200000000.0
109514,1263043.0,SHINHAN FINANCIAL GROUP CO LTD,2018,459600500000000.0,-753220000000.0,422949100000000.0
109513,1263043.0,SHINHAN FINANCIAL GROUP CO LTD,2017,426305700000000.0,-529734000000.0,392603100000000.0
9181,67088.0,MITSUBISHI UFJ FINANCIAL GROUP INC,2021,367650000000000.0,16296520000000.0,351353500000000.0
185245,1504764.0,GRUPO AVAL ACCIONES Y VALORES S.A.,2021,366903900000000.0,1117182000000.0,327432300000000.0
9180,67088.0,MITSUBISHI UFJ FINANCIAL GROUP INC,2020,353824600000000.0,16244550000000.0,337580100000000.0
80030,1071371.0,BANCOLOMBIA SA,2022,352814700000000.0,7758216000000.0,312817200000000.0


In [110]:
# All company-years whose name begins with "APPLE". The column is selected with
# brackets because `name` is also an attribute on many pandas objects.
BS_clean.loc[BS_clean["name"].str.startswith("APPLE")]

metric,cik,name,year,Assets,Equity,Liabilities
18305,320193.0,APPLE INC,2009,53851000000.0,27832000000.0,26019000000.0
18306,320193.0,APPLE INC,2010,75183000000.0,47791000000.0,27392000000.0
18307,320193.0,APPLE INC,2011,116371000000.0,76615000000.0,39756000000.0
18308,320193.0,APPLE INC,2012,176064000000.0,118210000000.0,57854000000.0
18309,320193.0,APPLE INC,2013,207000000000.0,123549000000.0,83451000000.0
18310,320193.0,APPLE INC,2014,231839000000.0,111547000000.0,120292000000.0
18311,320193.0,APPLE INC,2015,290479000000.0,119355000000.0,171124000000.0
18312,320193.0,APPLE INC,2016,321686000000.0,128249000000.0,193437000000.0
18313,320193.0,APPLE INC,2017,375319000000.0,134047000000.0,241272000000.0
18314,320193.0,APPLE INC,2018,365725000000.0,107147000000.0,258578000000.0
