# Import Packages

In [1]:
import tabula
from re import sub
import pandas as pd
import numpy as np
import os

In [2]:
import extract as ex

# Import Data

In [3]:
# Path of the 10-K PDF inside the data folder of the current working directory
pdf_path=f"{os.getcwd()}/data/discover_10K_2021.pdf"
# read_pdf's result is indexed with [0] below: keep the first table parsed from page 61
li_pdf_tables=tabula.read_pdf(pdf_path, pages=61)
pdf_data=li_pdf_tables[0]

# Data Engineering

## Extract Column Names from the First Two Rows

In [4]:
# Build the per-column descriptions, then treat empty strings as missing
# and discard them so only columns with a description remain.
row_desc_ser=(
    ex.extract_col_nm(pdf_data)
    .replace('', np.nan)
    .dropna()
)
row_desc_ser

Unnamed: 2     average_balance
2021                yield_rate
Unnamed: 4            interest
Unnamed: 6     average_balance
2020                yield_rate
Unnamed: 8            interest
Unnamed: 10    average_balance
2019                yield_rate
Unnamed: 13           interest
dtype: object

In [6]:
# Clean the table in a single expression so that pdf_data is rebound only when
# every step has succeeded. Rebinding after each step would leave a half-cleaned
# frame behind if a later step raised, and a retry would then drop two more rows.
pdf_data=(
    pdf_data
    # Delete the first two rows and renumber the remaining ones from zero
    .drop([0,1])
    .reset_index(drop=True)
    # Use the first column as the row label ("position")
    .rename(columns={"Unnamed: 0":"position"})
    .assign(
        position=lambda df: (
            df["position"]
            # keep only the text before the first "("
            .apply(lambda x: x.split("(")[0])
            # normalise the label with the project helpers
            .apply(ex.replace_special_char)
            .apply(ex.snake_case)
        )
    )
    # Make the position column the index
    .set_index(keys="position",drop=True)
)
pdf_data

Unnamed: 0_level_0,Unnamed: 1,Unnamed: 2,2021,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,2020,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,2019,Unnamed: 11,Unnamed: 12,Unnamed: 13
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
cash_and_cash_equivalents,$,14236,0.13,%,$ 18,$,11348,0.3,%,$ 35,$,9667,2.27,%,$,219
restricted_cash,,695,0.03,%,NM,,438,0.45,%,2,,620,2.24,%,,14
other_short_term_investments,,176,0.12,%,NM,,2677,0.14,%,4,,754,2.66,%,,20
investment_securities,,8713,2.09,%,182,,11431,2.21,%,252,,7603,2.35,%,,179
loan_receivables,,,,,,,,,,,,,,,,
credit_card_loans,,69365,12.57,%,8717,,71447,12.58,%,8985,,72740,13.32,%,,9690
private_student_loans,,10057,7.38,%,742,,9890,7.63,%,754,,9559,8.54,%,,817
personal_loans,,6945,12.64,%,878,,7406,12.93,%,958,,7522,13.07,%,,983
other,,2054,5.57,%,114,,1660,6.35,%,105,,1065,6.63,%,,71
total_loan_receivables,,88421,11.82,%,10451,,90403,11.95,%,10802,,90886,12.72,%,,11561


## Find Unit Columns

In [7]:
# Ask the project helper which columns of pdf_data are unit columns; the displayed
# result is keyed by column name and holds the unit symbol(s) found for each one.
# NOTE(review): the detection rule lives in extract.find_unit_cols and is not visible here.
ser_unit_cols=ex.find_unit_cols(pdf_data)
ser_unit_cols

Unnamed: 1     [$]
Unnamed: 3     [%]
Unnamed: 5     [$]
Unnamed: 7     [%]
Unnamed: 9     [$]
Unnamed: 11    [%]
Unnamed: 12    [$]
dtype: object

## Extract Unit from a column 

In [8]:
# Start the working frame as a copy of pdf_data rather than a second name for the same object
temp_df=pdf_data.copy()

In [9]:
# Work on an independent copy: the loop further down assigns to and inserts columns
# into temp_df in place, and a plain `temp_df=pdf_data` would make those edits
# show up in pdf_data as well.
temp_df=pdf_data.copy()
# Keep every column that was not flagged as a unit column, in the original order
unit_col_nms=set(ser_unit_cols.keys())
li_col=[col for col in temp_df.columns if col not in unit_col_nms]
print(li_col)

['Unnamed: 2', '2021', 'Unnamed: 4', 'Unnamed: 6', '2020', 'Unnamed: 8', 'Unnamed: 10', '2019', 'Unnamed: 13']


In [10]:
# One generated name per entry of li_col, numbered by position. Numbering by position
# (instead of looking each label up with list.index) avoids a linear search per element
# and keeps the names distinct even if a label occurs more than once.
li_new_col=[f"new_col_{pos}" for pos in range(len(li_col))]

In [11]:
for col in li_col:

    # Split the column with the project helper; the result is read below
    # through its "val" and "unit" entries.
    col_val_unit=ex.get_val_unit(dataframe=temp_df,col_nm=col)

    # check_unit reports whether a unit was found ("fg_unit") and which one ("unit")
    dict_check_unit=ex.check_unit(col_val_unit)

    # No unit found in this column: leave it untouched
    if not dict_check_unit["fg_unit"]:
        continue

    # A "%" unit column is placed right after the value column;
    # any other unit (e.g. "$") is placed at the value column's own position,
    # which puts it immediately before the value column.
    shift=1 if dict_check_unit["unit"]== "%" else 0
    idx_col=temp_df.columns.get_loc(col)+shift

    # Overwrite the column with the bare values, then add the separate unit column
    temp_df[col]=col_val_unit["val"]
    temp_df.insert(loc=idx_col, column=f'new_col: {col}', value=col_val_unit["unit"])

In [12]:
# Show the working frame after the unit-splitting loop
temp_df

Unnamed: 0_level_0,Unnamed: 1,Unnamed: 2,2021,Unnamed: 3,new_col: Unnamed: 4,Unnamed: 4,Unnamed: 5,Unnamed: 6,2020,Unnamed: 7,new_col: Unnamed: 8,Unnamed: 8,Unnamed: 9,Unnamed: 10,2019,Unnamed: 11,Unnamed: 12,Unnamed: 13
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
cash_and_cash_equivalents,$,14236,0.13,%,$,18,$,11348,0.3,%,$,35.0,$,9667,2.27,%,$,219
restricted_cash,,695,0.03,%,,NM,,438,0.45,%,,2.0,,620,2.24,%,,14
other_short_term_investments,,176,0.12,%,,NM,,2677,0.14,%,,4.0,,754,2.66,%,,20
investment_securities,,8713,2.09,%,,182,,11431,2.21,%,,252.0,,7603,2.35,%,,179
loan_receivables,,,,,,,,,,,,,,,,,,
credit_card_loans,,69365,12.57,%,,8717,,71447,12.58,%,,8985.0,,72740,13.32,%,,9690
private_student_loans,,10057,7.38,%,,742,,9890,7.63,%,,754.0,,9559,8.54,%,,817
personal_loans,,6945,12.64,%,,878,,7406,12.93,%,,958.0,,7522,13.07,%,,983
other,,2054,5.57,%,,114,,1660,6.35,%,,105.0,,1065,6.63,%,,71
total_loan_receivables,,88421,11.82,%,,10451,,90403,11.95,%,,10802.0,,90886,12.72,%,,11561


In [14]:
# Detect the unit columns again, this time on temp_df, so that any
# "new_col: ..." columns inserted by the loop are included; rebinds ser_unit_cols.
ser_unit_cols=ex.find_unit_cols(temp_df)
ser_unit_cols

Unnamed: 1             [$]
Unnamed: 3             [%]
new_col: Unnamed: 4    [$]
Unnamed: 5             [$]
Unnamed: 7             [%]
new_col: Unnamed: 8    [$]
Unnamed: 9             [$]
Unnamed: 11            [%]
Unnamed: 12            [$]
dtype: object

In [15]:
# Record, for each value column, the unit held in its neighbouring unit column:
# a "$" unit column maps to the column right after it, a "%" unit column to the
# column right before it.
dict_unit={}
li_all_cols=list(temp_df.columns)
n_cols=len(li_all_cols)

for col, li_units in ser_unit_cols.items():
    idx=li_all_cols.index(col)
    unit=li_units[0]
    if unit== "$" and idx+1<n_cols:
        dict_unit[li_all_cols[idx+1]]="$"
    elif unit== "%" and idx>0:
        # idx-1 must stay non-negative, otherwise it would wrap round to the last column
        dict_unit[li_all_cols[idx-1]]="%"

# The unit columns themselves are dropped once their units are recorded
temp_df=temp_df.drop(columns=list(ser_unit_cols.keys()))
temp_df

Unnamed: 0_level_0,Unnamed: 2,2021,Unnamed: 4,Unnamed: 6,2020,Unnamed: 8,Unnamed: 10,2019,Unnamed: 13
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cash_and_cash_equivalents,14236,0.13,18,11348,0.3,35.0,9667,2.27,219
restricted_cash,695,0.03,NM,438,0.45,2.0,620,2.24,14
other_short_term_investments,176,0.12,NM,2677,0.14,4.0,754,2.66,20
investment_securities,8713,2.09,182,11431,2.21,252.0,7603,2.35,179
loan_receivables,,,,,,,,,
credit_card_loans,69365,12.57,8717,71447,12.58,8985.0,72740,13.32,9690
private_student_loans,10057,7.38,742,9890,7.63,754.0,9559,8.54,817
personal_loans,6945,12.64,878,7406,12.93,958.0,7522,13.07,983
other,2054,5.57,114,1660,6.35,105.0,1065,6.63,71
total_loan_receivables,88421,11.82,10451,90403,11.95,10802.0,90886,12.72,11561


In [16]:
# Drop rows in which every column is missing (e.g. the empty loan_receivables section row)
temp_df=temp_df.dropna(how="all")

In [17]:
# Split temp_df into a list of dicts; each is later read through its "year" and "df" keys.
# NOTE(review): the 3 is presumably the number of columns per year
# (average_balance, yield_rate, interest) — confirm against extract.separate_df.
li_dict_df=ex.separate_df(temp_df,3)

In [18]:
# Show the column descriptions again before they are used to rename the columns
row_desc_ser

Unnamed: 2     average_balance
2021                yield_rate
Unnamed: 4            interest
Unnamed: 6     average_balance
2020                yield_rate
Unnamed: 8            interest
Unnamed: 10    average_balance
2019                yield_rate
Unnamed: 13           interest
dtype: object

In [19]:
# Rebuild the list with each frame's columns renamed; the rename mapping comes
# from the project helper, which is given the frame and row_desc_ser.
li_renamed=[]
for dict_df in li_dict_df:
    df_year=dict_df["df"]
    col_nm_map=ex.assign_col_nms_from_desc(df_year,row_desc_ser)
    li_renamed.append(
        dict(
            year=dict_df["year"],
            df=df_year.rename(columns=col_nm_map)
        )
    )
li_dict_df=li_renamed

In [21]:
# Inspect the renamed frame of the second entry (index 1) in the list
li_dict_df[1]["df"]

Unnamed: 0_level_0,average_balance,yield_rate,interest
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cash_and_cash_equivalents,11348,0.3,35.0
restricted_cash,438,0.45,2.0
other_short_term_investments,2677,0.14,4.0
investment_securities,11431,2.21,252.0
credit_card_loans,71447,12.58,8985.0
private_student_loans,9890,7.63,754.0
personal_loans,7406,12.93,958.0
other,1660,6.35,105.0
total_loan_receivables,90403,11.95,10802.0
total_interest_earning_assets,116297,9.54,11095.0
