# Import Packages

In [1]:
import tabula
from re import sub
import pandas as pd
import numpy as np
import os

In [2]:
import extract as ex

# Import Data

In [3]:
dict_pos = {'Loan receivables, including held for sale': {'name': 'average_loan_receivables',
                                                          'uom': '$_mill',
                                                          'period': 'average'},
            'Total assets': {'name': 'average_assets', 'uom': '$_mill', 'period': 'average'},
            'Deposits': {'name': 'average_deposits', 'uom': '$_mill', 'period': 'average'},
            'Borrowings': {'name': 'average_corporate_loans', 'uom': '$_mill', 'period': 'average'},
            'Total equity': {'name': 'average_equity', 'uom': '$_mill', 'period': 'average'},
            'Purchase volume': {'name': 'net_credit_sales', 'uom': '$_mill', 'period': 'end'},
            'Retail Card': {'name': 'net_credit_sales_retail_card',
                            'uom': '$_mill',
                            'period': 'end'},
            'Payment Solutions': {'name': 'net_credit_sales_payment_solutions',
                                  'uom': '$_mill',
                                  'period': 'end'},
            'CareCredit': {'name': 'net_credit_sales_care_credit',
                           'uom': '$_mill',
                           'period': 'end'},
            'Home & Auto': {'name': 'net_credit_sales_home_auto',
                            'uom': '$_mill',
                            'period': 'end'},
            'Digital': {'name': 'net_credit_sales_digital', 'uom': '$_mill', 'period': 'end'},
            'Diversified & Value': {'name': 'net_credit_sales_diversified_value',
                                    'uom': '$_mill',
                                    'period': 'end'},
            'Health & Wellness': {'name': 'net_credit_sales_health_wellness',
                                  'uom': '$_mill',
                                  'period': 'end'},
            'Lifestyle': {'name': 'net_credit_sales_lifestyle', 'uom': '$_mill', 'period': 'end'},
            'Corp, Other': {'name': 'net_credit_sales_corp_other',
                            'uom': '$_mill',
                            'period': 'end'},
            'Average active accounts': {'name': 'average_active_accounts',
                                        'uom': 'thousands',
                                        'period': 'average'},
            'Net interest margin': {'name': 'net_interest_income_to_average_interest_earning_assets',
                                    'uom': 'perc',
                                    'period': 'average'},
            'Net charge-offs': {'name': 'net_charge_offs', 'uom': '$_mill', 'period': 'end'},
            'Net charge-offs as a % of average loan receivables, including held for sale': {'name': 'net_charge_offs_to_average_loan_receivables',
                                                                                            'uom': 'perc',
                                                                                            'period': 'average'},
            'Allowance coverage ratio': {'name': 'allowance_for_loan_losses_to_loan_receivables',
                                         'uom': 'perc',
                                         'period': 'end'},
            'Return on assets': {'name': 'return_on_average_assets',
                                 'uom': 'perc',
                                 'period': 'average'},
            'Return on equity': {'name': 'return_on_average_equity',
                                 'uom': 'perc',
                                 'period': 'average'},
            'Equity to assets': {'name': 'average_equity_to_average_assets',
                                 'uom': 'perc',
                                 'period': 'average'},
            'Other expense as a % of average loan receivables, including held for sale': {'name': 'other_expense_to_average_loan_receivables',
                                                                                          'uom': 'perc',
                                                                                          'period': 'average'},
            'Efficiency ratio': {'name': 'efficiency_ratio', 'uom': 'perc', 'period': 'end'},
            'Effective income tax rate': {'name': 'effective_income_tax_rate',
                                          'uom': 'perc',
                                          'period': 'end'},
            'Loan receivables': {'name': 'loan_receivables', 'uom': '$_mill', 'period': 'end'},
            'Allowance for credit losses': {'name': 'allowance_for_credit_losses',
                                            'uom': '$_mill',
                                            'period': 'end'},
            'Allowance for loan losses': {'name': 'allowance_for_credit_losses',
                                          'uom': '$_mill',
                                          'period': 'end'},
            '30+ days past due as a % of period-end loan receivables': {'name': '30_days_past_due_to_loan_receivables',
                                                                        'uom': '$_mill',
                                                                        'period': 'end'},
            '90+ days past due as a % of period-end loan receivables': {'name': '90_days_past_due_to_loan_receivables',
                                                                        'uom': '$_mill',
                                                                        'period': 'end'},
            'Total active accounts': {'name': 'active_accounts', 'uom': 'thousands', 'period': 'end'}}

In [4]:
pages={"2022":34,"2021":33,"2020":32,"2019":31,"2018":35,"2017":82,"2016":81}

In [5]:
dict_uom={_val["name"]:_val["uom"] for _val in dict_pos.values()}
dict_period={_val["name"]:_val["period"] for _val in dict_pos.values()}

In [6]:
results=[]
for year_current in ["2022","2021","2020","2019","2018","2017","2016"]:
    pdf_path=f"{os.getcwd()}/data/synchrony/synchrony_10K_{year_current}.pdf"
    pdf_data=tabula.read_pdf(pdf_path, pages=pages[year_current])[0]
    years=[str(int(year_current)-_year_add) for _year_add in range(3)]
    df=pdf_data.drop(columns=["Unnamed: 0","Unnamed: 2","Unnamed: 4" ])
    df=df.dropna()
    col_year=["Unnamed: 1","Unnamed: 3","Unnamed: 5"]
    year_mapping={_col:_year for _col,_year in zip(col_year,years)}
    df=df.rename(columns={"Financial Position Data (Average):":"position",**year_mapping})
    df["position"]=df["position"].apply(lambda x: x.split("(")[0])
    df["position"]=df["position"].apply(lambda x: x.strip())
    df["position"]=df["position"].replace({_key:_val["name"] for _key, _val in dict_pos.items()})
    for _col in df.columns:
        if _col!="position":
            df[_col]=df[_col].apply(lambda x: "".join(x.split(",")))
            df[_col]=df[_col].apply(lambda x:x.split(" ")[0])
            df[_col]=df[_col].apply(lambda x: x.replace('%', ''))
            df[_col]=df[_col].apply(lambda x: x.strip())
            df[_col]=df[_col].apply(float)
    df_pos_val=pd.melt(df, id_vars=['position'], value_vars=years,
            var_name='year', value_name='value')
    df_pos_val["fy"]=year_current
    df_pos_val["uom"]=df_pos_val["position"].apply(lambda x: dict_uom[x])
    df_pos_val["period"]=df_pos_val["position"].apply(lambda x: dict_period[x])
    results.append(df_pos_val)

In [7]:
df_result=pd.concat(results)

In [8]:
df_result

Unnamed: 0,position,year,value,fy,uom,period
0,average_loan_receivables,2022,84672.00,2022,$_mill,average
1,average_assets,2022,98152.00,2022,$_mill,average
2,average_deposits,2022,66006.00,2022,$_mill,average
3,average_corporate_loans,2022,13783.00,2022,$_mill,average
4,average_equity,2022,13372.00,2022,$_mill,average
...,...,...,...,...,...,...
70,loan_receivables,2014,61286.00,2016,$_mill,end
71,allowance_for_credit_losses,2014,3236.00,2016,$_mill,end
72,30_days_past_due_to_loan_receivables,2014,4.14,2016,$_mill,end
73,90_days_past_due_to_loan_receivables,2014,1.90,2016,$_mill,end


In [10]:
df_result.query("position== 'average_loan_receivables'")

Unnamed: 0,position,year,value,fy,uom,period
0,average_loan_receivables,2022,84672.0,2022,$_mill,average
28,average_loan_receivables,2021,78928.0,2022,$_mill,average
56,average_loan_receivables,2020,80138.0,2022,$_mill,average
0,average_loan_receivables,2021,78928.0,2021,$_mill,average
28,average_loan_receivables,2020,80138.0,2021,$_mill,average
56,average_loan_receivables,2019,88649.0,2021,$_mill,average
0,average_loan_receivables,2020,80138.0,2020,$_mill,average
25,average_loan_receivables,2019,88649.0,2020,$_mill,average
50,average_loan_receivables,2018,83304.0,2020,$_mill,average
0,average_loan_receivables,2019,88649.0,2019,$_mill,average


# PDF check

In [179]:
import PyPDF2

In [180]:
reader = PyPDF2.PdfReader(pdf_path)

# Average Balance

In [14]:
pdf_path=f"{os.getcwd()}/data/synchrony/synchrony_10K_{2021}.pdf"

In [15]:
tables = tabula.read_pdf_with_template(input_path = pdf_path, template_path = f"{os.getcwd()}/data/json/synchrony_10K_2021.json")

In [17]:
tables[0]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,2021,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,2020,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,2019,Unnamed: 12
0,,,,,Interest,Average,,,,Interest,Average,,,,Interest,Average
1,Years ended December 31,,Average,,Income /,Yield /,,Average,,Income/,Yield /,,Average,,Income/,Yield /
2,($ in millions),,Balance,,Expense,Rate(1),,Balance,,Expense,Rate(1),,Balance,,Expense,Rate(1)
3,Assets,,,,,,,,,,,,,,,
4,Interest-earning assets:,,,,,,,,,,,,,,,
5,Interest-earning cash and,,,,,,,,,,,,,,,
6,equivalents(2),$,11673,$,15,0.13 %,$,13301,$,53,0.40 %,$,12320,$,258,2.09 %
7,Securities available for sale,,5975,,28,0.47 %,,7367,,64,0.87 %,,5464,,127,2.32 %
8,"Loan receivables, including held for",,,,,,,,,,,,,,,
9,sale(3):,,,,,,,,,,,,,,,


In [None]:
 "x1": 7.65,
        "x2": 594.405,
        "y1": 111.3075,
        "y2": 506.0475,
        "width": 586.755,
        "height": 394.74

In [253]:
year_current="2022"
pdf_path=f"{os.getcwd()}/data/synchrony/synchrony_10K_{year_current}.pdf"
pdf_data=tabula.read_pdf(pdf_path, pages=35, stream=True)[0]

In [254]:
pdf_data_trans=pdf_data.transpose()
pdf_data_trans[0]=pdf_data_trans[0]+" "+ pdf_data_trans[1]
pdf_data_trans[4]=pdf_data_trans[4]+" "+ pdf_data_trans[5]
pdf_data_trans[7]=pdf_data_trans[7]+" "+ pdf_data_trans[8]
pdf_data_trans[13]=pdf_data_trans[13]+" "+ pdf_data_trans[14]
pdf_data_trans[25]=pdf_data_trans[25]+" "+ pdf_data_trans[26]
pdf_data=pdf_data_trans.drop(columns=[1,5,8,14,26]).transpose()

In [256]:
pdf_data.rename(columns={"Unnamed: 0":"position"})

Unnamed: 0,position,Unnamed: 1,Unnamed: 2,Unnamed: 3,Interest,Average,Unnamed: 4,Unnamed: 5,Unnamed: 6,Interest.1,Average.1,Unnamed: 7,Unnamed: 8,Unnamed: 9,Interest.2,Average.2
0,Years ended December 31 ($ in millions),,Average Balance,,Income / Expense,Yield / Rate(1),,Average Balance,,Income/ Expense,Yield / Rate(1),,Average Balance,,Income/ Expense,Yield / Rate(1)
2,Assets,,,,,,,,,,,,,,,
3,Interest-earning assets:,,,,,,,,,,,,,,,
4,Interest-earning cash and equivalents(2),,,,,,,,,,,,,,,
6,Securities available for sale,,5108,,71,1.39 %,,5975,,28,0.47 %,,7367,,64,0.87 %
7,"Loan receivables, including held for sale(3):",,,,,,,,,,,,,,,
9,Credit cards,,80119,,16471,20.56 %,,75052,,14880,19.83 %,,77115,,15672,20.32 %
10,Consumer installment loans,,2834,,287,10.13 %,,2460,,241,9.80 %,,1733,,168,9.69 %
11,Commercial credit products,,1642,,117,7.13 %,,1359,,103,7.58 %,,1231,,108,8.77 %
12,Other,,77,,6,7.79 %,,57,,4,7.02 %,,59,,2,3.39 %


In [248]:
pdf_data=pdf_data.reset_index(drop=True)

In [246]:
pdf_data.rename(columns={"Unnamed: 0})

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Interest,Average,Unnamed: 4,Unnamed: 5,Unnamed: 6,Interest.1,Average.1,Unnamed: 7,Unnamed: 8,Unnamed: 9,Interest.2,Average.2
0,Years ended December 31 ($ in millions),,Average Balance,,Income / Expense,Yield / Rate(1),,Average Balance,,Income/ Expense,Yield / Rate(1),,Average Balance,,Income/ Expense,Yield / Rate(1)
2,Assets,,,,,,,,,,,,,,,
3,Interest-earning assets:,,,,,,,,,,,,,,,
4,Interest-earning cash and equivalents(2),,,,,,,,,,,,,,,
6,Securities available for sale,,5108,,71,1.39 %,,5975,,28,0.47 %,,7367,,64,0.87 %
7,"Loan receivables, including held for sale(3):",,,,,,,,,,,,,,,
9,Credit cards,,80119,,16471,20.56 %,,75052,,14880,19.83 %,,77115,,15672,20.32 %
10,Consumer installment loans,,2834,,287,10.13 %,,2460,,241,9.80 %,,1733,,168,9.69 %
11,Commercial credit products,,1642,,117,7.13 %,,1359,,103,7.58 %,,1231,,108,8.77 %
12,Other,,77,,6,7.79 %,,57,,4,7.02 %,,59,,2,3.39 %


In [133]:
import extract as ex

In [137]:
row_desc_ser=ex.extract_col_nm(pdf_data)
row_desc_ser.replace('', np.nan, inplace=True)
row_desc_ser=row_desc_ser.dropna()
row_desc_ser

Unnamed: 0    assets_interest_earning_assets
dtype: object

In [168]:
# Delete the first two rows
pdf_data=pdf_data.drop([0,1]).reset_index(drop=True)
# Make the position column to index
pdf_data=pdf_data.rename(columns={"Unnamed: 0":"position"})
pdf_data["position"]=pdf_data["position"].apply(lambda x: x.split("(")[0])
#pdf_data["position"]=pdf_data["position"].apply(ex.replace_special_char)
#pdf_data["position"]=pdf_data["position"].apply(ex.snake_case)
pdf_data=pdf_data.set_index(keys="position",drop=True)
pdf_data

Unnamed: 0_level_0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Interest,Average,Unnamed: 4,Unnamed: 5,Unnamed: 6,Interest.1,Average.1,Unnamed: 7,Unnamed: 8,Unnamed: 9,Interest.2,Average.2
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Assets,,,,,,,,,,,,,,,
Interest-earning assets:,,,,,,,,,,,,,,,
Interest-earning cash and,,,,,,,,,,,,,,,
equivalents,$,10215,$,194.0,1.90 %,$,11673,$,15.0,0.13 %,$,13301,$,53.0,0.40 %
Securities available for sale,,5108,,71.0,1.39 %,,5975,,28.0,0.47 %,,7367,,64.0,0.87 %
"Loan receivables, including held for",,,,,,,,,,,,,,,
sale,,,,,,,,,,,,,,,
Credit cards,,80119,,16471.0,20.56 %,,75052,,14880.0,19.83 %,,77115,,15672.0,20.32 %
Consumer installment loans,,2834,,287.0,10.13 %,,2460,,241.0,9.80 %,,1733,,168.0,9.69 %
Commercial credit products,,1642,,117.0,7.13 %,,1359,,103.0,7.58 %,,1231,,108.0,8.77 %


In [143]:
ser_unit_cols=ex.find_unit_cols(pdf_data)
ser_unit_cols

Unnamed: 1    [$]
Unnamed: 3    [$]
Unnamed: 4    [$]
Unnamed: 6    [$]
Unnamed: 7    [$]
Unnamed: 9    [$]
dtype: object

In [144]:
# Remove unit columns
temp_df=pdf_data
li_col=list(temp_df.columns)
[li_col.remove(col) for col in ser_unit_cols.keys()]
print(li_col)

['Unnamed: 2', 'Interest', 'Average', 'Unnamed: 5', 'Interest.1', 'Average.1', 'Unnamed: 8', 'Interest.2', 'Average.2']


In [145]:
li_new_col=[f"new_col_{str(li_col.index(col))}" for col in li_col]

In [147]:
for col in li_col:
    
    # get value and unit of the column
    col_val_unit=ex.get_val_unit(dataframe=temp_df,col_nm=col)

    # Add unit column whether we have unit in the unit column
    dict_check_unit=ex.check_unit(col_val_unit)
    
    if dict_check_unit["fg_unit"]:

        # get the index of the column
        idx_col=temp_df.columns.get_loc(col)

        if dict_check_unit["unit"]== "$":
            idx_col=idx_col
        elif dict_check_unit["unit"]== "%":
            idx_col=idx_col+1
        
        temp_df[col]=col_val_unit["val"]
        temp_df.insert(loc=idx_col, column=f'new_col: {col}', value=col_val_unit["unit"])

In [148]:
temp_df

Unnamed: 0_level_0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Interest,Average,Unnamed: 4,Unnamed: 5,Unnamed: 6,Interest.1,Average.1,Unnamed: 7,Unnamed: 8,Unnamed: 9,Interest.2,Average.2
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
other,,77,,6.0,7.79 %,,57,,4.0,7.02 %,,59,,2.0,3.39 %
total_loan_receivables_including,,,,,,,,,,,,,,,
held_for_sale,,84672,,16881.0,19.94 %,,78928,,15228.0,19.29 %,,80138,,15950.0,19.90 %
total_interest_earning_assets,,99995,,17146.0,17.15 %,,96576,,15271.0,15.81 %,,100806,,16067.0,15.94 %
non_interest_earning_assets,,,,,,,,,,,,,,,
cash_and_due_from_banks,,1472,,,,,1597,,,,,1488,,,
allowance_for_credit_losses,,"(8,844)",,,,,"(9,402)",,,,,"(9,488)",,,
other_assets,,5529,,,,,5343,,,,,4932,,,
total_non_interest_earning_assets,,"(1,843)",,,,,"(2,462)",,,,,"(3,068)",,,
total_assets,$,98152,,,,$,94114,,,,$,97738,,,


In [149]:
ser_unit_cols=ex.find_unit_cols(temp_df)
ser_unit_cols

Unnamed: 1    [$]
Unnamed: 3    [$]
Unnamed: 4    [$]
Unnamed: 6    [$]
Unnamed: 7    [$]
Unnamed: 9    [$]
dtype: object

In [150]:
dict_unit={}

for col in ser_unit_cols.keys():
    idx=list(temp_df.columns).index(col)
    if ser_unit_cols[col][0]== "$":
        dict_unit[temp_df.columns[idx+1]]="$"
    elif ser_unit_cols[col][0]== "%":
        dict_unit[temp_df.columns[idx-1]]="%"


temp_df=temp_df.drop(columns=list(ser_unit_cols.keys()))
temp_df

Unnamed: 0_level_0,Unnamed: 2,Interest,Average,Unnamed: 5,Interest.1,Average.1,Unnamed: 8,Interest.2,Average.2
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
other,77,6.0,7.79 %,57,4.0,7.02 %,59,2.0,3.39 %
total_loan_receivables_including,,,,,,,,,
held_for_sale,84672,16881.0,19.94 %,78928,15228.0,19.29 %,80138,15950.0,19.90 %
total_interest_earning_assets,99995,17146.0,17.15 %,96576,15271.0,15.81 %,100806,16067.0,15.94 %
non_interest_earning_assets,,,,,,,,,
cash_and_due_from_banks,1472,,,1597,,,1488,,
allowance_for_credit_losses,"(8,844)",,,"(9,402)",,,"(9,488)",,
other_assets,5529,,,5343,,,4932,,
total_non_interest_earning_assets,"(1,843)",,,"(2,462)",,,"(3,068)",,
total_assets,98152,,,94114,,,97738,,


In [151]:
temp_df=temp_df.dropna(how="all")

In [152]:
temp_df

Unnamed: 0_level_0,Unnamed: 2,Interest,Average,Unnamed: 5,Interest.1,Average.1,Unnamed: 8,Interest.2,Average.2
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
other,77,6.0,7.79 %,57,4.0,7.02 %,59,2.0,3.39 %
held_for_sale,84672,16881.0,19.94 %,78928,15228.0,19.29 %,80138,15950.0,19.90 %
total_interest_earning_assets,99995,17146.0,17.15 %,96576,15271.0,15.81 %,100806,16067.0,15.94 %
cash_and_due_from_banks,1472,,,1597,,,1488,,
allowance_for_credit_losses,"(8,844)",,,"(9,402)",,,"(9,488)",,
other_assets,5529,,,5343,,,4932,,
total_non_interest_earning_assets,"(1,843)",,,"(2,462)",,,"(3,068)",,
total_assets,98152,,,94114,,,97738,,
interest_bearing_deposit_accounts,65624,1008.0,1.54 %,60953,566.0,0.93 %,63755,1094.0,1.72 %
securitization_entities,6468,196.0,3.03 %,7248,169.0,2.33 %,8675,237.0,2.73 %


In [153]:
li_dict_df=ex.separate_df(temp_df,3)

In [155]:
li_dict_df=[
    dict(
        year=dict_df["year"],
        df=dict_df["df"].rename(
            columns=ex.assign_col_nms_from_desc(
                dict_df["df"],
                row_desc_ser
            )
        )
    )

    for dict_df in li_dict_df
]

In [156]:
li_dict_df

[{'year': None,
  'df':                                        Unnamed: 2 Interest  Average
  position                                                           
  other                                          77        6   7.79 %
  held_for_sale                              84,672   16,881  19.94 %
  total_interest_earning_assets              99,995   17,146  17.15 %
  cash_and_due_from_banks                     1,472      NaN      NaN
  allowance_for_credit_losses               (8,844)      NaN      NaN
  other_assets                                5,529      NaN      NaN
  total_non_interest_earning_assets         (1,843)      NaN      NaN
  total_assets                               98,152      NaN      NaN
  interest_bearing_deposit_accounts          65,624    1,008   1.54 %
  securitization_entities                     6,468      196   3.03 %
  senior_unsecured_notes                      7,315      317   4.33 %
  total_interest_bearing_liabilities         79,407    1,521   1.92 