# Model Dataset Generation

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd 
import random
import time

# note: the random library will be used to generate synthetic data
# note: synthetic data will be created to address the class imbalance in the F-SCORE variable

## Random Seed

In [2]:
# note: set the random seed for reproducibility of results

# set random seed
# note: this seeds python's built-in random module, which performs every random draw made later in this notebook
seed = 2
random.seed(seed)

## Read Dataset to Pandas Dataframe

In [3]:
# read fundamentals_fscore.csv file into a pandas dataframe
# note: the path is relative, so the file is looked up in the current working directory
funda_df = pd.read_csv('fundamentals_fscore.csv')
# display the first rows of funda_df as a quick sanity check of the loaded columns
display(funda_df.head())

Unnamed: 0,CIK,Ticker_Symbol,Company_Name,Sector,Data_Date,Market_Capitalization,Total_Assets,Total_Assets_PY1,Total_Assets_PY2,Total_Liabilities,...,F_CLEVER,CLIQUID,F_CLIQUID,EQ_OFFER,F_EQ_OFFER,CMARGIN,F_CMARGIN,CTURN,F_CTURN,F_SCORE
0,1750,AIR,AAR CORP,Capital Goods,2011-05-31,1049.8206,1703.727,1501.042,1377.511,868.438,...,1,-0.45519,0,0.297,0,-0.004629,0,0.201443,1,6
1,1750,AIR,AAR CORP,Capital Goods,2012-05-31,485.2897,2195.653,1703.727,1501.042,1329.631,...,0,0.049832,1,0.492,0,-0.008424,0,0.034591,1,5
2,1750,AIR,AAR CORP,Capital Goods,2013-05-31,790.0029,2136.9,2195.653,1703.727,1217.4,...,1,0.410468,1,-0.891,1,0.010205,1,-0.230628,0,7
3,1750,AIR,AAR CORP,Capital Goods,2014-05-31,961.308,2199.5,2136.9,2195.653,1198.8,...,1,0.120341,1,0.178,0,0.014049,1,-0.034682,0,7
4,1750,AIR,AAR CORP,Capital Goods,2015-05-31,1046.3954,1515.0,2199.5,2136.9,669.9,...,1,-0.461891,0,-4.137,1,-0.065087,0,-0.227468,0,3


In [4]:
# note: the cik of each company is usually 10 digits in length
# note: the leading 0's from some ciks have been removed
# note: therefore, these leading 0's have to be added again to ensure all ciks are of length 10

# left-pad every cik with 0's so that it is at least 10 characters long
funda_df['CIK'] = funda_df['CIK'].map(lambda cik: f'{cik:0>10}')

## Filter High Book-to-Market Companies

In [5]:
# note: piotroski's study focuses on companies with high b/m ratio values
# note: i.e. the companies that are contained in the highest b/m quintile
# note: thus, it makes sense to construct the model dataset with high b/m valued companies only

# keep only the rows labelled 'Very High' in BM_Quintile and renumber the index from 0
is_very_high_bm = funda_df['BM_Quintile'].eq('Very High')
funda_df = funda_df.loc[is_very_high_bm].reset_index(drop=True)

## Minimum/Maximum Values of Market Capitalization by Company Size Terciles

In [6]:
# note: it is necessary to obtain the minimum/maximum values of market capitalization by company size terciles
# note: this will be used when generating the synthetic data

# store minimum/maximum values of market capitalization by company size terciles
def _market_cap_range(size_label):
    """Return (min, max) of Market_Capitalization over the funda_df rows whose Size_Tercile equals size_label."""
    market_caps = funda_df.loc[funda_df['Size_Tercile']==size_label, 'Market_Capitalization']
    return market_caps.min(), market_caps.max()

# small companies
small_market_cap_min, small_market_cap_max = _market_cap_range('Small')
# medium companies
medium_market_cap_min, medium_market_cap_max = _market_cap_range('Medium')
# large companies
large_market_cap_min, large_market_cap_max = _market_cap_range('Large')

## Numerically Code Company Size Terciles 

In [7]:
# note: it is necessary to code the company size tercile labels numerically
# note: this will be used when generating synthetic data

# replace each size tercile label in funda_df with its numeric code (Small -> 0, Medium -> 1, Large -> 2)
for size_label, size_code in (('Small', 0), ('Medium', 1), ('Large', 2)):
    funda_df.loc[funda_df['Size_Tercile']==size_label, 'Size_Tercile'] = size_code

## Minimum/Maximum Values of Book-to-Market 

In [8]:
# note: it is necessary to obtain the minimum/maximum values of the b/m column in funda_df
# note: this will be used when generating the synthetic data

# store the smallest and largest value found in the b/m column
bm_ratio_values = funda_df['BM_Ratio']
bm_min, bm_max = bm_ratio_values.min(), bm_ratio_values.max()

## Remove Unnecessary Columns

In [9]:
# note: synthetic data will not be associated to any real life information
# note: e.g. a synthetic observation will not belong to a real company
# note: therefore, it is necessary to remove columns containing any information that is not relevant to randomness

# remove the following columns....
# - CIK
# - Ticker_Symbol
# - Company_Name
# - Sector
# - Data_Date
# - BM_Quintile
# note: Size_Tercile is kept because the synthetic data generation below reads it
identifying_columns = ['CIK', 'Ticker_Symbol', 'Company_Name', 'Sector',
                       'Data_Date', 'BM_Quintile']
funda_df = funda_df.drop(columns=identifying_columns).reset_index(drop=True)

## Synthetic Data Generation - F-SCORE = 0

In [10]:
# note: there is currently 1 instance with the F-SCORE = 0
# note: the synthetic data generation for this class will be slightly different to that of the other classes
# note: a small neighbourhood boundary will be defined around the existing point
# note: this neighbourhood will contain synthetic points that also have an F-SCORE = 0
# note: the F-SCORE = 5 had the most instances (355 instances)
# note: the total number of data points for an F-SCORE = 0 should be equal to 355 after synthetic data generation

# time
start = time.time()

# number of synthetic observations to generate
# note: 354 synthetic points + the 1 existing instance = 355 (the size of the F-SCORE = 5 class)
n_synthetic_zero = 354

# half-width of the neighbourhood drawn around the existing F-SCORE = 0 instance
neighbourhood_margin = 5

# raw fundamentals that are drawn at random inside the neighbourhood
sampled_columns = ['Total_Assets', 'Total_Assets_PY1', 'Total_Assets_PY2', 'Total_Liabilities',
                   'Net_Income_Before_Extra_Items', 'Cash_Flow_From_Operations',
                   'Total_Long_Term_Debt', 'Total_Long_Term_Debt_PY1',
                   'Current_Assets', 'Current_Assets_PY1',
                   'Current_Liabilities', 'Current_Liabilities_PY1',
                   'Common_Shares_Outstanding', 'Common_Shares_Outstanding_PY1',
                   'Total_Sales', 'Total_Sales_PY1',
                   'Cost_Of_Goods_Sold', 'Cost_Of_Goods_Sold_PY1']

# existing F-SCORE = 0 instance(s) that anchor the neighbourhood
zero_rows = funda_df.loc[funda_df['F_SCORE']==0]
if zero_rows.empty:
    # without an anchor every bound would be NaN and the loop would accept all-NaN rows as F-SCORE = 0
    raise ValueError('funda_df contains no F_SCORE = 0 instance to build a neighbourhood around')

# note: the neighbourhood is computed once, before any synthetic point exists
# note: recomputing it from a dataframe that already holds accepted synthetic points
# note: would let every bound drift outwards by up to 5 units per accepted point
neighbourhood = {column: (zero_rows[column].min() - neighbourhood_margin,
                          zero_rows[column].max() + neighbourhood_margin)
                 for column in sampled_columns}

# range of the numerically coded size terciles (0 to 2)
size_tercile_min = funda_df['Size_Tercile'].min()
size_tercile_max = funda_df['Size_Tercile'].max()

# accepted synthetic observations, collected as plain rows and added to funda_df in one step
synthetic_rows = []

# counter variable
count = 0

while count < n_synthetic_zero:
    # random size tercile (0 to 2)
    size_tercile_syn = random.randint(size_tercile_min, size_tercile_max)

    # random market capitalization defined in relevant size tercile neighbourhood
    if size_tercile_syn==0:
        market_cap_syn = random.uniform(small_market_cap_min, small_market_cap_max)
    elif size_tercile_syn==1:
        market_cap_syn = random.uniform(medium_market_cap_min, medium_market_cap_max)
    else:
        market_cap_syn = random.uniform(large_market_cap_min, large_market_cap_max)

    # random total assets (current year, previous year, pre-previous year)
    total_assets_syn = random.uniform(*neighbourhood['Total_Assets'])
    total_assets_py1_syn = random.uniform(*neighbourhood['Total_Assets_PY1'])
    total_assets_py2_syn = random.uniform(*neighbourhood['Total_Assets_PY2'])

    # random total liabilities
    total_liabilities_syn = random.uniform(*neighbourhood['Total_Liabilities'])

    # random net income before extraordinary items (current year, previous year)
    # NOTE(review): the previous-year value is drawn from the current-year column range,
    # as in the original code; the observation layout has no previous-year net income column - confirm
    net_income_bei_syn = random.uniform(*neighbourhood['Net_Income_Before_Extra_Items'])
    net_income_bei_py1_syn = random.uniform(*neighbourhood['Net_Income_Before_Extra_Items'])

    # random cash flow from operations
    cash_flow_ops_syn = random.uniform(*neighbourhood['Cash_Flow_From_Operations'])

    # random total long term debt (current year, previous year)
    total_ltd_syn = random.uniform(*neighbourhood['Total_Long_Term_Debt'])
    total_ltd_py1_syn = random.uniform(*neighbourhood['Total_Long_Term_Debt_PY1'])

    # random current assets (current year, previous year)
    current_assets_syn = random.uniform(*neighbourhood['Current_Assets'])
    current_assets_py1_syn = random.uniform(*neighbourhood['Current_Assets_PY1'])

    # random current liabilities (current year, previous year)
    current_liabilities_syn = random.uniform(*neighbourhood['Current_Liabilities'])
    current_liabilities_py1_syn = random.uniform(*neighbourhood['Current_Liabilities_PY1'])

    # random common shares outstanding (current year, previous year)
    csho_syn = random.uniform(*neighbourhood['Common_Shares_Outstanding'])
    csho_py1_syn = random.uniform(*neighbourhood['Common_Shares_Outstanding_PY1'])

    # random total sales (current year, previous year)
    total_sales_syn = random.uniform(*neighbourhood['Total_Sales'])
    total_sales_py1_syn = random.uniform(*neighbourhood['Total_Sales_PY1'])

    # random cost of goods sold (current year, previous year)
    cogs_syn = random.uniform(*neighbourhood['Cost_Of_Goods_Sold'])
    cogs_py1_syn = random.uniform(*neighbourhood['Cost_Of_Goods_Sold_PY1'])

    # calculate b/m ratio from synthetic variables
    bm_ratio = (total_assets_syn - total_liabilities_syn) / market_cap_syn

    # calculate ROA (current year, previous year) from synthetic variables
    roa = net_income_bei_syn / total_assets_py1_syn
    roa_py1 = net_income_bei_py1_syn / total_assets_py2_syn

    # calculate CFO from synthetic variables
    cfo = cash_flow_ops_syn / total_assets_py1_syn

    # calculate CROA from synthetic variables
    croa = roa - roa_py1

    # calculate ACCRUAL from synthetic variables
    accrual = roa - cfo

    # calculate CLEVER from synthetic variables
    clever = (total_ltd_syn / (0.5 * (total_assets_syn + total_assets_py1_syn))) - \
             (total_ltd_py1_syn / (0.5 * (total_assets_py1_syn + total_assets_py2_syn)))

    # calculate CLIQUID from synthetic variables
    cliquid = (current_assets_syn / current_liabilities_syn) - \
              (current_assets_py1_syn / current_liabilities_py1_syn)

    # calculate EQ_OFFER from synthetic variables
    eq_offer = csho_syn - csho_py1_syn

    # calculate CMARGIN from synthetic variables
    cmargin = ((total_sales_syn - cogs_syn) / total_sales_syn) - \
              ((total_sales_py1_syn - cogs_py1_syn) / total_sales_py1_syn)

    # calculate CTURN from synthetic variables
    cturn = (total_sales_syn / total_assets_py1_syn) - \
            (total_sales_py1_syn / total_assets_py2_syn)

    # scoring functions: each signal contributes 1 when favourable and 0 otherwise
    f_roa = int(roa > 0)
    f_cfo = int(cfo > 0)
    f_croa = int(croa > 0)
    f_accrual = int(cfo > roa)
    f_clever = int(clever < 0)
    f_cliquid = int(cliquid > 0)
    f_eq_offer = int(eq_offer <= 0)
    f_cmargin = int(cmargin > 0)
    f_cturn = int(cturn > 0)

    # calculate composite F-SCORE from scoring functions
    f_score = f_roa + f_cfo + f_croa + f_accrual + \
              f_clever + f_cliquid + f_eq_offer + \
              f_cmargin + f_cturn

    # keep the observation only if its F-SCORE is equal to 0
    # note: the values are listed in the same positional order as the columns of funda_df
    if f_score==0:
        synthetic_rows.append([market_cap_syn, total_assets_syn, total_assets_py1_syn,
                               total_assets_py2_syn, total_liabilities_syn, net_income_bei_syn,
                               cash_flow_ops_syn, total_ltd_syn, total_ltd_py1_syn,
                               current_assets_syn, current_assets_py1_syn, current_liabilities_syn,
                               current_liabilities_py1_syn, csho_syn, csho_py1_syn,
                               total_sales_syn, total_sales_py1_syn, cogs_syn,
                               cogs_py1_syn, size_tercile_syn, bm_ratio,
                               roa, f_roa, cfo, f_cfo, croa, f_croa, accrual, f_accrual,
                               clever, f_clever, cliquid, f_cliquid, eq_offer,
                               f_eq_offer, cmargin, f_cmargin, cturn, f_cturn, f_score])
        count += 1

# add all accepted synthetic observations to funda_df in a single step and renumber the index
# note: DataFrame.append was removed in pandas 2.0, and growing a dataframe row by row is quadratic
funda_df = pd.concat([funda_df, pd.DataFrame(synthetic_rows, columns=funda_df.columns)],
                     ignore_index=True)

end = time.time()

print('time taken:', end - start)

time taken: 97.38894963264465


## Synthetic Data Generation - Other Minority F-SCOREs

In [11]:
# note: the other minority F-SCOREs have more than 1 instance
# note: the neighbourhood for these classes will be defined as their minimum and maximum values
# note: this neighbourhood will contain synthetic points that also have an equal F-SCORE
# note: the F-SCORE = 5 had the most instances (355 instances)
# note: the total number of data points for these minority F-SCOREs should be equal to 355 after synthetic data generation

# define a function to produce synthetic data points for given class
def syn_data_generator(data, F_SCORE, target_count=355, market_cap_bounds=None):
    """Generate synthetic observations whose recomputed F-SCORE equals ``F_SCORE``.

    Raw fundamentals are drawn uniformly between the minimum and maximum observed
    for the rows of ``data`` that already have the requested ``F_SCORE``. The nine
    Piotroski signals and the composite score are recomputed from the draws, and
    a draw is kept only when its composite score equals ``F_SCORE`` (rejection
    sampling). Draws are made in a fixed order with the module-level ``random``
    generator, so a seeded run is reproducible.

    Parameters
    ----------
    data : pandas.DataFrame
        Source observations. Must contain ``F_SCORE``, ``Size_Tercile`` (numeric
        codes) and the raw fundamental columns read below. Its columns must be
        laid out in the same positional order as the 40 values of a generated
        row, because rows are mapped onto ``data.columns`` by position.
    F_SCORE : int
        Class for which synthetic observations are generated.
    target_count : int, optional
        Desired class size after generation; ``target_count`` minus the number
        of existing rows of the class are generated. Defaults to 355.
    market_cap_bounds : dict, optional
        Maps the size tercile codes 0, 1 and 2 to ``(min, max)`` market
        capitalization pairs. When omitted, the module-level
        ``small_/medium_/large_market_cap_min/max`` values are used.

    Returns
    -------
    pandas.DataFrame
        The synthetic observations only (``data`` is not modified), with the
        columns of ``data`` and a fresh 0..n-1 index. Empty when the class
        already has at least ``target_count`` rows.

    Raises
    ------
    ValueError
        If observations are needed but ``data`` has no row with ``F_SCORE``;
        the sampling bounds would be NaN and the loop could never terminate.
    """
    class_rows = data.loc[data['F_SCORE']==F_SCORE]
    n_needed = target_count - class_rows.shape[0]

    # nothing to generate: the class already holds at least target_count instances
    if n_needed <= 0:
        return pd.DataFrame(columns=data.columns)

    if class_rows.empty:
        raise ValueError('no existing observation with F_SCORE = {} to define a '
                         'sampling neighbourhood'.format(F_SCORE))

    # fall back to the notebook-level market capitalization ranges
    if market_cap_bounds is None:
        market_cap_bounds = {0: (small_market_cap_min, small_market_cap_max),
                             1: (medium_market_cap_min, medium_market_cap_max),
                             2: (large_market_cap_min, large_market_cap_max)}

    # range of the numerically coded size terciles (0 to 2)
    size_tercile_min = data['Size_Tercile'].min()
    size_tercile_max = data['Size_Tercile'].max()

    # raw fundamentals that are drawn at random
    sampled_columns = ('Total_Assets', 'Total_Assets_PY1', 'Total_Assets_PY2', 'Total_Liabilities',
                       'Net_Income_Before_Extra_Items', 'Cash_Flow_From_Operations',
                       'Total_Long_Term_Debt', 'Total_Long_Term_Debt_PY1',
                       'Current_Assets', 'Current_Assets_PY1',
                       'Current_Liabilities', 'Current_Liabilities_PY1',
                       'Common_Shares_Outstanding', 'Common_Shares_Outstanding_PY1',
                       'Total_Sales', 'Total_Sales_PY1',
                       'Cost_Of_Goods_Sold', 'Cost_Of_Goods_Sold_PY1')

    # the class neighbourhood does not change while sampling, so it is computed once
    # instead of re-scanning the dataframe on every (mostly rejected) draw
    neighbourhood = {column: (class_rows[column].min(), class_rows[column].max())
                     for column in sampled_columns}

    def draw(column):
        # uniform draw between the class minimum and maximum of the given column
        low, high = neighbourhood[column]
        return random.uniform(low, high)

    # accepted synthetic observations, collected as plain rows and converted once at the end
    observations = []

    while len(observations) < n_needed:
        # random size tercile (0 to 2)
        size_tercile_syn = random.randint(size_tercile_min, size_tercile_max)

        # random market capitalization defined in relevant size tercile neighbourhood
        if size_tercile_syn==0:
            market_cap_low, market_cap_high = market_cap_bounds[0]
        elif size_tercile_syn==1:
            market_cap_low, market_cap_high = market_cap_bounds[1]
        else:
            market_cap_low, market_cap_high = market_cap_bounds[2]
        market_cap_syn = random.uniform(market_cap_low, market_cap_high)

        # random total assets (current year, previous year, pre-previous year)
        total_assets_syn = draw('Total_Assets')
        total_assets_py1_syn = draw('Total_Assets_PY1')
        total_assets_py2_syn = draw('Total_Assets_PY2')

        # random total liabilities
        total_liabilities_syn = draw('Total_Liabilities')

        # random net income before extraordinary items (current year, previous year)
        # NOTE(review): the previous-year value is drawn from the current-year column range,
        # as in the original code; the row layout has no previous-year net income column - confirm
        net_income_bei_syn = draw('Net_Income_Before_Extra_Items')
        net_income_bei_py1_syn = draw('Net_Income_Before_Extra_Items')

        # random cash flow from operations
        cash_flow_ops_syn = draw('Cash_Flow_From_Operations')

        # random total long term debt (current year, previous year)
        total_ltd_syn = draw('Total_Long_Term_Debt')
        total_ltd_py1_syn = draw('Total_Long_Term_Debt_PY1')

        # random current assets (current year, previous year)
        current_assets_syn = draw('Current_Assets')
        current_assets_py1_syn = draw('Current_Assets_PY1')

        # random current liabilities (current year, previous year)
        current_liabilities_syn = draw('Current_Liabilities')
        current_liabilities_py1_syn = draw('Current_Liabilities_PY1')

        # random common shares outstanding (current year, previous year)
        csho_syn = draw('Common_Shares_Outstanding')
        csho_py1_syn = draw('Common_Shares_Outstanding_PY1')

        # random total sales (current year, previous year)
        total_sales_syn = draw('Total_Sales')
        total_sales_py1_syn = draw('Total_Sales_PY1')

        # random cost of goods sold (current year, previous year)
        cogs_syn = draw('Cost_Of_Goods_Sold')
        cogs_py1_syn = draw('Cost_Of_Goods_Sold_PY1')

        # calculate b/m ratio from synthetic variables
        bm_ratio = (total_assets_syn - total_liabilities_syn) / market_cap_syn

        # calculate ROA (current year, previous year) from synthetic variables
        roa = net_income_bei_syn / total_assets_py1_syn
        roa_py1 = net_income_bei_py1_syn / total_assets_py2_syn

        # calculate CFO from synthetic variables
        cfo = cash_flow_ops_syn / total_assets_py1_syn

        # calculate CROA from synthetic variables
        croa = roa - roa_py1

        # calculate ACCRUAL from synthetic variables
        accrual = roa - cfo

        # calculate CLEVER from synthetic variables
        clever = (total_ltd_syn / (0.5 * (total_assets_syn + total_assets_py1_syn))) - \
                 (total_ltd_py1_syn / (0.5 * (total_assets_py1_syn + total_assets_py2_syn)))

        # calculate CLIQUID from synthetic variables
        cliquid = (current_assets_syn / current_liabilities_syn) - \
                  (current_assets_py1_syn / current_liabilities_py1_syn)

        # calculate EQ_OFFER from synthetic variables
        eq_offer = csho_syn - csho_py1_syn

        # calculate CMARGIN from synthetic variables
        cmargin = ((total_sales_syn - cogs_syn) / total_sales_syn) - \
                  ((total_sales_py1_syn - cogs_py1_syn) / total_sales_py1_syn)

        # calculate CTURN from synthetic variables
        cturn = (total_sales_syn / total_assets_py1_syn) - \
                (total_sales_py1_syn / total_assets_py2_syn)

        # scoring functions: each signal contributes 1 when favourable and 0 otherwise
        f_roa = int(roa > 0)
        f_cfo = int(cfo > 0)
        f_croa = int(croa > 0)
        f_accrual = int(cfo > roa)
        f_clever = int(clever < 0)
        f_cliquid = int(cliquid > 0)
        f_eq_offer = int(eq_offer <= 0)
        f_cmargin = int(cmargin > 0)
        f_cturn = int(cturn > 0)

        # calculate composite F-SCORE from scoring functions
        f_score = f_roa + f_cfo + f_croa + f_accrual + \
                  f_clever + f_cliquid + f_eq_offer + \
                  f_cmargin + f_cturn

        # keep the observation only if its F-SCORE is equal to 'F_SCORE'
        # note: the values are listed in the same positional order as the columns of data
        if f_score==F_SCORE:
            observations.append([market_cap_syn, total_assets_syn, total_assets_py1_syn,
                                 total_assets_py2_syn, total_liabilities_syn, net_income_bei_syn,
                                 cash_flow_ops_syn, total_ltd_syn, total_ltd_py1_syn,
                                 current_assets_syn, current_assets_py1_syn, current_liabilities_syn,
                                 current_liabilities_py1_syn, csho_syn, csho_py1_syn,
                                 total_sales_syn, total_sales_py1_syn, cogs_syn,
                                 cogs_py1_syn, size_tercile_syn, bm_ratio,
                                 roa, f_roa, cfo, f_cfo, croa, f_croa, accrual, f_accrual,
                                 clever, f_clever, cliquid, f_cliquid, eq_offer,
                                 f_eq_offer, cmargin, f_cmargin, cturn, f_cturn, f_score])

    # build the result once; DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic
    return pd.DataFrame(observations, columns=data.columns)

In [12]:
# time
start = time.time()

# run syn_data_generator function for minority F-SCOREs (1, 2, 3, 4, 6, 7, 8, 9)
# note: the classes are processed in ascending order so that a seeded run draws the same random numbers
# note: every call receives funda_df as it stands before any of these synthetic points are added
(F_SCORE_1_df, F_SCORE_2_df, F_SCORE_3_df, F_SCORE_4_df,
 F_SCORE_6_df, F_SCORE_7_df, F_SCORE_8_df, F_SCORE_9_df) = [
    syn_data_generator(funda_df, minority_score)
    for minority_score in (1, 2, 3, 4, 6, 7, 8, 9)
]

# append dataframes containing synthetic data points to funda_df and reset funda_df index
# note: DataFrame.append was removed in pandas 2.0; a single concat also avoids repeated copies
funda_df = pd.concat([funda_df,
                      F_SCORE_1_df, F_SCORE_2_df, F_SCORE_3_df, F_SCORE_4_df,
                      F_SCORE_6_df, F_SCORE_7_df, F_SCORE_8_df, F_SCORE_9_df],
                     ignore_index=True)

end = time.time()

print('time taken:', end - start)

time taken: 711.2443113327026


## Model Dataframe

In [15]:
# note: a new dataframe containing the 'raw' inputs of the model(s) has to be created

# columns that form the raw model inputs: each signal, its binary score, and the composite F_SCORE
model_input_columns = ['ROA', 'F_ROA', 'CFO', 'F_CFO', 'CROA', 'F_CROA',
                       'ACCRUAL', 'F_ACCRUAL', 'CLEVER', 'F_CLEVER', 'CLIQUID',
                       'F_CLIQUID', 'EQ_OFFER', 'F_EQ_OFFER', 'CMARGIN', 'F_CMARGIN',
                       'CTURN', 'F_CTURN', 'F_SCORE']

# create new dataframe holding only the raw model inputs
model_df = funda_df[model_input_columns]

## Save Data to CSV File

In [16]:
# save model_df to csv file
# note: index=False keeps the dataframe index out of the file, so only the model input columns are written
model_df.to_csv('modelinputdata.csv', index=False)