In [184]:
import pandas as pd
import features
import importlib

# Loading the Cleaned DataFrames for feature engineering.
comprehensive_income_df = pd.read_csv('../Data/Cleaned_Pandas_DataFrames/comprehensive_income_df.csv', index_col = [0])

income_statement_df = pd.read_csv('../Data/Cleaned_Pandas_DataFrames/income_statement_df.csv', index_col = [0])

cash_flow_df = pd.read_csv('../Data/Cleaned_Pandas_DataFrames/cash_flow_df.csv', index_col = [0])

balance_sheet_df = pd.read_csv('../Data/Cleaned_Pandas_DataFrames/balance_sheet_df.csv', index_col = [0])

filing_information_df = pd.read_csv('../Data/Cleaned_Pandas_DataFrames/filing_information_df.csv', index_col = [0])

company_information_df = pd.read_csv('../Data/Cleaned_Pandas_DataFrames/company_information_df.csv', index_col = [0])

full_df_list = [comprehensive_income_df, income_statement_df, cash_flow_df, balance_sheet_df, filing_information_df, company_information_df]

statements_list = [comprehensive_income_df, income_statement_df, cash_flow_df, balance_sheet_df]
                   
other_information_list = [filing_information_df, company_information_df]


In [191]:
# The feature functions are ran on the proper DataFrames, and outputted into a series for each.
importlib.reload(features)

current_ratio = features.current_ratio(balance_sheet_df)
operating_cash_flow = features.operating_cash_flow(cash_flow_df, balance_sheet_df)
debt_to_equity = features.debt_to_equity(balance_sheet_df)
interest_coverage = features.interest_coverage(income_statement_df)
operating_margin = features.operating_margin(income_statement_df)
return_on_assets = features.return_on_assets(income_statement_df, balance_sheet_df)
return_on_equity = features.return_on_equity(income_statement_df, balance_sheet_df)

# The series resulting from each function are concatenated into a DataFrame.
feature_list = [current_ratio, operating_cash_flow, debt_to_equity, interest_coverage, operating_margin, return_on_assets, return_on_equity]
feature_df = pd.concat(feature_list, axis = 1)
feature_df.columns = ['current_ratio', 'operating_cash_flow', 'debt_to_equity', 'interest_coverage', 'operating_margin', 'return_on_assets', 'return_on_equity']

In [192]:
# Some brief exploration of the created DataFrame.
feature_df['ticker'] = filing_information_df[filing_information_df.index.isin(feature_df.index)]['ticker'].values

feature_df = feature_df.merge(company_information_df[['office', 'industry']], how = 'right', left_on = 'ticker', right_index = True)

feature_df.drop('ticker', axis = 1, inplace = True)

feature_df

Unnamed: 0,current_ratio,operating_cash_flow,debt_to_equity,interest_coverage,operating_margin,return_on_assets,return_on_equity,office,industry
0001090872:2011:FY,3.031573,0.477953,1.098471,12.453488,0.533485,0.111737,0.234476,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...
0001090872:2012:FY,2.445325,-0.621236,1.032015,11.079208,0.525518,0.109434,0.222372,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...
0001090872:2013:FY,3.110487,0.202247,1.020420,8.887850,0.521233,0.067752,0.136888,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...
0001090872:2014:FY,3.231492,0.207403,1.043199,7.353982,0.514683,0.046533,0.095076,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...
0001090872:2015:FY,3.776639,-1.050205,0.793525,7.909091,0.505448,0.053617,0.096163,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...
...,...,...,...,...,...,...,...,...,...
0000783325:2018:FY,,,,3.299034,0.622645,,,Office of Technology,SERVICES-PREPACKAGED SOFTWARE
0000783325:2019:FY,,,,3.053639,0.643923,,,Office of Technology,GENERAL INDUSTRIAL MACHINERY & EQUIPMENT
0000783325:2020:FY,,,,3.455742,0.679702,,,Industrial Applications and Services,ELECTROMEDICAL & ELECTROTHERAPEUTIC APPARATUS
0000783325:2021:FY,,,,3.640204,0.601852,,,Industrial Applications and Services,ELECTROMEDICAL & ELECTROTHERAPEUTIC APPARATUS


In [193]:
# A final bout of cleaning is conducted on this final dataframe.
feature_df = features.final_df_check(feature_df)
feature_df

<class 'pandas.core.frame.DataFrame'>
Index: 22860 entries, 0001090872:2011:FY to 0001131457:2011:FY
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   current_ratio        22779 non-null  float64
 1   operating_cash_flow  22681 non-null  float64
 2   debt_to_equity       22779 non-null  float64
 3   interest_coverage    22719 non-null  float64
 4   operating_margin     22719 non-null  float64
 5   return_on_assets     22660 non-null  float64
 6   return_on_equity     22660 non-null  float64
 7   office               22860 non-null  object 
 8   industry             22860 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.7+ MB
None
The column current_ratio has 0 rows with values of zero!
The column operating_cash_flow has 31 rows with values of zero!
The column debt_to_equity has 0 rows with values of zero!
The column interest_coverage has 6897 rows with values of zero!
The column operat

Unnamed: 0,current_ratio,operating_cash_flow,debt_to_equity,interest_coverage,operating_margin,return_on_assets,return_on_equity,office,industry,has_interest_payments
0001090872:2011:FY,3.031573,0.477953,1.098471,12.453488,0.533485,0.111737,0.234476,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...,1
0001090872:2012:FY,2.445325,-0.621236,1.032015,11.079208,0.525518,0.109434,0.222372,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...,1
0001090872:2013:FY,3.110487,0.202247,1.020420,8.887850,0.521233,0.067752,0.136888,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...,1
0001090872:2014:FY,3.231492,0.207403,1.043199,7.353982,0.514683,0.046533,0.095076,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...,1
0001090872:2015:FY,3.776639,-1.050205,0.793525,7.909091,0.505448,0.053617,0.096163,Industrial Applications and Services,INSTRUMENTS FOR MEAS & TESTING OF ELECTRICITY ...,1
...,...,...,...,...,...,...,...,...,...,...
0001423774:2022:FY,1.497216,0.088451,1.586365,0.000000,0.595931,-0.225325,-0.582772,Office of Trade & Services,WHOLESALE-METALS SERVICE CENTERS & OFFICES,0
0001439288:2021:FY,1.975042,-0.642679,7.526108,3.083573,0.409705,0.112183,0.956487,Office of Trade & Services,WHOLESALE-METALS SERVICE CENTERS & OFFICES,1
0000846475:2019:FY,4.342120,0.752742,0.438813,0.000000,0.806166,0.335679,0.482980,Office of Trade & Services,WHOLESALE-METALS SERVICE CENTERS & OFFICES,0
0000846475:2020:FY,6.229416,2.484234,0.269457,0.000000,0.782619,0.125722,0.159599,Office of Trade & Services,WHOLESALE-METALS SERVICE CENTERS & OFFICES,0


In [196]:
# The DataFrame is outputted for use in the machine learning models.
print(feature_df.info())
feature_df.to_csv('../Data/model_data.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 22596 entries, 0001090872:2011:FY to 0000846475:2021:FY
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   current_ratio          22596 non-null  float64
 1   operating_cash_flow    22596 non-null  float64
 2   debt_to_equity         22596 non-null  float64
 3   interest_coverage      22596 non-null  float64
 4   operating_margin       22596 non-null  float64
 5   return_on_assets       22596 non-null  float64
 6   return_on_equity       22596 non-null  float64
 7   office                 22596 non-null  object 
 8   industry               22596 non-null  object 
 9   has_interest_payments  22596 non-null  int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 1.9+ MB
None
