In [36]:
import numpy as np
import pandas as pd
import xgboost
import pickle
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [12]:
def clip_series(s, lower, upper):
   clipped = s.clip(lower=s.quantile(lower), upper=s.quantile(upper), axis=0)
   return clipped

In [13]:
def generate_features(data):
    features = pd.DataFrame()
    # ratios from https://corporatefinanceinstitute.com/resources/knowledge/finance/financial-ratios/
    # LIQUIDITY RATIOS
    # current ratio
    # a company’s ability to pay off short-term liabilities with current assets
    features['currentRatio'] = data.totalCurrentAssets / data.totalCurrentLiabilities
    # acid-test ratio
    # a company’s ability to pay off short-term liabilities with quick assets
    features['acidTestRatio'] = (data.totalCurrentAssets - data.inventory)/data.totalCurrentLiabilities
    # cash ratio
    # a company’s ability to pay off short-term liabilities
    features['cashRatio'] = data.cashAndCashEquivalents / data.totalCurrentLiabilities
    # operating cash flow ratio
    # number of times a company can pay off current liabilities with the cash generated in a given period
    features['operatingCashFlowRatio'] = data.operatingCashFlow / data.totalCurrentLiabilities
    # LEVERAGE FINANCIAL RATIOS
    # debt ratio
    # relative amount of a company’s assets that are provided from debt
    features['debtRatio'] = data.totalLiabilities / data.totalAssets
    # debt to equity ratio
    # weight of total debt and financial liabilities against shareholders equity
    features['debtToEquityRatio'] = data.totalLiabilities / data.totalStockholdersEquity
    # interest coverage ratio
    # how easily a company can pay its interest expenses
    features['interestCoverageRatio'] = data.operatingIncome / data.interestExpense
    # debt service coverage ratio
    #  how easily a company can pay its debt obligations
    features['debtServiceCoverageRatio'] = data.operatingIncome/data.debtRepayment
    # EFFICIENCY RATIOS
    # asset turnover ratio
    # a company’s ability to generate sales from assets
    avg_total_assets = data.groupby('symbol').totalAssets.rolling(2).mean().reset_index(0,drop=True)
    features['assetTurnoverRatio'] = data.revenue / avg_total_assets
    # inventory turnover ratio
    #  how many times a company’s inventory is sold and replaced over a given period
    avg_inventory = data.groupby('symbol').inventory.rolling(2).mean().reset_index(0,drop=True)
    features['inventoryTurnoverRatio'] = data.costOfRevenue/avg_inventory
    # daysales in inventory ratio
    # average number of days that a company holds on to inventory before selling it to customers
    features['daysSalesInventoryRatio'] = 365/features['inventoryTurnoverRatio']
    # PROFITABILITY
    # Gross profit ratio
    # how much profit a company makes after paying its cost of goods sold
    features['grossProfitRatio'] = data.grossProfitRatio
    # Operating Income ratio
    # operating income of a company to its net sales to determine operating efficiency
    features['operatingIncomeRatio'] = data.operatingIncomeRatio
    # return on assets ratio
    # how efficiently a company is using its assets to generate profit
    features['returnOnAssetsRatio'] = data.netIncome/data.totalAssets
    # return on equity ratio
    # how efficiently a company is using its equity to generate profit
    features['returnOnEquityRatio'] = data.netIncome / data.totalStockholdersEquity
    # VERTICAL ANALYSIS OF INCOME
    features['revenue'] = data.revenue
    features['costOfRevenueRatio'] = data.costOfRevenue / data.revenue
    features['researchAndDevelopmentExpensesRatio'] = data.researchAndDevelopmentExpenses / data.revenue
    features['sellingGeneralAndAdministrativeExpensesRatio'] = data.sellingGeneralAndAdministrativeExpenses / data.revenue
    features['interestExpenseRatio'] = data.interestExpense / data.revenue
    features['incomeBeforeTaxRatio'] = data.incomeBeforeTax / data.revenue
    features['incomeTaxExpenseRatio'] = data.incomeTaxExpense / data.revenue
    features['netIncomeRatio'] = data.netIncome / data.revenue
    features['netIncome'] = data.netIncome
    # HORIZONTAL ANALYSIS of the income statement
    features['revenue_yoy'] = data.groupby('symbol').revenue.pct_change(1)
    features['costOfRevenue_yoy'] = data.groupby('symbol').costOfRevenue.pct_change(1)
    features['researchAndDevelopmentExpenses_yoy'] = data.groupby('symbol').researchAndDevelopmentExpenses.pct_change(1)
    features['sellingGeneralAndAdministrativeExpenses_yoy'] = data.groupby('symbol').sellingGeneralAndAdministrativeExpenses.pct_change(1)
    features['interestExpense_yoy'] = data.groupby('symbol').interestExpense.pct_change(1)
    features['incomeBeforeTax_yoy'] = data.groupby('symbol').incomeBeforeTax.pct_change(1)
    features['incomeTaxExpense_yoy'] = data.groupby('symbol').incomeTaxExpense.pct_change(1)
    features['netIncome_yoy'] = data.groupby('symbol').netIncome.pct_change(1)
    # VERTICAL ANALYSIS OF CASH FLOW
    # Cash flow from Operating Activities CFO
    features['CFO'] = data['netCashProvidedByOperatingActivities']
    features['depreciationAndAmortizationRatio'] = data.depreciationAndAmortization / data.netCashProvidedByOperatingActivities
    features['stockBasedCompensationRatio'] = data.stockBasedCompensation / data.netCashProvidedByOperatingActivities
    features['changeInWorkingCapitalRatio'] = data.changeInWorkingCapital / data.netCashProvidedByOperatingActivities
    # Cash flow from Investing Activities CFI
    features['CFI'] = data['netCashUsedForInvestingActivites']
    features['capitalExpenditurePctSales'] = data.capitalExpenditure / data.revenue
    features['capitalExpenditureRatio'] = data.capitalExpenditure / data.netCashUsedForInvestingActivites
    # Free Cash flow
    features['freeCashFlow'] = data['freeCashFlow']
    features['freeCashFlowRatio'] = data.freeCashFlow / data.revenue
    # Cash flow from Financial Activities CFF
    features['CFF'] = data.netCashUsedProvidedByFinancingActivities
    features['TotalSpentRepurchase'] = data.commonStockRepurchased - data.commonStockIssued
    features['dividendsPaid'] = data['dividendsPaid']
    features['repurchaseDividendsRatio'] = (features['TotalSpentRepurchase'] + features['dividendsPaid'])/features['freeCashFlow']
    # HORIZONTAL ANALYSIS CASH FLOW
    features['freeCashFlow_yoy'] =  data.groupby('symbol').freeCashFlow.pct_change(1)
    features['capitalExpenditure_yoy'] =  data.groupby('symbol').capitalExpenditure.pct_change(1)
    features['CFI_yoy'] = data.groupby('symbol').netCashProvidedByOperatingActivities.pct_change(1)
    return features

In [26]:
train = pd.read_csv('../data/processed/train.csv')
target_train = train.target
train = generate_features(train).replace([np.inf, -np.inf], np.nan).fillna(0)
train.shape, target_train.shape

((7168, 48), (7168,))

In [32]:
benchmark = make_pipeline(
    PowerTransformer(), 
    xgboost.XGBRegressor(
        n_estimators = 1000,
        alpha=0.0014402765639493342,
        colsample_bytree=0.8572705090242294,
        eta=0.16926740470716026,
        gamma=15,
        max_depth=10,
        min_child_weight=77,
        subsample=0.9666913889304912
        ))
benchmark.fit(train, target_train)

Pipeline(steps=[('powertransformer', PowerTransformer()),
                ('xgbregressor',
                 XGBRegressor(alpha=0.0014402765639493342, base_score=0.5,
                              booster='gbtree', colsample_bylevel=1,
                              colsample_bynode=1,
                              colsample_bytree=0.8572705090242294,
                              enable_categorical=False, eta=0.16926740470716026,
                              gamma=15, gpu_id=-1, importance_type=None,
                              interaction_constraints='',
                              learning_rate=0.169267401, max_delta_step=0,
                              max_depth=10, min_child_weight=77, missing=nan,
                              monotone_constraints='()', n_estimators=1000,
                              n_jobs=12, num_parallel_tree=1, predictor='auto',
                              random_state=0, reg_alpha=0.00144027651,
                              reg_lambda=1, scale_pos_we

In [33]:
test = pd.read_csv('../data/processed/test.csv')
target_test = test.target
test = generate_features(test).replace([np.inf, -np.inf], np.nan).fillna(0)
test.shape, target_test.shape

((2390, 48), (2390,))

In [35]:
test.to_csv('./test.csv', index=False)

In [37]:
with open('model_v0.pickle', 'wb') as f:
    pickle.dump(benchmark, f)

In [34]:
preds = benchmark.predict(test)
mean_squared_error(y_true=target_test, y_pred=preds)

2422.4737923926623