In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Read the two raw tables, both through the fastparquet engine, in the same
# order as before: companies first, then deals.
companies, deals = (
    pd.read_parquet(parquet_path, engine='fastparquet')
    for parquet_path in ('companies.parquet', 'deals.parquet')
)
print('Loaded companies and deals; join is handled by EDA (data_clean.ipynb)')

Loaded companies and deals; join is handled by EDA (data_clean.ipynb)


## Transforming and cleaning "deals"


In [3]:
print('Joining companies onto deals (left join on CompanyID)')

# Work on copies so the frames loaded from disk are never touched by this cell.
_deals = deals.copy()
_companies = companies.copy()

# Left join keeps every deal row. Column names present in both frames receive
# pandas' default suffixes: `_x` for the deals side, `_y` for the companies side.
deals_joined = _deals.merge(_companies, on='CompanyID', how='left')

# `RowID_y` is the companies-side RowID; it is non-null only where a company matched.
matched = deals_joined['RowID_y'].notna().sum()

print(f'Before shape: {_deals.shape}, After shape: {deals_joined.shape}, Matched company rows: {matched}')

# A left join duplicates deal rows whenever CompanyID is repeated in `companies`.
# Fail here, before the file is written, rather than persisting a fanned-out table.
assert len(deals_joined) == len(_deals), (
    f'Left join changed the row count from {len(_deals)} to {len(deals_joined)}; '
    'CompanyID is probably not unique in companies'
)

# write joined parquet for reuse by EDA and other scripts
joined_path = 'deals_joined.parquet'
deals_joined.to_parquet(joined_path, index=False)
print(f'Wrote joined parquet: {joined_path} (rows: {len(deals_joined)})')


Joining companies onto deals (left join on CompanyID)
Before shape: (30680, 39), After shape: (30680, 79), Matched company rows: 30680
Wrote joined parquet: deals_joined.parquet (rows: 30680)


In [4]:

# Display the column labels of the joined frame (overlapping names carry the
# _x / _y merge suffixes); as the cell's last expression it is rendered as output.
deals_joined.columns

Index(['CompanyID', 'CompanyName_x', 'DealNo', 'DealDate', 'AnnouncedDate',
       'DealSize', 'DealStatus', 'DealSizeStatus', 'PremoneyValuation',
       'PostValuation', 'PostValuationStatus', 'PercentAcquired',
       'RaisedToDate', 'VCRound', 'VCRoundUp_Down_Flat',
       'TotalInvestedCapital', 'InvestorOwnership', 'StockSplit', 'DealType',
       'DealType2', 'DealType3', 'DealClass', 'DealSynopsis',
       'NativeCurrencyOfDeal', 'TotalInvestedEquity', 'AddOn', 'AddOnSponsors',
       'AddOnPlatform', 'TotalNewDebt', 'Debts', 'DebtRaisedInRound',
       'ContingentPayout', 'Employees_x', 'BusinessStatus_x',
       'FinancingStatus', 'SiteLocation', 'ExitScope', 'RowID_x',
       'LastUpdated_x', 'CompanyName_y', 'CompanyAlsoKnownAs',
       'CompanyFormerName', 'CompanyLegalName', 'Description', 'Keywords',
       'CompanyFinancingStatus', 'CompanyFinancingStatusDate', 'TotalRaised',
       'TotalRaisedNativeAmount', 'TotalRaisedNativeCurrency',
       'BusinessStatus_y', 'Busi

In [None]:

# Preview only the first rows instead of rendering the whole joined frame.
deals_joined.head()

In [9]:
# Derive a `<column>_year` feature from each date column via the `.dt` accessor.
# The new columns are added to `deals_joined` in place.
date_columns = ('DealDate', 'AnnouncedDate')
for source_col in date_columns:
    year_col = source_col + '_year'
    deals_joined[year_col] = deals_joined[source_col].dt.year

In [None]:




# Load the project-local imputer script as a module so this cell also runs on a
# fresh kernel; previously `imputer` existed only if a later cell had been run first.
from importlib import util

spec = util.spec_from_file_location('imputer_mod', 'scripts/impute_postvaluation.py')
imputer = util.module_from_spec(spec)
spec.loader.exec_module(imputer)

# Feature columns used to predict the missing PostValuation values.
numerical_feat = [
    "DealNo", "DealSize", "PercentAcquired", "RaisedToDate",
    "TotalInvestedCapital", "TotalInvestedEquity", "TotalNewDebt",
    "DebtRaisedInRound", "TotalRaised", "TotalRaisedNativeAmount",
    "Employees_y", "YearFounded", "DealDate_year", "AnnouncedDate_year",
]
# Restored from the commented-out definition: the second imputer call below
# reads this name and raised NameError on a fresh kernel without it.
categorical_feat = [
    "NativeCurrencyOfDeal", "AddOn", "BusinessStatus_x", "FinancingStatus",
    "CompanyFinancingStatus", "BusinessStatus_y",
]

# Log-transformed copy of the numerical features; values <= 0 (and NaN) become NaN.
# Masking before taking the log keeps np.log from being evaluated on non-positive
# values, which is what produced the RuntimeWarning in this cell's output.
deals_joined_log = deals_joined.copy()
for col in numerical_feat:
    positive_only = deals_joined_log[col].where(deals_joined_log[col] > 0)
    deals_joined_log[col] = np.log(positive_only)

# NOTE(review): both calls below are given `deals_joined`, not `deals_joined_log`,
# so the log-transformed copy is not used for fitting — confirm which was intended.
# NOTE(review): later cells call impute_df_basic with numerical=/numerical_to_log=/
# categorical= keywords; confirm the script still accepts feature_cols=/categorical_cols=.

# Run 1: numerical features only.
df_imputed, model = imputer.impute_df_basic(
    deals_joined,
    feature_cols=numerical_feat,
    target_col='PostValuation')

# Run 2: numerical + categorical features. This rebinds `df_imputed` and `model`,
# so only the result of this second run remains available afterwards.
df_imputed, model = imputer.impute_df_basic(
    deals_joined,
    feature_cols=numerical_feat + categorical_feat,
    categorical_cols=categorical_feat,
    target_col='PostValuation')

# Metrics recorded from earlier runs (first pair matches run 1 in the saved output;
# the second pair is presumably an earlier execution of run 2 — confirm):
# Train RMSE: 586.0705, Train R2: 0.9526
# Test  RMSE: 1016.5981, Test R2: 0.8911
# Train RMSE: 570.3943, Train R2: 0.9551
# Test  RMSE: 1134.0478, Test R2: 0.8645


  result = getattr(ufunc, method)(*inputs, **kwargs)


Train RMSE: 586.0705, Train R2: 0.9526
Test  RMSE: 1016.5981, Test R2: 0.8911
Train RMSE: 577.6674, Train R2: 0.9539
Test  RMSE: 1092.2788, Test R2: 0.8743


In [None]:
# Load scripts/impute_postvaluation.py as a module and run impute_df_basic on the
# joined deals with every numerical feature left on its raw scale
# (numerical_to_log is empty) plus the categorical columns listed below.
from importlib import util

spec = util.spec_from_file_location('imputer_mod', 'scripts/impute_postvaluation.py')
imputer = util.module_from_spec(spec)
# Executes the script's top-level code so its functions become attributes of `imputer`.
spec.loader.exec_module(imputer)

raw_numerical = [
    'DealSize', 'PremoneyValuation', 'PercentAcquired', 'RaisedToDate',
    'TotalInvestedCapital', 'InvestorOwnership', 'TotalInvestedEquity',
    'TotalNewDebt', 'DebtRaisedInRound', 'ContingentPayout', 'Employees_x',
    'TotalRaised', 'TotalRaisedNativeAmount', 'Employees_y', 'YearFounded',
    'DealDate_year', 'AnnouncedDate_year',
]
categorical_features = [
    'DealStatus', 'DealSizeStatus', 'PostValuationStatus', 'VCRound',
    'VCRoundUp_Down_Flat', 'DealType', 'DealType2', 'DealType3', 'DealClass',
    'NativeCurrencyOfDeal', 'AddOn', 'BusinessStatus_x', 'FinancingStatus',
    'SiteLocation', 'ExitScope', 'CompanyFinancingStatus',
    'TotalRaisedNativeCurrency', 'OwnershipStatus', 'Universe', 'Exchange',
    'PrimaryContactPrefix', 'PrimaryContactSuffix', 'PrimaryContactTitle',
]

dfimputed, model = imputer.impute_df_basic(
    deals_joined,
    numerical=raw_numerical,
    numerical_to_log=[],
    categorical=categorical_features,
    target_col='PostValuation',
)

# Metrics noted from earlier executions of this cell (the exact configuration
# behind each pair was not recorded):
# Train RMSE: 474.1165, Train R2: 0.9690
# Test  RMSE: 1249.9156, Test R2: 0.8353


# Train RMSE: 511.4826, Train R2: 0.9639
# Test  RMSE: 1531.9159, Test R2: 0.7527

# Estimator parameters noted from a run (looks like a scikit-learn forest
# regressor's get_params() output — confirm):
# {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 15, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=1, min_samples_split=2; total time=  16.9s
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=1, min_samples_split=2; total time=  17.3s
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=1, min_samples_split=2; total time=  18.2s
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=1, min_samples_split=2; total time=  17.4s
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=2, min_samples_split=2; total time=  23.9s
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=1, min_samples_split=2; total time=  24.0s
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=2, min_samples_split=2; total time=  24.6s
[CV] END bootstrap=True, max_depth=5, max_features=0.5, min_samples_leaf=2, min_samples_split=2; total time=  25.6s
[CV] END b



[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=3; total time=  38.1s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=3; total time=  37.2s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=3; total time=  29.5s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=3; total time=  29.4s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2; total time=  28.4s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2; total time=  28.4s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2; total time=  29.1s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2; total time=  29.2s
[CV] END bootstrap=True, max_depth=5, max_features=1.0, min_samples_leaf

In [None]:
# Load scripts/impute_postvaluation.py as a module and run impute_df_basic with
# the monetary columns routed through `numerical_to_log` and the remaining
# numerical columns passed unchanged, plus the categorical columns listed below.
# NOTE(review): the saved output of this cell is KeyError "[''] not in index",
# i.e. an empty-string column label was looked up; none of the lists below
# contains '' — confirm whether that output is stale or comes from the script.
from importlib import util

spec = util.spec_from_file_location('imputer_mod', 'scripts/impute_postvaluation.py')
imputer = util.module_from_spec(spec)
# Executes the script's top-level code so its functions become attributes of `imputer`.
spec.loader.exec_module(imputer)

impute_kwargs = {
    'numerical': [
        'PercentAcquired', 'InvestorOwnership', 'TotalNewDebt',
        'ContingentPayout', 'Employees_x', 'Employees_y', 'YearFounded',
        'DealDate_year', 'AnnouncedDate_year',
    ],
    'numerical_to_log': [
        'DealSize', 'PremoneyValuation', 'RaisedToDate', 'TotalInvestedCapital',
        'DebtRaisedInRound', 'TotalInvestedEquity', 'TotalRaised',
        'TotalRaisedNativeAmount',
    ],
    'categorical': [
        'DealStatus', 'DealSizeStatus', 'PostValuationStatus', 'VCRound',
        'VCRoundUp_Down_Flat', 'DealType', 'DealType2', 'DealType3', 'DealClass',
        'NativeCurrencyOfDeal', 'AddOn', 'BusinessStatus_x', 'FinancingStatus',
        'SiteLocation', 'ExitScope', 'CompanyFinancingStatus',
        'TotalRaisedNativeCurrency', 'OwnershipStatus', 'Universe', 'Exchange',
        'PrimaryContactPrefix', 'PrimaryContactSuffix', 'PrimaryContactTitle',
    ],
    'target_col': 'PostValuation',
}

dfimputed, model = imputer.impute_df_basic(deals_joined, **impute_kwargs)

# Metrics noted from an earlier execution of this cell:
# Train RMSE: 598.9358, Train R2: 0.9569
# Test  RMSE: 764.9720, Test R2: 0.8869


# xgboost
# Train RMSE: 28.1177, Train R2: 0.9999
# Test  RMSE: 1500.6531, Test R2: 0.5648

KeyError: "[''] not in index"

In [213]:
# Print the path of the Python interpreter backing this kernel.
import sys

interpreter_path = sys.executable
print(interpreter_path)

/Library/Developer/CommandLineTools/usr/bin/python3


In [None]:
# Quick per-column profile: print the most frequent values of every column.
# (The former leading `deals_joined[['DealNo']]` expression was dropped: it was not
# the cell's last expression, so its value was discarded and never displayed.)
top_n = 5
for col in deals_joined.columns:
    print(f"--- Column: {col} ---")
    # value_counts() returns counts sorted in descending order, so head(top_n)
    # yields the top_n most frequent distinct values.
    print(deals_joined[col].value_counts().head(top_n))
