In [2]:
import pandas as pd
import numpy as np
import sklearn

In [3]:
# Read the two raw tables with the fastparquet engine; companies is read first, then deals.
companies, deals = (
    pd.read_parquet(parquet_path, engine='fastparquet')
    for parquet_path in ('companies.parquet', 'deals.parquet')
)
# NOTE(review): the message below says the join lives in data_clean.ipynb, but a later
# cell of this notebook performs the join itself -- confirm which one is intended.
print('Loaded companies and deals; join is handled by EDA (data_clean.ipynb)')

Loaded companies and deals; join is handled by EDA (data_clean.ipynb)


## Transforming and cleaning "deals"


In [5]:
# Attach company attributes to every deal with a left join on CompanyID, report how
# many deals found a company, and persist the joined table for other notebooks/scripts.
print('Joining companies onto deals (left join on CompanyID)')

# Working copies; kept under their original names in case later cells reference them.
_deals = deals.copy()
_companies = companies.copy()

deals_joined = _deals.merge(_companies, on='CompanyID', how='left')

# A left join onto a table with unique keys keeps exactly one output row per deal.
# A different row count means CompanyID is duplicated in companies, which would
# silently duplicate deals in everything downstream.
assert len(deals_joined) == len(_deals), (
    f'Join changed the number of deal rows: {len(_deals)} -> {len(deals_joined)}; '
    'check companies for duplicate CompanyID values'
)

# Count matches by key membership instead of inspecting a suffixed right-hand column
# (RowID_y), so the count does not depend on which columns happen to collide or on
# RowID being non-null in companies.
matched = _deals['CompanyID'].isin(_companies['CompanyID']).sum()

print(f'Before shape: {_deals.shape}, After shape: {deals_joined.shape}, Matched company rows: {matched}')

# write joined parquet for reuse by EDA and other scripts
joined_path = 'deals_joined.parquet'
deals_joined.to_parquet(joined_path, index=False)
print(f'Wrote joined parquet: {joined_path} (rows: {len(deals_joined)})')


Joining companies onto deals (left join on CompanyID)
Before shape: (30680, 39), After shape: (30680, 79), Matched company rows: 30680
Wrote joined parquet: deals_joined.parquet (rows: 30680)
Wrote joined parquet: deals_joined.parquet (rows: 30680)


In [6]:

# List the column labels of the joined frame. Columns present in both input tables
# carry the merge suffixes _x (from deals, the left side) and _y (from companies).
deals_joined.columns

Index(['CompanyID', 'CompanyName_x', 'DealNo', 'DealDate', 'AnnouncedDate',
       'DealSize', 'DealStatus', 'DealSizeStatus', 'PremoneyValuation',
       'PostValuation', 'PostValuationStatus', 'PercentAcquired',
       'RaisedToDate', 'VCRound', 'VCRoundUp_Down_Flat',
       'TotalInvestedCapital', 'InvestorOwnership', 'StockSplit', 'DealType',
       'DealType2', 'DealType3', 'DealClass', 'DealSynopsis',
       'NativeCurrencyOfDeal', 'TotalInvestedEquity', 'AddOn', 'AddOnSponsors',
       'AddOnPlatform', 'TotalNewDebt', 'Debts', 'DebtRaisedInRound',
       'ContingentPayout', 'Employees_x', 'BusinessStatus_x',
       'FinancingStatus', 'SiteLocation', 'ExitScope', 'RowID_x',
       'LastUpdated_x', 'CompanyName_y', 'CompanyAlsoKnownAs',
       'CompanyFormerName', 'CompanyLegalName', 'Description', 'Keywords',
       'CompanyFinancingStatus', 'CompanyFinancingStatusDate', 'TotalRaised',
       'TotalRaisedNativeAmount', 'TotalRaisedNativeCurrency',
       'BusinessStatus_y', 'Busi

In [None]:

# Preview only the first rows; the full joined frame is too large to be a useful cell output.
deals_joined.head()

In [None]:
# Run minimal imputer from scripts/impute_postvaluation.py using only Employees as feature.
# The helper module is loaded straight from its file path.
from importlib import util
spec = util.spec_from_file_location('imputer_mod', 'scripts/impute_postvaluation.py')
imputer = util.module_from_spec(spec)
spec.loader.exec_module(imputer)

# Both input tables carry an Employees column, so after the merge it only exists
# with a suffix (Employees_x is the deals-side value). Accept the plain name too,
# in case deals_joined was produced without a column collision.
df_for_impute = deals_joined.copy()
employees_col = next(
    (name for name in ('Employees', 'Employees_x') if name in df_for_impute.columns),
    None,
)
if employees_col is None:
    raise KeyError('Employees column not found in deals_joined')

# prepare features: fill missing Employees with -1 (simple placeholder)
df_for_impute['Employees_filled'] = df_for_impute[employees_col].fillna(-1)
feature_cols = ['Employees_filled']

# Capture the missing mask BEFORE imputing, so the "imputed rows" selection below
# stays correct even if the helper modifies the frame it is given.
missing_before = df_for_impute['PostValuation'].isna()
print('Missing PostValuation before:', missing_before.sum())

# run imputer
df_imputed, model = imputer.impute_df_basic(df_for_impute, feature_cols, target_col='PostValuation')

print('Missing PostValuation after:', df_imputed['PostValuation'].isna().sum())

# show some imputed rows: missing before the run, filled after it
imputed_mask = missing_before & df_imputed['PostValuation'].notna()
display(df_imputed.loc[imputed_mask, ['CompanyID', 'CompanyName_x', employees_col, 'PostValuation']].head(10))

# overwrite deals_joined with imputed results for downstream cells
deals_joined = df_imputed