used newly processed feng data 

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# data
import pandas as pd
import numpy as np
import ast
from numpy import mean

# visualization
import matplotlib.pyplot as plt

# chosen models
from statsmodels.regression.linear_model import OLS

# data preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# model training selection
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import RepeatedStratifiedKFold

## model evaluation metrics
from collections import Counter
# from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse

# important cols
import shap

# print the JS visualization code to the notebook
shap.initjs()

In [2]:
df = pd.read_csv('../data/feature_engineering/combined_feng_v8.csv', index_col=0)
df.shape

(3000, 2145)

### get data

In [3]:
# get data
X = df.drop(columns='total_funding_amount_usd')
y = df['total_funding_amount_usd']

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### scale numerical features 
*also scale output feature but note data leakage, split data first then standard scale it on train data, then also use same scaler to transform test data*

In [28]:
# x scaler
# scaler = StandardScaler().fit(X)
# X_scaled = scaler.transform(X)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# y scaler
# scale2 = MinMaxScaler().fit(np.array(y).reshape(-1, 1))
# y_scaled = scaler2.transform(np.array(y).reshape(-1, 1))
scaler2 = MinMaxScaler().fit(np.array(y_train).reshape(-1, 1))
y_train_scaled = scaler2.transform(np.array(y_train).reshape(-1, 1))

## machine learning

In [29]:
model = OLS(y_train_scaled, X_train_scaled)
# model = OLS(y_scaled, X_scaled)

In [30]:
results = model.fit()

In [7]:
# results.params

In [31]:
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.771
Model:,OLS,Adj. R-squared (uncentered):,-0.28
Method:,Least Squares,F-statistic:,0.7337
Date:,"Wed, 27 Apr 2022",Prob (F-statistic):,1.0
Time:,13:06:01,Log-Likelihood:,4909.2
No. Observations:,2010,AIC:,-6518.0
Df Residuals:,360,BIC:,2731.0
Df Model:,1650,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0039,0.004,-0.956,0.340,-0.012,0.004
x2,-0.0069,0.004,-1.556,0.121,-0.016,0.002
x3,-0.0048,0.003,-1.709,0.088,-0.010,0.001
x4,0.0059,0.004,1.624,0.105,-0.001,0.013
x5,-0.0031,0.003,-0.901,0.368,-0.010,0.004
x6,-0.0044,0.003,-1.323,0.187,-0.011,0.002
x7,0.0076,0.003,2.378,0.018,0.001,0.014
x8,-0.0010,0.002,-0.571,0.569,-0.004,0.002
x9,-0.0011,0.002,-0.698,0.486,-0.004,0.002

0,1,2,3
Omnibus:,1195.637,Durbin-Watson:,1.307
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1383405.33
Skew:,1.281,Prob(JB):,0.0
Kurtosis:,131.498,Cond. No.,4.13e+16


In [32]:
# r squared value has to be over 60% (it is currently negative???)
results.rsquared #0.6436

0.7707861957697217

In [33]:
# more than half of the variables are significant?
Counter(results.pvalues<0.05)

Counter({False: 2118, True: 26})

In [34]:
sig_cols = []
for i in range(len(results.pvalues)):
    if results.pvalues[i]<0.05:
        sig_cols.append(X.columns[i])

In [35]:
sig_cols

['asia-pacific (apac)',
 'china',
 'moscow city',
 'moscow',
 'saratoga',
 'suzhou shi',
 'estimated_revenue_range',
 'video',
 'autonomous vehicles',
 'cosmetic surgery',
 'insurance',
 'insurtech',
 'mobile devices',
 'pc games',
 'ticketing',
 'video advertising',
 'video games',
 'web browsers',
 'alibaba group',
 'idg capital',
 'sbi group',
 'lenovo',
 'warburg pincus',
 'morningside group',
 'seed capital',
 'bdmi']

In [26]:
pred = model.predict(X_scaled)

ValueError: shapes (3000,2144) and (3000,2144) not aligned: 2144 (dim 1) != 3000 (dim 0)

In [None]:
rmse = np.sqrt(mse(y_test_scaled, pred))
rmse

<span style="color:red">

- data_v6: 
    - R-squared = -0.23687
    - p-values: Counter({True: 281, False: 141})
    
- data_v7:
    - R-squared = -0.02924 (worse???)
    - p-values: Counter({True: 751, False: 421})
    
- data_v8:
    - R-squared = 0.6436 (finally !!!)
    - p-values: Counter({False: 2097, True: 47})

[x] go through xgboost classification, PCA, and get what seems to be important

[x] go back to data and see what i intuitively think would affect, come up with a list from a concept standpoint of information / columns that probably matters (e.g. which funding round, gender, where, invetsors etc.), and think about how I can encode them in the data

[ ] try to fit OLS and check which covariants matter; eventually, 
    
    [?] in the OLS, all coeff should have p values less than .05 
    [?] R squared value has to be over 60%

[ ] next, start seeing if we can add more nuanced columns, e.g. from US to China, Europe etc., and see if coeff are still significant

    [?] already added nuanced cols but not enough explainability?

[ ] do the same for Lasso and Ridge 

OLS
1. start with as few columns as possible, see if all coeff are significant, R-squared is probably low
2. need to add more cols to increase R-square, check significant coeffs, and rerun the model with only the significant coeffs and see what the R-squared is
3. ultimate goal is to get a model with all coefffs significant and high R-squared (because you can get a higher R-squared just by adding more coeffs but they are not actually significant)

need to argue what the "base" is, and "leave one out"
- for equal_cats need to take one cat out --> what is the constant as the base level
- industries: multi-label --> we would never include all to the degree that if you are not in all the rest you would be in this

add columns
- include "top 5 industry groups" (bool) --> separating each of the top 5 to binary --> sexy hot industries
- include "out of top 50 industries how many each company belongs to" --> diversified across industries

the priority is to get all coeffs to be significant, and when they are, try to boost up r-squared

so start with a broader question, e.g. "top 20 industry groups" (bool), to breaking down to all the industries, so come up with the highest level representation of "location", "industry", "investors"

specificty, e.g. interaction variables, not really important for us, because we already have too many coeffs to get specific on

train_test_split, make sure the number of cols dont exceed 3000*0.33 so columns is still less than number of rows

what if even very low number of coeffs are still not significant? this is when breaking down would help