# Predição de Sucesso de Startups com Random Forest

Notebook inicial focado em gerar rapidamente um modelo baseline e variações de Random Forest para submissão.

Serão testadas 4 abordagens:
1. Baseline (RandomForest padrão)
2. Tuning manual simples (hiperparâmetros escolhidos por heurística)
3. RandomizedSearchCV (tuning sem grid exaustivo)
4. GridSearchCV (tuning com grid menor focado)

Critério de escolha: maior acurácia média em cross-validation (StratifiedKFold). Em caso de empate, escolhe-se o modelo com menor desvio padrão; se ainda empatar, o mais simples.

Tarefas mais extensas (EDA aprofundada, engenharia de features avançada, análise de hipóteses detalhada) serão adicionadas depois conforme as regras do README. Aqui priorizamos gerar uma primeira "submission" consistente.

In [1]:
# Imports principais (apenas libs permitidas: numpy, pandas, scikit-learn, matplotlib, seaborn)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

pd.set_option('display.max_columns', None)

In [2]:
# Carregamento dos dados
train_path = 'data/train.csv'
test_path = 'data/test.csv'
sample_sub_path = 'data/sample_submission.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
sample_sub = pd.read_csv(sample_sub_path)

print('Dimensões train:', df_train.shape)
print('Dimensões test :', df_test.shape)
df_train.head()

Dimensões train: (646, 33)
Dimensões test : (277, 32)


Unnamed: 0,id,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,is_NY,is_MA,is_TX,is_otherstate,category_code,is_software,is_web,is_mobile,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,labels
0,719,10.42,13.09,8.98,12.72,4,3,4087500,3,1,0,0,0,0,enterprise,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1.0,0
1,429,3.79,3.79,,,21,1,45000000,0,0,1,0,0,0,advertising,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1.0,1
2,178,0.71,2.28,1.95,2.28,5,2,5200000,2,1,0,0,0,0,photo_video,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1.0,0
3,197,3.0,5.0,9.62,10.39,16,2,14500000,2,0,0,1,0,0,advertising,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,2.0,1
4,444,0.66,5.88,6.21,8.61,29,5,70000000,4,1,0,0,0,0,web,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,2.8,1


In [3]:
# Checagem rápida de valores ausentes (resumido)
missing = df_train.isna().mean().sort_values(ascending=False)
missing[missing>0].head(15)

age_first_milestone_year    0.213622
age_last_milestone_year     0.171827
age_first_funding_year      0.054180
age_last_funding_year       0.013932
dtype: float64

## Pré-processamento
- 'category_code' será one-hot encoded (handle_unknown='ignore').
- Colunas numéricas: imputação por mediana.
- RandomForest não exige escala, então não aplicamos StandardScaler agora.
- Mantemos dummies já existentes como numéricas.
- 'id' será removido das features.

In [4]:
# Definir colunas
target_col = 'labels'
id_col = 'id'
categorical_cols = ['category_code']  # apenas esta categórica 'bruta'
# Numéricas: todas as numéricas exceto id e target
numeric_cols = [c for c in df_train.columns if c not in [target_col, id_col] and c not in categorical_cols]
# (Isto inclui as dummies e variáveis contínuas)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_cols),
        ('cat', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                           ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_cols)
    ]
)

def build_pipeline(rf_model: RandomForestClassifier) -> Pipeline:
    return Pipeline(steps=[('prep', preprocessor), ('clf', rf_model)])

X = df_train.drop(columns=[target_col])
y = df_train[target_col]
X_test_final = df_test.copy()  # para depois gerar submissão

In [5]:
# Split para avaliação adicional (além do cross-validation)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
print('Distribuição treino:', y_tr.value_counts(normalize=True).round(3).to_dict())
print('Distribuição validação:', y_val.value_counts(normalize=True).round(3).to_dict())

Distribuição treino: {1: 0.647, 0: 0.353}
Distribuição validação: {1: 0.646, 0: 0.354}


## Funções de Avaliação

In [6]:
def evaluate_pipeline(pipeline: Pipeline, X_train, y_train, X_valid, y_valid, cv_splits=5, name='model'):
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_valid)
    acc = accuracy_score(y_valid, preds)
    report = classification_report(y_valid, preds, output_dict=True)
    cm = confusion_matrix(y_valid, preds)
    return {
        'name': name,
        'cv_mean_acc': cv_scores.mean(),
        'cv_std_acc': cv_scores.std(),
        'val_acc': acc,
        'val_precision_1': report['1']['precision'],
        'val_recall_1': report['1']['recall'],
        'val_f1_1': report['1']['f1-score'],
        'confusion_matrix': cm,
        'fitted_pipeline': pipeline
    }

results = []

## 1. Baseline RandomForest (parâmetros padrão)

In [7]:
rf_baseline = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
pipe_baseline = build_pipeline(rf_baseline)
res_baseline = evaluate_pipeline(pipe_baseline, X_tr, y_tr, X_val, y_val, name='Baseline')
results.append(res_baseline)
res_baseline['cv_mean_acc'], res_baseline['val_acc']

(np.float64(0.800429424943988), 0.7846153846153846)

## 2. Tuning Manual (heurístico)
Escolhas: aumentar n_estimators, limitar profundidade moderada, ajustar min_samples_split/leaf e max_features.

In [8]:
rf_manual = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    class_weight=None,
    random_state=RANDOM_STATE,
    n_jobs=-1
)
pipe_manual = build_pipeline(rf_manual)
res_manual = evaluate_pipeline(pipe_manual, X_tr, y_tr, X_val, y_val, name='Manual Tuning')
results.append(res_manual)
res_manual['cv_mean_acc'], res_manual['val_acc']

(np.float64(0.7849141150112025), 0.7692307692307693)

## 3. RandomizedSearchCV (tuning sem grid exaustivo)
Exploramos um espaço mais amplo com amostragem aleatória.

In [9]:
param_dist = {
    'clf__n_estimators': [200, 300, 400, 500, 600, 800],
    'clf__max_depth': [None, 5, 8, 10, 12, 15, 20],
    'clf__min_samples_split': [2, 5, 10, 15],
    'clf__min_samples_leaf': [1, 2, 4, 6],
    'clf__max_features': ['sqrt', 'log2', 0.6, 0.8, None],
    'clf__bootstrap': [True, False]
}
base_rf = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
pipe_rand = build_pipeline(base_rf)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
rand_search = RandomizedSearchCV(
    estimator=pipe_rand,
    param_distributions=param_dist,
    n_iter=40,
    scoring='accuracy',
    cv=cv,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1
)
rand_search.fit(X_tr, y_tr)
best_rand = rand_search.best_estimator_
# Avaliar no holdout
preds_val = best_rand.predict(X_val)
acc_val = accuracy_score(y_val, preds_val)
report = classification_report(y_val, preds_val, output_dict=True)
cm = confusion_matrix(y_val, preds_val)
res_rand = {
    'name': 'RandomizedSearch',
    'cv_mean_acc': rand_search.best_score_,
    'cv_std_acc': np.nan,  # (sklearn não expõe std direto do melhor conjunto)
    'val_acc': acc_val,
    'val_precision_1': report['1']['precision'],
    'val_recall_1': report['1']['recall'],
    'val_f1_1': report['1']['f1-score'],
    'confusion_matrix': cm,
    'fitted_pipeline': best_rand
}
results.append(res_rand)
rand_search.best_params_, acc_val

Fitting 5 folds for each of 40 candidates, totalling 200 fits


({'clf__n_estimators': 400,
  'clf__min_samples_split': 10,
  'clf__min_samples_leaf': 1,
  'clf__max_features': 'log2',
  'clf__max_depth': 12,
  'clf__bootstrap': False},
 0.7692307692307693)

## 4. GridSearchCV (tuning com grid focado)

In [10]:
param_grid = {
    'clf__n_estimators': [300, 400, 500],
    'clf__max_depth': [None, 10, 15],
    'clf__min_samples_split': [2, 5],
    'clf__min_samples_leaf': [1, 2],
    'clf__max_features': ['sqrt', 'log2']
}
base_rf2 = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
pipe_grid = build_pipeline(base_rf2)
grid_search = GridSearchCV(
    estimator=pipe_grid,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_tr, y_tr)
best_grid = grid_search.best_estimator_
preds_val = best_grid.predict(X_val)
acc_val = accuracy_score(y_val, preds_val)
report = classification_report(y_val, preds_val, output_dict=True)
cm = confusion_matrix(y_val, preds_val)
res_grid = {
    'name': 'GridSearch',
    'cv_mean_acc': grid_search.best_score_,
    'cv_std_acc': np.nan,
    'val_acc': acc_val,
    'val_precision_1': report['1']['precision'],
    'val_recall_1': report['1']['recall'],
    'val_f1_1': report['1']['f1-score'],
    'confusion_matrix': cm,
    'fitted_pipeline': best_grid
}
results.append(res_grid)
grid_search.best_params_, acc_val

Fitting 5 folds for each of 72 candidates, totalling 360 fits


({'clf__max_depth': 10,
  'clf__max_features': 'log2',
  'clf__min_samples_leaf': 1,
  'clf__min_samples_split': 2,
  'clf__n_estimators': 500},
 0.7615384615384615)

## Comparação dos Resultados

In [11]:
results_df = pd.DataFrame([{k:v for k,v in r.items() if k not in ['fitted_pipeline','confusion_matrix']} for r in results])
results_df.sort_values(by='cv_mean_acc', ascending=False)

Unnamed: 0,name,cv_mean_acc,cv_std_acc,val_acc,val_precision_1,val_recall_1,val_f1_1
2,RandomizedSearch,0.80239,,0.769231,0.78125,0.892857,0.833333
0,Baseline,0.800429,0.039836,0.784615,0.804348,0.880952,0.840909
3,GridSearch,0.798525,,0.761538,0.773196,0.892857,0.828729
1,Manual Tuning,0.784914,0.042985,0.769231,0.787234,0.880952,0.831461


In [12]:
# Selecionar melhor modelo conforme regra (maior cv_mean_acc, depois menor std, depois preferência por simplicidade)
sorted_res = sorted(results, key=lambda r: (r['cv_mean_acc'], - (0 if np.isnan(r['cv_std_acc']) else r['cv_std_acc'])), reverse=True)
best = sorted_res[0]
print('Melhor modelo:', best['name'], ' - CV Acc:', best['cv_mean_acc'], ' - Val Acc:', best['val_acc'])
best_pipeline = best['fitted_pipeline']

Melhor modelo: RandomizedSearch  - CV Acc: 0.8023898431665423  - Val Acc: 0.7692307692307693


## Treino Final em TODO o Conjunto de Treino e Geração da Submission

In [13]:
# Refit no dataset completo de treino (X, y) para aproveitar todos os exemplos
# (Se o best já for resultado de search, re-fit para garantir uso de todos os dados)
best_params_model = best_pipeline
best_params_model.fit(X, y)
test_preds = best_params_model.predict(X_test_final)
submission = pd.DataFrame({
    'id': X_test_final[id_col],
    'labels': test_preds.astype(int)
})
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
submission.head()

Unnamed: 0,id,labels
0,70,1
1,23,0
2,389,1
3,872,1
4,920,1


### Observações Próximos Passos
- Adicionar análise exploratória (distribuições, correlações, hipóteses).
- Verificar importância de features (`feature_importances_`) e eventualmente reduzir dimensionalidade.
- Avaliar impacto de class_weight='balanced'.
- Testar threshold tuning e outras métricas (F1, ROC AUC).
- Documentar formalmente 3 hipóteses e validar.

Este notebook entrega rapidamente um pipeline funcional e uma submissão inicial.

## Melhoria: Engenharia de Features e Flags de Missing
Vamos criar novas features simples para tentar ultrapassar 0.80 de acurácia:

Novas Features:
- funding_per_round = funding_total_usd / (funding_rounds + 1)
- relationships_per_round = relationships / (funding_rounds + 1)
- age_funding_span = age_last_funding_year - age_first_funding_year
- milestone_per_round = milestones / (funding_rounds + 1)
- funding_rounds_squared (não-linearidade)
- log_funding_total_usd = log1p(funding_total_usd)
- avg_participants_squared (curvatura)

Flags de Missing (0/1): age_first_funding_year, age_last_funding_year, age_first_milestone_year, age_last_milestone_year, funding_total_usd.

Depois criamos novo pipeline e repetimos tuning.


In [None]:
# Criação de novas features (train e test devem receber transformações consistentes)

def add_engineered_features(df: pd.DataFrame, is_train=True):
    d = df.copy()
    # Avoid division by zero adding +1
    d['funding_per_round'] = d['funding_total_usd'] / (d['funding_rounds'] + 1)
    d['relationships_per_round'] = d['relationships'] / (d['funding_rounds'] + 1)
    d['age_funding_span'] = d['age_last_funding_year'] - d['age_first_funding_year']
    d['milestone_per_round'] = d['milestones'] / (d['funding_rounds'] + 1)
    d['funding_rounds_squared'] = d['funding_rounds'] ** 2
    d['avg_participants_squared'] = d['avg_participants'] ** 2
    d['log_funding_total_usd'] = np.log1p(d['funding_total_usd'])

    # Flags de missing
    miss_cols = ['age_first_funding_year','age_last_funding_year','age_first_milestone_year',
                 'age_last_milestone_year','funding_total_usd']
    for c in miss_cols:
        d[f'flag_missing_{c}'] = d[c].isna().astype(int)
    return d

X_eng = add_engineered_features(X)
X_tr_eng = add_engineered_features(X_tr)
X_val_eng = add_engineered_features(X_val)
X_test_eng = add_engineered_features(X_test_final)

# Atualizar listas de colunas
categorical_cols_eng = ['category_code']
# Numeric columns = everything except id, category_code, target (target já fora) e manter consistência
numeric_cols_eng = [c for c in X_eng.columns if c not in [id_col] + categorical_cols_eng]

preprocessor_eng = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_cols_eng),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols_eng)
    ]
)

def build_pipeline_eng(rf_model: RandomForestClassifier) -> Pipeline:
    return Pipeline(steps=[('prep', preprocessor_eng), ('clf', rf_model)])

print('Novas colunas criadas:', set(X_eng.columns) - set(X.columns))
print('Total de features antes:', len(X.columns)-1, ' | depois:', len(X_eng.columns)-1)  # -1 exclui id

## Tuning Avançado com Espaço Expandido (RandomizedSearch)
Incluímos agora hiperparâmetros adicionais:
- class_weight (para lidar com leve desbalanceamento)
- max_samples (subamostragem para aumentar diversidade de árvores)
E aumentamos n_iter.


In [None]:
from scipy.stats import randint, uniform

cv_eng = StratifiedKFold(n_splits=6, shuffle=True, random_state=RANDOM_STATE)

rf_base_eng = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
pipe_eng = build_pipeline_eng(rf_base_eng)

param_dist_eng = {
    'clf__n_estimators': randint(400, 1200),
    'clf__max_depth': [None] + list(range(8, 26, 2)),
    'clf__min_samples_split': randint(2, 15),
    'clf__min_samples_leaf': randint(1, 8),
    'clf__max_features': ['sqrt', 'log2', 0.5, 0.7, None],
    'clf__bootstrap': [True, False],
    'clf__class_weight': [None, 'balanced'],
    'clf__max_samples': [None, 0.7, 0.85, 0.9]
}

rand_search_eng = RandomizedSearchCV(
    estimator=pipe_eng,
    param_distributions=param_dist_eng,
    n_iter=80,
    scoring='accuracy',
    cv=cv_eng,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1
)

rand_search_eng.fit(X_tr_eng, y_tr)

best_eng = rand_search_eng.best_estimator_
val_preds_eng = best_eng.predict(X_val_eng)
acc_val_eng = accuracy_score(y_val, val_preds_eng)
print('Melhor params (eng):', rand_search_eng.best_params_)
print('CV best score:', rand_search_eng.best_score_)
print('Val acc:', acc_val_eng)

## Ensemble (Stacking Simples por Média de Probabilidades)
Combinamos o melhor modelo anterior e o novo modelo melhorado (se o novo superar 0.80) via média de probabilidades para potencialmente ganhar alguns décimos.


In [None]:
# Preparar ensemble: usar best_pipeline (antigo) e best_eng (novo com features)

# Garantir que ambos estejam treinados em seus respectivos espaços completos de treino antes de ensemble final
# Refit best antigo em X (sem engineered) + y
best_pipeline.fit(X, y)
# Refit best_eng em X_eng + y
best_eng.fit(X_eng, y)

# Avaliação ensemble no holdout de validação
proba_old = best_pipeline.predict_proba(X_val)[:,1]
proba_new = best_eng.predict_proba(X_val_eng)[:,1]
proba_avg = (proba_old + proba_new)/2
preds_ensemble = (proba_avg >= 0.5).astype(int)
acc_ensemble = accuracy_score(y_val, preds_ensemble)
print('Acurácia ensemble (val):', acc_ensemble)

# Escolha final
candidates = [
    ('old_best', best_pipeline, accuracy_score(y_val, best_pipeline.predict(X_val))),
    ('new_best', best_eng, accuracy_score(y_val, best_eng.predict(X_val_eng))),
    ('ensemble_avg', ('ensemble', best_pipeline, best_eng), acc_ensemble)
]

for name, model_obj, acc in candidates:
    print(f"{name}: val_acc={acc:.5f}")

selected_name, selected_model, selected_acc = max(candidates, key=lambda x: x[2])
print('\nSelecionado para submissão:', selected_name, ' - Val Acc:', selected_acc)

# Treino final no conjunto completo
if selected_name == 'old_best':
    final_model = best_pipeline.fit(X, y)
    test_proba = final_model.predict_proba(X_test_final)[:,1]
    test_preds_final = (test_proba >= 0.5).astype(int)
elif selected_name == 'new_best':
    final_model = best_eng.fit(X_eng, y)
    test_proba = final_model.predict_proba(X_test_eng)[:,1]
    test_preds_final = (test_proba >= 0.5).astype(int)
else:
    # ensemble
    best_pipeline.fit(X, y)
    best_eng.fit(X_eng, y)
    proba_old_test = best_pipeline.predict_proba(X_test_final)[:,1]
    proba_new_test = best_eng.predict_proba(X_test_eng)[:,1]
    test_proba = (proba_old_test + proba_new_test)/2
    test_preds_final = (test_proba >= 0.5).astype(int)

submission2 = pd.DataFrame({'id': X_test_final[id_col], 'labels': test_preds_final})
submission2.to_csv('submission_v2.csv', index=False)
print('Gerado submission_v2.csv')
submission2.head()

## Ajuste de Threshold (Opcional)
Podemos verificar se mover o limiar de 0.5 melhora a acurácia (dado leve desbalanceamento). Avaliamos thresholds de 0.4 a 0.6.


In [None]:
# Threshold tuning baseado no modelo selecionado (se ensemble, usa média)
import numpy as np

# Garantir que test_proba está disponível (gerada anteriormente). Se não, recalculamos para selected_name.
if 'test_proba' not in globals():
    # Recalcular proba de validação para threshold search
    if selected_name == 'old_best':
        proba_val_ref = best_pipeline.predict_proba(X_val)[:,1]
    elif selected_name == 'new_best':
        proba_val_ref = best_eng.predict_proba(X_val_eng)[:,1]
    else:
        proba_old_tmp = best_pipeline.predict_proba(X_val)[:,1]
        proba_new_tmp = best_eng.predict_proba(X_val_eng)[:,1]
        proba_val_ref = (proba_old_tmp + proba_new_tmp)/2
else:
    # Precisamos da proba de validação
    if selected_name == 'old_best':
        proba_val_ref = best_pipeline.predict_proba(X_val)[:,1]
    elif selected_name == 'new_best':
        proba_val_ref = best_eng.predict_proba(X_val_eng)[:,1]
    else:
        proba_old_tmp = best_pipeline.predict_proba(X_val)[:,1]
        proba_new_tmp = best_eng.predict_proba(X_val_eng)[:,1]
        proba_val_ref = (proba_old_tmp + proba_new_tmp)/2

thresholds = np.linspace(0.4, 0.6, 21)
accs = []
for t in thresholds:
    preds_t = (proba_val_ref >= t).astype(int)
    accs.append(accuracy_score(y_val, preds_t))

best_t_idx = int(np.argmax(accs))
best_threshold = thresholds[best_t_idx]
print('Melhor threshold em validação:', best_threshold, ' - acc:', accs[best_t_idx])

# Se o threshold melhor for diferente de 0.5, refazer submissão final com esse threshold
if abs(best_threshold - 0.5) > 1e-6:
    if selected_name == 'old_best':
        final_model = best_pipeline.fit(X, y)
        proba_test_final = final_model.predict_proba(X_test_final)[:,1]
    elif selected_name == 'new_best':
        final_model = best_eng.fit(X_eng, y)
        proba_test_final = final_model.predict_proba(X_test_eng)[:,1]
    else:
        best_pipeline.fit(X, y)
        best_eng.fit(X_eng, y)
        po = best_pipeline.predict_proba(X_test_final)[:,1]
        pn = best_eng.predict_proba(X_test_eng)[:,1]
        proba_test_final = (po + pn)/2
    tuned_preds = (proba_test_final >= best_threshold).astype(int)
    submission_tuned = pd.DataFrame({'id': X_test_final[id_col], 'labels': tuned_preds})
    submission_tuned.to_csv('submission_tuned.csv', index=False)
    print('Gerado submission_tuned.csv com threshold ajustado.')
else:
    print('Threshold 0.5 já é o melhor. Mantida submission anterior.')