In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample
from scipy.stats import randint, uniform
import numpy as np

In [2]:
# Load the training set, the test set and the Kaggle submission template,
# in that order, from CSV files in the current working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# One-hot encode 'category_code' independently for each split.
category_dummies_train = pd.get_dummies(train['category_code'], prefix='category')

# Force the test dummies onto the training column layout: categories that
# appear only in test are dropped, and categories that appear only in train
# become columns filled with 0.
category_dummies_test = pd.get_dummies(
    test['category_code'], prefix='category'
).reindex(columns=category_dummies_train.columns, fill_value=0)


In [4]:
# Append the one-hot columns to each split (column-wise concat, aligned on the
# row index) and drop the raw 'category_code' column they were derived from.
train = pd.concat([train, category_dummies_train], axis=1).drop(columns=['category_code'])
test = pd.concat([test, category_dummies_test], axis=1).drop(columns=['category_code'])

In [5]:
# Columns that receive missing-value imputation.
cols_with_nan = [
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'funding_total_usd',
]

# Median imputation: the medians are learned from the training split only and
# then reused unchanged for the test split.
imputer = SimpleImputer(strategy='median').fit(train[cols_with_nan])
train[cols_with_nan] = imputer.transform(train[cols_with_nan])
test[cols_with_nan] = imputer.transform(test[cols_with_nan])

In [6]:
# Numeric columns to standardize.
num_cols = [
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'relationships',
    'funding_rounds',
    'funding_total_usd',
    'milestones',
    'avg_participants',
]

# Standardization: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

In [7]:
# Feature columns fed to the models. According to the original author's note
# they were chosen by feature importance; that ranking is not computed in the
# visible cells. The order is kept as-is because it defines the column order
# of the model inputs.
selected_features = [
    'relationships',
    'funding_total_usd',
    'age_last_milestone_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_first_funding_year',
    'milestones',
    'avg_participants',
    'funding_rounds',
    'is_otherstate',
]

In [8]:
# Balance the two classes by randomly oversampling (with replacement) the
# smaller one up to the size of the larger one.
classe_0 = train[train['labels'] == 0]
classe_1 = train[train['labels'] == 1]

if classe_0.empty or classe_1.empty:
    # There is nothing to sample from in an empty class; fail early with a
    # message that names the actual problem.
    raise ValueError(
        "Cannot balance classes: 'labels' must contain both 0 and 1 "
        f"(found {len(classe_0)} rows of class 0 and {len(classe_1)} rows of class 1)."
    )

if len(classe_0) > len(classe_1):
    # Class 1 is the minority: draw it up to the size of class 0.
    classe_1_over = resample(classe_1, replace=True, n_samples=len(classe_0), random_state=42)
    train_balanced = pd.concat([classe_0, classe_1_over])
elif len(classe_1) > len(classe_0):
    # Class 0 is the minority: draw it up to the size of class 1.
    classe_0_over = resample(classe_0, replace=True, n_samples=len(classe_1), random_state=42)
    train_balanced = pd.concat([classe_0_over, classe_1])
else:
    # Already balanced: keep every original row. Resampling here would swap
    # class 0 for a bootstrap sample of itself, dropping some real rows and
    # duplicating others for no benefit.
    train_balanced = pd.concat([classe_0, classe_1])

# Model inputs: class-0 rows always come first in train_balanced.
X_train = train_balanced[selected_features]
y_train = train_balanced['labels']
X_test = test[selected_features]

In [9]:
# Base learners, both seeded for reproducibility.
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Soft-voting ensemble over the two base learners (combines their predicted
# class probabilities rather than their hard votes).
ensemble = VotingClassifier(
    voting='soft',
    estimators=[('rf', rf_model), ('gb', gb_model)],
)

In [10]:
# Fit the ensemble on the balanced training data, then predict the labels of
# the test set with the fitted ensemble.
predictions = ensemble.fit(X_train, y_train).predict(X_test)

In [11]:
# Build the Kaggle submission file from the submission template.
# NOTE(review): predictions are assigned positionally, so this assumes the
# rows of sample_submission are in the same order as the rows of test.csv —
# confirm.
submission_path = 'submission_v5.csv'
submission = sample_submission.copy()
submission['labels'] = predictions
submission.to_csv(submission_path, index=False)

# Report the path that was actually written (the message previously named
# 'submission.csv', which is not the file this cell creates).
print(f'Arquivo de submissão criado com sucesso: {submission_path}')

Arquivo de submissão criado com sucesso: submission.csv
