In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [11]:
# Load the training set, the test set and the submission template
# (paths are relative to the notebook's working directory).
train, test, sample_submission = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'sample_submission.csv'),
)

In [12]:
# One-hot encode 'category_code' separately for the training and test frames.
category_dummies_train, category_dummies_test = (
    pd.get_dummies(frame['category_code'], prefix='category')
    for frame in (train, test)
)

# Give the test dummies exactly the training columns, in the same order:
# categories absent from the test set become all-zero columns, and categories
# that only occur in the test set are dropped.
category_dummies_test = category_dummies_test.reindex(
    columns=category_dummies_train.columns,
    fill_value=0,
)


In [13]:
# Append the dummy columns to each dataset, then discard the original
# 'category_code' column they were derived from.
# NOTE: this rebinds `train`/`test`, so the cell is meant to run once per
# kernel session (a second run would no longer find 'category_code').
train, test = (
    pd.concat([frame, dummies], axis=1).drop(columns=['category_code'])
    for frame, dummies in (
        (train, category_dummies_train),
        (test, category_dummies_test),
    )
)

In [14]:
# Columns whose missing values are filled in by imputation.
cols_with_nan = [
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'funding_total_usd',
]

# Median imputation: the medians are learned from the training set only and
# then applied to both the training and the test set.
imputer = SimpleImputer(strategy='median').fit(train[cols_with_nan])
train[cols_with_nan] = imputer.transform(train[cols_with_nan])
test[cols_with_nan] = imputer.transform(test[cols_with_nan])

In [15]:
# Numeric columns to standardize.
num_cols = [
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'relationships',
    'funding_rounds',
    'funding_total_usd',
    'milestones',
    'avg_participants',
]

# Standardization: the scaling statistics come from the training set only and
# are reused unchanged on the test set.
scaler = StandardScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

In [16]:
# Split into target and features: 'labels' is the target, and 'id' is
# excluded from the feature matrices of both datasets.
y = train['labels']
X = train.drop(['id', 'labels'], axis='columns')
X_test = test.drop(['id'], axis='columns')

# Helper that cross-validates a model and then fits it on the full data.
def train_and_evaluate(model, X, y, cv=5):
    """Report cross-validated accuracy for ``model`` and fit it on all of ``X``/``y``.

    Parameters
    ----------
    model : estimator
        Object accepted by ``cross_val_score`` that also exposes ``fit(X, y)``.
    X, y :
        Feature matrix and target passed unchanged to ``cross_val_score`` and
        to ``model.fit``.
    cv : optional
        Cross-validation setting forwarded to ``cross_val_score``. Defaults to
        5, which reproduces the previous hard-coded behaviour.

    Returns
    -------
    The same ``model`` object, after ``model.fit(X, y)`` has been called.
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    # One score is returned per fold, so the fold count in the message is taken
    # from the result instead of being hard-coded.
    print(f'Acurácia média ({len(scores)}-fold CV): {np.mean(scores):.4f}')
    model.fit(X, y)
    return model

In [17]:
# Cross-validate a random forest (fixed random_state) and fit it on the
# full training data.
rf = train_and_evaluate(RandomForestClassifier(random_state=42), X, y)

# Predict labels for the test set.
predictions = rf.predict(X_test)

Acurácia média (5-fold CV): 0.7987


In [18]:
# Build the submission from a copy of the template with its 'labels' column
# set to the predictions, then write it to disk without the index.
# NOTE(review): this relies on the template rows being in the same order as
# the rows of the test set — confirm against the data.
submission = sample_submission.assign(labels=predictions)
submission.to_csv('submission.csv', index=False)
print('Arquivo submission.csv criado com sucesso!')

Arquivo submission.csv criado com sucesso!
